kvazaar
Note: the diffs of some files below are truncated because they were too big.
Changes of Revision 15
kvazaar.changes
Changed
@@ -1,4 +1,34 @@
 -------------------------------------------------------------------
+Wed Apr 22 16:16:28 UTC 2020 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 2.0.0
+  * Several unmentioned improvements and fixes
+  Highlights:
+  * Updated presets
+  * Updated GOP definitions using QP offset model.
+    There is now even longer hierarchical GOP --gop=16
+  * Much faster and improved bipred
+  * Alternative and better rate control algorithm, optimal bit
+    allocation (--rc-algorithm oba)
+  * Variance adaptive quantization (--vaq)
+  Features:
+  * Option to set QP offset for intra frames (--intra-qp-offset,
+    automatical by default)
+  * Zero-coeff-rdo is now configurable (--zero-coeff-rdo)
+  * Optional intra frame analysis for rate control (--intra-bits)
+  * Optional machine learning based depth constraints for intra
+    search (--ml-pu-depth-intra)
+  * PU depths are now separately configurable for each GOP layer
+  User Interface:
+  * Report bitrate and some kind of (cumulative) average QP
+  Optimizations:
+  * More AVX2 opimizations for SAO
+  * More AVX2 opimizations for transforms
+  * More AVX2 opimizations for intra prediction
+  * AVX2 strategy for variance calculation
+- Bump sover to 6
+
+-------------------------------------------------------------------
 Tue Jul  9 20:15:25 UTC 2019 - Luigi Baldoni <aloisio@gmx.com>

 - Update to version 1.3.0
kvazaar.spec
Changed
@@ -1,7 +1,7 @@
 #
 # spec file for package kvazaar
 #
-# Copyright (c) 2019 Packman Team <packman@links2linux.de>
+# Copyright (c) 2020 Packman Team <packman@links2linux.de>
 # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
@@ -18,9 +18,9 @@
 %define libname libkvazaar
-%define libmver 4
+%define libmver 6
 Name:           kvazaar
-Version:        1.3.0
+Version:        2.0.0
 Release:        0
 Summary:        HEVC encoder
 License:        LGPL-2.1-or-later
kvazaar-1.3.0.tar.gz/.gitignore -> kvazaar-2.0.0.tar.gz/.gitignore
Changed
@@ -43,6 +43,9 @@
 *.o
 *.trs
 .*.swp
+.*.swo
+.swp
+.swo
 *.log
 .kdev4
kvazaar-1.3.0.tar.gz/.gitlab-ci.yml -> kvazaar-2.0.0.tar.gz/.gitlab-ci.yml
Changed
@@ -31,6 +31,8 @@
   <<: *test-template
   variables:
     CFLAGS: '-fsanitize=thread'
+    # Temporarily suppress known errors or false positives.
+    TSAN_OPTIONS: 'suppressions=/builds/TIE/ultravideo/kvazaar/tests/tsan_suppressions.txt'

 test-ubsan:
   <<: *test-template
@@ -45,3 +47,14 @@
     KVAZAAR_OVERRIDE_sao_edge_ddistortion: generic
     KVAZAAR_OVERRIDE_calc_sao_edge_dir: generic
     KVZ_TEST_VALGRIND: 1
+
+# TODO: If someone knows YAML better, there has to be some better way to
+# concatenate stuff into the test-template script than just manually copy
+# pasting
+test-distcheck:
+  <<: *test-template
+  script:
+    - export PATH="${HOME}/bin:${PATH}"
+    - ./autogen.sh
+    - ./configure --enable-werror || (cat config.log && false)
+    - make --jobs=8 VERBOSE=1 distcheck
kvazaar-1.3.0.tar.gz/.travis-install.bash -> kvazaar-2.0.0.tar.gz/.travis-install.bash
Changed
@@ -6,12 +6,12 @@
 mkdir -p "${HOME}/bin"

-wget http://ultravideo.cs.tut.fi/ffmpeg-release-32bit-static.tar.xz
+wget http://ultravideo.cs.tut.fi/ffmpeg-release-4.2.1-32bit-static.tar.xz
 sha256sum -c - << EOF
-4d3302ba0415e08ca10ca578dcd1f0acc48fadc9b803718283c8c670350c903e  ffmpeg-release-32bit-static.tar.xz
+226f55f8a94d71f3d231a20fe59fcbb7f6100cabf663f9bcb887d17b332a91c5  ffmpeg-release-4.2.1-32bit-static.tar.xz
 EOF
-tar xf ffmpeg-release-32bit-static.tar.xz
-cp ffmpeg-2.6.3-32bit-static/ffmpeg "${HOME}/bin/ffmpeg"
+tar xf ffmpeg-release-4.2.1-32bit-static.tar.xz
+cp ffmpeg-4.2.1-i686-static/ffmpeg "${HOME}/bin/ffmpeg"
 chmod +x "${HOME}/bin/ffmpeg"

 wget http://ultravideo.cs.tut.fi/ubuntu-12.04-hmdec-16.10.tgz
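The install script pins the downloaded ffmpeg tarball to a SHA-256 digest by feeding `sha256sum -c` from a here-doc. For illustration, the same verification step can be sketched in Python; the `demo.bin` file below is a hypothetical stand-in for the real tarball, and only the standard `hashlib` module is used:

```python
import hashlib

def sha256_matches(path, expected_hex):
    """Return True if the file's SHA-256 digest equals expected_hex,
    mirroring what `sha256sum -c` checks for one file."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in chunks so large tarballs do not need to fit in memory.
        for chunk in iter(lambda: f.read(1 << 16), b""):
            h.update(chunk)
    return h.hexdigest() == expected_hex

# Demo with a throwaway file instead of the real ffmpeg tarball.
with open("demo.bin", "wb") as f:
    f.write(b"kvazaar")
print(sha256_matches("demo.bin", hashlib.sha256(b"kvazaar").hexdigest()))
```

Aborting the install when the digest differs (as the `<< EOF` check does) is what protects the CI run from a tampered or truncated download.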
kvazaar-1.3.0.tar.gz/.travis.yml -> kvazaar-2.0.0.tar.gz/.travis.yml
Changed
@@ -19,7 +19,7 @@
   include:
     - compiler: clang
-      env: KVZ_TEST_VALGRIND=1
+      env: KVZ_TEST_VALGRIND=1 KVAZAAR_OVERRIDE_angular_pred=generic

     - compiler: clang
       env: CFLAGS='-fsanitize=thread'
kvazaar-1.3.0.tar.gz/README.md -> kvazaar-2.0.0.tar.gz/README.md
Changed
@@ -22,6 +22,7 @@
   - [Compiling Kvazaar](#compiling-kvazaar)
     - [Required libraries](#required-libraries)
     - [Autotools](#autotools)
+    - [Autotools on MinGW](#autotools-on-mingw)
     - [OS X](#os-x)
    - [Visual Studio](#visual-studio)
    - [Docker](#docker)
@@ -113,11 +114,16 @@
                                    - 0: Only send VPS with the first frame.
                                    - N: Send VPS with every Nth intra frame.
   -r, --ref <integer>          : Number of reference frames, in range 1..15 [4]
-      --gop <string>           : GOP structure [8]
-                                   - 0: Disabled
-                                   - 8: B-frame pyramid of length 8
-                                   - lp-<string>: Low-delay P-frame GOP
+      --gop <string>           : GOP structure [lp-g4d3t1]
+                                   - 0: Disabled
+                                   - 8: B-frame pyramid of length 8
+                                   - 16: B-frame pyramid of length 16
+                                   - lp-<string>: Low-delay P/B-frame GOP
+                                     (e.g. lp-g8d4t2, see README)
+      --intra-qp-offset <int>  : QP offset for intra frames [-51..51] [auto]
+                                   - N: Set QP offset to N.
+                                   - auto: Select offset automatically based
+                                     on GOP length.
       --(no-)open-gop          : Use open GOP configuration. [enabled]
       --cqmfile <filename>     : Read custom quantization matrices from a file.
       --scaling-list <string>  : Set scaling list mode. [off]
@@ -127,6 +133,15 @@
       --bitrate <integer>      : Target bitrate [0]
                                    - 0: Disable rate control.
                                    - N: Target N bits per second.
+      --rc-algorithm <string>  : Select used rc-algorithm. [lambda]
+                                   - lambda: rate control from:
+                                     DOI: 10.1109/TIP.2014.2336550
+                                   - oba: DOI: 10.1109/TCSVT.2016.2589878
+      --(no-)intra-bits        : Use Hadamard cost based allocation for intra
+                                 frames. Default on for gop 8 and off for lp-gop
+      --(no-)clip-neighbour    : On oba based rate control whether to clip
+                                 lambda values to same frame's ctus or previous'.
+                                 Default on for RA GOPS and disabled for LP.
       --(no-)lossless          : Use lossless coding. [disabled]
       --mv-constraint <string> : Constrain movement vectors. [none]
                                    - none: No constraint
@@ -150,6 +165,9 @@
       --high-tier              : Used with --level. Use high tier bitrate limits
                                  instead of the main tier limits during encoding.
                                  High tier requires level 4 or higher.
+      --(no-)vaq <integer>     : Enable variance adaptive quantization with given
+                                 strength, in range 1..20. Recommended: 5.
+                                 [disabled]

 Compression tools:
       --(no-)deblock <beta:tc> : Deblocking filter. [0:0]
@@ -173,6 +191,8 @@
                                  chroma mode search.
       --(no-)mv-rdo            : Rate-distortion optimized motion vector costs
                                  [disabled]
+      --(no-)zero-coeff-rdo    : If a CU is set inter, check if forcing zero
+                                 residual improves the RD cost. [enabled]
       --(no-)full-intra-search : Try all intra modes during rough search.
                                  [disabled]
       --(no-)transform-skip    : Try transform skip [disabled]
@@ -192,8 +212,19 @@
                                    - 4: + 1/4-pixel diagonal
       --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]
                                    - 0, 1, 2, 3: from 64x64 to 8x8
+                                   - Accepts a list of values separated by ','
+                                     for setting separate depths per GOP layer
+                                     (values can be omitted to use the first
+                                     value for the respective layer).
       --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]
                                    - 0, 1, 2, 3, 4: from 64x64 to 4x4
+                                   - Accepts a list of values separated by ','
+                                     for setting separate depths per GOP layer
+                                     (values can be omitted to use the first
+                                     value for the respective layer).
+      --ml-pu-depth-intra      : Predict the pu-depth-intra using machine
+                                 learning trees, overrides the
+                                 --pu-depth-intra parameter. [disabled]
       --tr-depth-intra <int>   : Transform split depth for intra blocks [0]
       --(no-)bipred            : Bi-prediction [disabled]
       --cu-split-termination <string> : CU split search termination [zero]
@@ -246,6 +277,13 @@
                                    - tiles: Put tiles in independent slices.
                                    - wpp: Put rows in dependent slices.
                                    - tiles+wpp: Do both.
+      --partial-coding <x-offset>!<y-offset>!<slice-width>!<slice-height>
+                               : Encode partial frame.
+                                 Parts must be merged to form a valid bitstream.
+                                 X and Y are CTU offsets.
+                                 Slice width and height must be divisible by CTU
+                                 in pixels unless it is the last CTU row/column.
+                                 This parameter is used by kvaShare.

 Video Usability Information:
       --sar <width:height>     : Specify sample aspect ratio
@@ -299,20 +337,20 @@
 |                      | 0-uf  | 1-sf  | 2-vf  | 3-fr  | 4-f   | 5-m   | 6-s   | 7-sr  | 8-vs  | 9-p   |
 | -------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- |
-| rd                   | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 2     | 2     | 2     |
+| rd                   | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 2     | 2     | 2     |
 | pu-depth-intra       | 2-3   | 2-3   | 2-3   | 2-3   | 1-3   | 1-4   | 1-4   | 1-4   | 1-4   | 1-4   |
-| pu-depth-inter       | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 0-3   | 0-3   | 0-3   | 0-3   | 0-3   |
-| me                   | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz    |
-| gop                  | g4d4t1| g4d4t1| g4d4t1| g4d4t1| g4d4t1| 8     | 8     | 8     | 8     | 8     |
+| pu-depth-inter       | 1-2   | 1-2   | 1-3   | 1-3   | 1-3   | 0-3   | 0-3   | 0-3   | 0-3   | 0-3   |
+| me                   | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz    | tz    |
+| gop                  | 8     | 8     | 8     | 8     | 8     | 16    | 16    | 16    | 16    | 16    |
 | ref                  | 1     | 1     | 1     | 1     | 2     | 4     | 4     | 4     | 4     | 4     |
-| bipred               | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     |
+| bipred               | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     |
 | deblock              | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     |
 | signhide             | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     |
-| subme                | 2     | 2     | 2     | 4     | 4     | 4     | 4     | 4     | 4     | 4     |
+| subme                | 0     | 2     | 2     | 4     | 4     | 4     | 4     | 4     | 4     | 4     |
 | sao                  | off   | full  | full  | full  | full  | full  | full  | full  | full  | full  |
 | rdoq                 | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     |
 | rdoq-skip            | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     |
-| transform-skip       | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     |
+| transform-skip       | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     |
 | mv-rdo               | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     |
 | full-intra-search    | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     |
 | smp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     |
@@ -320,7 +358,7 @@
 | cu-split-termination | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | off   |
 | me-early-termination | sens. | sens. | sens. | sens. | sens. | on    | on    | off   | off   | off   |
 | intra-rdo-et         | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     |
-| early-skip           | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     |
+| early-skip           | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 0     |
 | fast-residual-cost   | 28    | 28    | 28    | 0     | 0     | 0     | 0     | 0     | 0     | 0     |
 | max-merge            | 5     | 5     | 5     | 5     | 5     | 5     | 5     | 5     | 5     | 5     |
@@ -357,6 +395,12 @@
 See `./configure --help` for more options.

+### Autotools on MinGW
+It is recommended to use Clang instead of GCC in MinGW environments. GCC also works, but AVX2 optimizations will be disabled because of a known GCC issue from 2012, so performance will suffer badly. Instead of `./configure`, run
+
+    CC=clang ./configure
+
+to build Kvazaar using Clang.

 ### OS X
 - Install Homebrew
@@ -365,7 +409,7 @@

 ### Visual Studio
-- At least VisualStudio 2015 is required.
+- At least VisualStudio 2015.2 is required.
 - Project files can be found under build/.
 - Requires external [vsyasm.exe](http://yasm.tortall.net/Download.html) in %PATH%
kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-2.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -138,6 +138,7 @@
     </ClCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\src\constraint.c" />
     <ClCompile Include="..\..\src\extras\crypto.cpp" />
     <ClCompile Include="..\..\src\extras\libmd5.c" />
     <ClCompile Include="..\..\src\input_frame_buffer.c" />
@@ -159,6 +160,7 @@
     <ClCompile Include="..\..\src\imagelist.c" />
     <ClCompile Include="..\..\src\inter.c" />
     <ClCompile Include="..\..\src\intra.c" />
+    <ClCompile Include="..\..\src\ml_intra_cu_depth_pred.c" />
     <ClCompile Include="..\..\src\nal.c" />
     <ClCompile Include="..\..\src\rate_control.c" />
     <ClCompile Include="..\..\src\rdo.c" />
@@ -199,9 +201,11 @@
     <ClCompile Include="..\..\src\strategies\strategies-intra.c" />
     <ClCompile Include="..\..\src\strategies\strategies-quant.c" />
     <ClInclude Include="..\..\src\checkpoint.h" />
+    <ClInclude Include="..\..\src\constraint.h" />
     <ClInclude Include="..\..\src\cu.h" />
     <ClInclude Include="..\..\src\extras\crypto.h" />
     <ClInclude Include="..\..\src\extras\libmd5.h" />
+    <ClInclude Include="..\..\src\gop.h" />
     <ClInclude Include="..\..\src\image.h" />
     <ClInclude Include="..\..\src\imagelist.h" />
     <ClCompile Include="..\..\src\strategies\altivec\picture-altivec.c" />
@@ -259,6 +263,7 @@
     <ClInclude Include="..\..\src\input_frame_buffer.h" />
     <ClInclude Include="..\..\src\kvazaar_internal.h" />
     <ClInclude Include="..\..\src\kvz_math.h" />
+    <ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h" />
     <ClInclude Include="..\..\src\search_inter.h" />
     <ClInclude Include="..\..\src\search_intra.h" />
     <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h" />
kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-2.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -52,6 +52,9 @@
     <Filter Include="Threadwrapper">
       <UniqueIdentifier>{f4abece9-e209-4817-a57e-c64ca7c5e05c}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Constraint">
+      <UniqueIdentifier>{895fc8cc-6f08-49a7-b377-b5c38a44d1b1}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\src\strategies\strategies-nal.c">
@@ -239,6 +242,12 @@
     <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp">
       <Filter>Threadwrapper</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\constraint.c">
+      <Filter>Constraint</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\ml_intra_cu_depth_pred.c">
+      <Filter>Constraint</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\bitstream.h">
@@ -453,6 +462,15 @@
     <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h">
       <Filter>Threadwrapper</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\constraint.h">
+      <Filter>Constraint</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h">
+      <Filter>Constraint</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\gop.h">
+      <Filter>Control</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\extras\x86inc.asm">
kvazaar-1.3.0.tar.gz/configure.ac -> kvazaar-2.0.0.tar.gz/configure.ac
Changed
@@ -22,7 +22,7 @@
 #   - Increment when making new releases and major or minor was not changed since last release.
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
-ver_major=4
+ver_major=6
 ver_minor=2
 ver_release=0
@@ -45,16 +45,31 @@
 LT_INIT([win32-dll])

+AC_CANONICAL_HOST
+
+flag_gcc_on_mingw="false"
+case x"${host_os}" in
+  x"cygwin"*|x"mingw"*)
+    if test x"${CC}" = x"gcc" ; then
+      flag_gcc_on_mingw="true"
+    fi
+esac
+
 AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"])
 AX_CHECK_COMPILE_FLAG([-mavx2],   [flag_avx2="true"])
 AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"])
 AX_CHECK_COMPILE_FLAG([-msse2],   [flag_sse2="true"])
 AX_CHECK_COMPILE_FLAG([-mbmi],    [flag_bmi="true"])
 AX_CHECK_COMPILE_FLAG([-mabm],    [flag_abm="true"])
+AX_CHECK_COMPILE_FLAG([-mpopcnt], [flag_popcnt="true"])
+AX_CHECK_COMPILE_FLAG([-mlzcnt],  [flag_lzcnt="true"])
 AX_CHECK_COMPILE_FLAG([-mbmi2],   [flag_bmi2="true"])

+# Do we need -mpopcnt and -mlzcnt, or -mabm to use POPCNT and LZCNT
+# instructions? Ask GCC and Clang, and they have different answers.
 AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"])
-AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true"])
+AM_CONDITIONAL([HAVE_AVX2_GCC], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true" -a x"$flag_gcc_on_mingw" = x"false"])
+AM_CONDITIONAL([HAVE_AVX2_CLANG], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_popcnt" = x"true" -a x"$flag_lzcnt" = x"true" -a x"$flag_bmi2" = x"true" -a x"$flag_gcc_on_mingw" = x"false"])
 AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"])
 AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"])
@@ -130,6 +145,15 @@
       ]
     )
   ],
+  [midipix*], [
+    AS_IF(
+      [test "x$BITS" = "x32"], [
+        ASFLAGS="$ASFLAGS -fwin32 -DPREFIX -DHAVE_ALIGNED_STACK=0"
+      ], [
+        ASFLAGS="$ASFLAGS -fwin64 -DHAVE_ALIGNED_STACK=1"
+      ]
+    )
+  ],
   [linux*|*kfreebsd*], [
     ASFLAGS="$ASFLAGS -f elf$BITS"
     LDFLAGS="$LDFLAGS -Wl,-z,noexecstack"
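The split of the old HAVE_AVX2 conditional into HAVE_AVX2_GCC and HAVE_AVX2_CLANG exists because GCC gates the POPCNT/LZCNT instructions behind `-mabm`, while Clang expects explicit `-mpopcnt` and `-mlzcnt`; both variants are additionally disabled for GCC on MinGW. A rough Python model of that decision (a hypothetical helper, simplified from the configure.ac/Makefile.am logic — in the real build both conditionals can be defined and the Makefile assignment order decides):

```python
def avx2_cflags(accepted_flags, gcc_on_mingw):
    """Return the AVX2 CFLAGS to use, or None if AVX2 stays disabled.

    accepted_flags: set of -m flags the compiler accepted
    (the AX_CHECK_COMPILE_FLAG probes).
    """
    if gcc_on_mingw:
        # Known GCC-on-MinGW issue: AVX2 is disabled entirely.
        return None
    base = {"-mavx2", "-mbmi", "-mbmi2"}
    if base | {"-mabm"} <= accepted_flags:                 # GCC-style spelling
        return "-mavx2 -mbmi -mabm -mbmi2"
    if base | {"-mpopcnt", "-mlzcnt"} <= accepted_flags:   # Clang-style spelling
        return "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2"
    return None

print(avx2_cflags({"-mavx2", "-mbmi", "-mbmi2", "-mabm"}, False))
```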
kvazaar-1.3.0.tar.gz/doc/kvazaar.1 -> kvazaar-2.0.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,4 +1,4 @@
-.TH KVAZAAR "1" "July 2019" "kvazaar v1.3.0" "User Commands"
+.TH KVAZAAR "1" "April 2020" "kvazaar v2.0.0" "User Commands"
 .SH NAME
 kvazaar \- open source HEVC encoder
 .SH SYNOPSIS
@@ -115,20 +115,25 @@
 Number of reference frames, in range 1..15 [4]
 .TP
 \fB\-\-gop <string>
-GOP structure [8]
-  \- 0: Disabled
-  \- 8: B\-frame pyramid of length 8
-  \- lp\-<string>: Low\-delay P\-frame GOP
+GOP structure [lp\-g4d3t1]
+  \- 0: Disabled
+  \- 8: B\-frame pyramid of length 8
+  \- 16: B\-frame pyramid of length 16
+  \- lp\-<string>: Low\-delay P/B\-frame GOP (e.g. lp\-g8d4t2, see README)
 .TP
-\fB\-\-(no\-)open\-gop
+\fB\-\-intra\-qp\-offset <int>: QP offset for intra frames [\-51..51] [auto]
+  \- N: Set QP offset to N.
+  \- auto: Select offset automatically based
+    on GOP length.
+.TP
+\fB\-\-(no\-)open\-gop
 Use open GOP configuration. [enabled]
 .TP
 \fB\-\-cqmfile <filename>
 Read custom quantization matrices from a file.
 .TP
-\fB\-\-scaling-list <string>
-Set scaling list mode. [off]
+\fB\-\-scaling\-list <string>: Set scaling list mode. [off]
 \- off: Disable scaling lists.
 \- custom: use custom list (with \-\-cqmfile).
 \- default: Use default lists.
@@ -138,6 +143,20 @@
 \- 0: Disable rate control.
 \- N: Target N bits per second.
 .TP
+\fB\-\-rc\-algorithm <string>: Select used rc\-algorithm. [lambda]
+  \- lambda: rate control from:
+    DOI: 10.1109/TIP.2014.2336550
+  \- oba: DOI: 10.1109/TCSVT.2016.2589878
+.TP
+\fB\-\-(no\-)intra\-bits
+Use Hadamard cost based allocation for intra
+frames. Default on for gop 8 and off for lp\-gop
+.TP
+\fB\-\-(no\-)clip\-neighbour
+On oba based rate control whether to clip
+lambda values to same frame's ctus or previous'.
+Default on for RA GOPS and disabled for LP.
+.TP
 \fB\-\-(no\-)lossless
 Use lossless coding. [disabled]
 .TP
@@ -176,6 +195,11 @@
 Used with \-\-level. Use high tier bitrate limits
 instead of the main tier limits during encoding.
 High tier requires level 4 or higher.
+.TP
+\fB\-\-(no\-)vaq <integer>
+Enable variance adaptive quantization with given
+strength, in range 1..20. Recommended: 5.
+[disabled]
 .SS "Compression tools:"
 .TP
@@ -218,6 +242,10 @@
 Rate\-distortion optimized motion vector costs
 [disabled]
 .TP
+\fB\-\-(no\-)zero\-coeff\-rdo
+If a CU is set inter, check if forcing zero
+residual improves the RD cost. [enabled]
+.TP
 \fB\-\-(no\-)full\-intra\-search
 Try all intra modes during rough search.
 [disabled]
@@ -248,10 +276,23 @@
 \fB\-\-pu\-depth\-inter <int>\-<int>
 Inter prediction units sizes [0\-3]
 \- 0, 1, 2, 3: from 64x64 to 8x8
+  \- Accepts a list of values separated by ','
+    for setting separate depths per GOP layer
+    (values can be omitted to use the first
+    value for the respective layer).
 .TP
 \fB\-\-pu\-depth\-intra <int>\-<int>
 Intra prediction units sizes [1\-4]
 \- 0, 1, 2, 3, 4: from 64x64 to 4x4
+  \- Accepts a list of values separated by ','
+    for setting separate depths per GOP layer
+    (values can be omitted to use the first
+    value for the respective layer).
+.TP
+\fB\-\-ml\-pu\-depth\-intra
+Predict the pu\-depth\-intra using machine
+learning trees, overrides the
+\-\-pu\-depth\-intra parameter. [disabled]
 .TP
 \fB\-\-tr\-depth\-intra <int>
 Transform split depth for intra blocks [0]
@@ -282,7 +323,8 @@
 Try to find skip cu from merge candidates.
 Perform no further search if skip is found.
 For rd=0..1: Try the first candidate.
-For rd=2.. : Try the best candidate based
+For rd=2..
+Try the best candidate based
 on luma satd cost. [enabled]
 .TP
 \fB\-\-max\-merge <integer>
@@ -336,6 +378,15 @@
 \- tiles: Put tiles in independent slices.
 \- wpp: Put rows in dependent slices.
 \- tiles+wpp: Do both.
+.TP
+\fB\-\-partial\-coding <x\-offset>!<y\-offset>!<slice\-width>!<slice\-height>
+
+Encode partial frame.
+Parts must be merged to form a valid bitstream.
+X and Y are CTU offsets.
+Slice width and height must be divisible by CTU
+in pixels unless it is the last CTU row/column.
+This parameter is used by kvaShare.
 .SS "Video Usability Information:"
 .TP
kvazaar-1.3.0.tar.gz/src/Makefile.am -> kvazaar-2.0.0.tar.gz/src/Makefile.am
Changed
@@ -53,6 +53,8 @@
 	checkpoint.h \
 	cfg.c \
 	cfg.h \
+	constraint.c \
+	constraint.h \
 	context.c \
 	context.h \
 	cu.c \
@@ -72,6 +74,7 @@
 	filter.c \
 	filter.h \
 	global.h \
+	gop.h \
 	image.c \
 	image.h \
 	imagelist.c \
@@ -85,6 +88,8 @@
 	kvazaar.c \
 	kvazaar_internal.h \
 	kvz_math.h \
+	ml_intra_cu_depth_pred.c \
+	ml_intra_cu_depth_pred.h \
 	nal.c \
 	nal.h \
 	rate_control.c \
@@ -126,6 +131,9 @@
 	strategies/generic/sao-generic.h \
 	strategies/generic/encode_coding_tree-generic.c \
 	strategies/generic/encode_coding_tree-generic.h \
+	strategies/missing-intel-intrinsics.h \
+	strategies/optimized_sad_func_ptr_t.h \
+	strategies/generic/sao_shared_generics.h \
 	strategies/strategies-common.h \
 	strategies/strategies-dct.c \
 	strategies/strategies-dct.h \
@@ -179,6 +187,7 @@
 	strategies/altivec/picture-altivec.h

 libavx2_la_SOURCES = \
+	strategies/avx2/avx2_common_functions.h \
 	strategies/avx2/dct-avx2.c \
 	strategies/avx2/dct-avx2.h \
 	strategies/avx2/intra-avx2.c \
@@ -189,6 +198,7 @@
 	strategies/avx2/picture-avx2.h \
 	strategies/avx2/quant-avx2.c \
 	strategies/avx2/quant-avx2.h \
+	strategies/avx2/reg_sad_pow2_widths-avx2.h \
 	strategies/avx2/sao-avx2.c \
 	strategies/avx2/sao-avx2.h \
 	strategies/avx2/encode_coding_tree-avx2.c \
@@ -200,7 +210,8 @@

 libsse41_la_SOURCES = \
 	strategies/sse41/picture-sse41.c \
-	strategies/sse41/picture-sse41.h
+	strategies/sse41/picture-sse41.h \
+	strategies/sse41/reg_sad_pow2_widths-sse41.h

 if HAVE_PPC
@@ -212,9 +223,12 @@

 if HAVE_X86

-if HAVE_AVX2
+if HAVE_AVX2_GCC
 libavx2_la_CFLAGS = -mavx2 -mbmi -mabm -mbmi2
 endif
+if HAVE_AVX2_CLANG
+libavx2_la_CFLAGS = -mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2
+endif
 if HAVE_SSE4_1
 libsse41_la_CFLAGS = -msse4.1
 endif
kvazaar-1.3.0.tar.gz/src/cabac.c -> kvazaar-2.0.0.tar.gz/src/cabac.c
Changed
@@ -309,14 +309,14 @@
       else if(r_param==2) {
         if( base_level ==1) {
-          uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2); 
+          uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2);
           state->crypto_prev_pos = ( Suffix + ( state->crypto_prev_pos^key ) ) & 3;
           CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
           //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
         } else if( base_level ==2) {
           if(codeNumber<=7 || codeNumber>=12) {
-            uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2); 
+            uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2);
             state->crypto_prev_pos = ( Suffix + ( state->crypto_prev_pos^key ) ) & 3;
             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
             //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
@@ -365,7 +365,7 @@
             //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
           } else if(codeNumber<=21){
-            uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 1); 
+            uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 1);
             state->crypto_prev_pos = 4+(( (Suffix&1) + ( state->crypto_prev_pos^key )) & 1);
             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
             //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
kvazaar-1.3.0.tar.gz/src/cabac.h -> kvazaar-2.0.0.tar.gz/src/cabac.h
Changed
@@ -106,9 +106,9 @@
 void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
                                   uint32_t r_param);
 void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol,
-                                        const uint32_t r_param, int32_t base_level); 
+                                        const uint32_t r_param, int32_t base_level);
 void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
-                                  uint32_t symbol, uint32_t count); 
+                                  uint32_t symbol, uint32_t count);
 void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
                                       uint32_t symbol, int32_t offset,
                                       uint32_t max_symbol);
kvazaar-1.3.0.tar.gz/src/cfg.c -> kvazaar-2.0.0.tar.gz/src/cfg.c
Changed
@@ -19,6 +19,7 @@
  ****************************************************************************/

 #include "cfg.h"
+#include "gop.h"

 #include <limits.h>
 #include <stdio.h>
@@ -40,6 +41,8 @@
   cfg->framerate_num = 25;
   cfg->framerate_denom = 1;
   cfg->qp = 22;
+  cfg->intra_qp_offset = 0;
+  cfg->intra_qp_offset_auto = true;
   cfg->intra_period = 64;
   cfg->vps_period = 0;
   cfg->deblock_enable = 1;
@@ -98,10 +101,14 @@
   cfg->cpuid = 1;

   // Defaults for what sizes of PUs are tried.
-  cfg->pu_depth_inter.min = 2; // 0-3
-  cfg->pu_depth_inter.max = 3; // 0-3
-  cfg->pu_depth_intra.min = 2; // 0-4
-  cfg->pu_depth_intra.max = 3; // 0-4
+  memset( cfg->pu_depth_inter.min, -1, sizeof( cfg->pu_depth_inter.min ) );
+  memset( cfg->pu_depth_inter.max, -1, sizeof( cfg->pu_depth_inter.max ) );
+  memset( cfg->pu_depth_intra.min, -1, sizeof( cfg->pu_depth_intra.min ) );
+  memset( cfg->pu_depth_intra.max, -1, sizeof( cfg->pu_depth_intra.max ) );
+  *cfg->pu_depth_inter.min = 2; // 0-3
+  *cfg->pu_depth_inter.max = 3; // 0-3
+  *cfg->pu_depth_intra.min = 2; // 0-4
+  *cfg->pu_depth_intra.max = 3; // 0-4

   cfg->add_encoder_info = true;
   cfg->calc_psnr = true;
@@ -136,11 +143,26 @@
   cfg->me_max_steps = (uint32_t)-1;

+  cfg->vaq = 0;
+
   cfg->scaling_list = KVZ_SCALING_LIST_OFF;

   cfg->max_merge = 5;
   cfg->early_skip = true;

+  cfg->ml_pu_depth_intra = false;
+
+  cfg->partial_coding.startCTU_x = 0;
+  cfg->partial_coding.startCTU_y = 0;
+  cfg->partial_coding.fullWidth = 0;
+  cfg->partial_coding.fullHeight = 0;
+
+  cfg->zero_coeff_rdo = true;
+
+  cfg->rc_algorithm = KVZ_NO_RC;
+  cfg->intra_bit_allocation = false;
+  cfg->clip_neighbour = true;
+
   return 1;
 }
@@ -297,6 +319,45 @@
   return 1;
 }

+static int parse_pu_depth_list( const char *array, int32_t *depths_min, int32_t *depths_max, int size )
+{
+  char *list = strdup( array );
+  char *token;
+  int i = 0;
+  int ptr = -1;
+  int len = strlen( list );
+  int retval = 1;
+
+  //Reset depths in case multiple pu depth parameters are given
+  if(size > 1) memset( depths_max + 1, -1, (size - 1) * sizeof( *depths_max ) );
+  if(size > 1) memset( depths_min + 1, -1, (size - 1) * sizeof( *depths_min ) );
+
+  token = strtok( list, "," );
+  while( ptr < len && list[ptr + 1] == ',' )
+  {
+    i++;
+    ptr++;
+  }
+  while( retval && token != NULL && i < size ) {
+    retval &= (sscanf( token, "%d-%d", &depths_min[i], &depths_max[i] ) == 2);
+    ptr += (retval ? 4 : 0);
+    i++;
+    token = strtok( NULL, "," );
+    while(ptr < len && list[ptr + 1] == ',' ){
+      i++;
+      ptr++;
+    }
+  }
+
+  if( i >= size && ( token != NULL ) ) {
+    fprintf( stderr, "parsing failed : too many values.\n" );
+    retval = 0;
+  }
+
+  free( list );
+  return retval;
+}
+
 static int parse_slice_specification(const char* const arg, int32_t * const nslices, int32_t** const array) {
   const char* current_arg = NULL;
   int32_t current_value;
@@ -386,19 +447,21 @@
   static const char * const scaling_list_names[] = { "off", "custom", "default", NULL };

+  static const char * const rc_algorithm_names[] = { "no-rc", "lambda", "oba", NULL };
+
   static const char * const preset_values[11][25*2] = {
     {
       "ultrafast",
       "rd", "0",
       "pu-depth-intra", "2-3",
-      "pu-depth-inter", "2-3",
+      "pu-depth-inter", "1-2",
       "me", "hexbs",
-      "gop", "lp-g4d4t1",
+      "gop", "8",
       "ref", "1",
-      "bipred", "0",
+      "bipred", "1",
       "deblock", "0:0",
       "signhide", "0",
-      "subme", "2",
+      "subme", "0",
       "sao", "off",
       "rdoq", "0",
       "rdoq-skip", "0",
@@ -419,11 +482,11 @@
       "superfast",
       "rd", "0",
       "pu-depth-intra", "2-3",
-      "pu-depth-inter", "2-3",
+      "pu-depth-inter", "1-2",
       "me", "hexbs",
-      "gop", "lp-g4d4t1",
+      "gop", "8",
       "ref", "1",
-      "bipred", "0",
+      "bipred", "1",
       "deblock", "0:0",
       "signhide", "0",
       "subme", "2",
@@ -449,9 +512,9 @@
       "pu-depth-intra", "2-3",
       "pu-depth-inter", "1-3",
       "me", "hexbs",
-      "gop", "lp-g4d4t1",
+      "gop", "8",
       "ref", "1",
-      "bipred", "0",
+      "bipred", "1",
       "deblock", "0:0",
       "signhide", "0",
       "subme", "2",
@@ -477,9 +540,9 @@
       "pu-depth-intra", "2-3",
       "pu-depth-inter", "1-3",
       "me", "hexbs",
-      "gop", "lp-g4d4t1",
+      "gop", "8",
       "ref", "1",
-      "bipred", "0",
+      "bipred", "1",
       "deblock", "0:0",
       "signhide", "0",
       "subme", "4",
@@ -505,9 +568,9 @@
       "pu-depth-intra", "1-3",
       "pu-depth-inter", "1-3",
       "me", "hexbs",
-      "gop", "lp-g4d4t1",
+      "gop", "8",
       "ref", "2",
-      "bipred", "0",
+      "bipred", "1",
       "deblock", "0:0",
       "signhide", "0",
       "subme", "4",
@@ -533,9 +596,9 @@
       "pu-depth-intra", "1-4",
       "pu-depth-inter", "0-3",
       "me", "hexbs",
-      "gop", "8",
+      "gop", "16",
       "ref", "4",
-      "bipred", "0",
+      "bipred", "1",
       "deblock", "0:0",
       "signhide", "0",
       "subme", "4",
@@ -557,11 +620,11 @@
     },
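The new `parse_pu_depth_list` above accepts comma-separated `min-max` pairs, one per GOP layer, and the README states that omitted slots fall back to the first value. A simplified Python model of that behaviour (this is not the actual C parser, which fills `-1` sentinels into the config arrays and resolves fallbacks later in the encoder):

```python
def parse_pu_depth_list(spec, layers):
    """Parse e.g. '1-2,,3-3' into per-layer (min, max) depth pairs.

    Empty slots and unspecified trailing layers reuse the first pair,
    modelling "values can be omitted to use the first value for the
    respective layer".
    """
    slots = spec.split(",")
    if len(slots) > layers:
        raise ValueError("too many values")
    pairs = []
    for slot in slots:
        if slot == "":
            pairs.append(None)  # omitted: resolved to the first pair below
        else:
            lo, hi = slot.split("-")
            pairs.append((int(lo), int(hi)))
    if pairs[0] is None:
        raise ValueError("first layer must be specified")
    # Pad missing trailing layers, then substitute the first pair for gaps.
    pairs += [None] * (layers - len(pairs))
    return [p if p is not None else pairs[0] for p in pairs]

print(parse_pu_depth_list("1-2,,3-3", 4))  # [(1, 2), (1, 2), (3, 3), (1, 2)]
```

So `--pu-depth-inter 1-2,,3-3` with a four-layer GOP would search depths 1-2 everywhere except layer 3, which searches 3-3.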
kvazaar-1.3.0.tar.gz/src/cli.c -> kvazaar-2.0.0.tar.gz/src/cli.c
Changed
@@ -133,10 +133,22 @@
   { "set-qp-in-cu",             no_argument,       NULL, 0 },
   { "open-gop",                 no_argument,       NULL, 0 },
   { "no-open-gop",              no_argument,       NULL, 0 },
+  { "vaq",                      required_argument, NULL, 0 },
+  { "no-vaq",                   no_argument,       NULL, 0 },
   { "scaling-list",             required_argument, NULL, 0 },
   { "max-merge",                required_argument, NULL, 0 },
   { "early-skip",               no_argument,       NULL, 0 },
   { "no-early-skip",            no_argument,       NULL, 0 },
+  { "ml-pu-depth-intra",        no_argument,       NULL, 0 },
+  { "partial-coding",           required_argument, NULL, 0 },
+  { "zero-coeff-rdo",           no_argument,       NULL, 0 },
+  { "no-zero-coeff-rdo",        no_argument,       NULL, 0 },
+  { "intra-qp-offset",          required_argument, NULL, 0 },
+  { "rc-algorithm",             required_argument, NULL, 0 },
+  { "intra-bits",               no_argument,       NULL, 0 },
+  { "no-intra-bits",            no_argument,       NULL, 0 },
+  { "clip-neighbour",           no_argument,       NULL, 0 },
+  { "no-clip-neighbour",        no_argument,       NULL, 0 },
   {0, 0, 0, 0}
 };
@@ -396,11 +408,16 @@
     "                                   - 0: Only send VPS with the first frame.\n"
     "                                   - N: Send VPS with every Nth intra frame.\n"
     "  -r, --ref <integer>          : Number of reference frames, in range 1..15 [4]\n"
-    "      --gop <string>           : GOP structure [8]\n"
-    "                                   - 0: Disabled\n"
-    "                                   - 8: B-frame pyramid of length 8\n"
-    "                                   - lp-<string>: Low-delay P-frame GOP\n"
+    "      --gop <string>           : GOP structure [lp-g4d3t1]\n"
+    "                                   - 0: Disabled\n"
+    "                                   - 8: B-frame pyramid of length 8\n"
+    "                                   - 16: B-frame pyramid of length 16\n"
+    "                                   - lp-<string>: Low-delay P/B-frame GOP\n"
     "                                     (e.g. lp-g8d4t2, see README)\n"
+    "      --intra-qp-offset <int>: QP offset for intra frames [-51..51] [auto]\n"
+    "                                   - N: Set QP offset to N.\n"
+    "                                   - auto: Select offset automatically based\n"
+    "                                     on GOP length.\n"
    "      --(no-)open-gop          : Use open GOP configuration. [enabled]\n"
    "      --cqmfile <filename>     : Read custom quantization matrices from a file.\n"
    "      --scaling-list <string>: Set scaling list mode. [off]\n"
@@ -410,6 +427,15 @@
     "      --bitrate <integer>      : Target bitrate [0]\n"
     "                                   - 0: Disable rate control.\n"
     "                                   - N: Target N bits per second.\n"
+    "      --rc-algorithm <string>: Select used rc-algorithm. [lambda]\n"
+    "                                   - lambda: rate control from:\n"
+    "                                     DOI: 10.1109/TIP.2014.2336550 \n"
+    "                                   - oba: DOI: 10.1109/TCSVT.2016.2589878\n"
+    "      --(no-)intra-bits        : Use Hadamard cost based allocation for intra\n"
+    "                                 frames. Default on for gop 8 and off for lp-gop\n"
+    "      --(no-)clip-neighbour    : On oba based rate control whether to clip \n"
+    "                                 lambda values to same frame's ctus or previous'.\n"
+    "                                 Default on for RA GOPS and disabled for LP.\n"
     "      --(no-)lossless          : Use lossless coding. [disabled]\n"
     "      --mv-constraint <string> : Constrain movement vectors. [none]\n"
     "                                   - none: No constraint\n"
@@ -433,6 +459,9 @@
     "      --high-tier              : Used with --level. Use high tier bitrate limits\n"
     "                                 instead of the main tier limits during encoding.\n"
     "                                 High tier requires level 4 or higher.\n"
+    "      --(no-)vaq <integer>     : Enable variance adaptive quantization with given\n"
+    "                                 strength, in range 1..20. Recommended: 5.\n"
+    "                                 [disabled]\n"
     "\n"
     /* Word wrap to this width to stay under 80 characters (including ") *************/
     "Compression tools:\n"
@@ -457,6 +486,8 @@
     "                                 chroma mode search.\n"
     "      --(no-)mv-rdo            : Rate-distortion optimized motion vector costs\n"
     "                                 [disabled]\n"
+    "      --(no-)zero-coeff-rdo    : If a CU is set inter, check if forcing zero\n"
+    "                                 residual improves the RD cost. [enabled]\n"
     "      --(no-)full-intra-search : Try all intra modes during rough search.\n"
     "                                 [disabled]\n"
     "      --(no-)transform-skip    : Try transform skip [disabled]\n"
@@ -476,8 +507,19 @@
     "                                   - 4: + 1/4-pixel diagonal\n"
     "      --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]\n"
     "                                   - 0, 1, 2, 3: from 64x64 to 8x8\n"
+    "                                   - Accepts a list of values separated by ','\n"
+    "                                     for setting separate depths per GOP layer\n"
+    "                                     (values can be omitted to use the first\n"
+    "                                     value for the respective layer).\n"
     "      --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]\n"
     "                                   - 0, 1, 2, 3, 4: from 64x64 to 4x4\n"
+    "                                   - Accepts a list of values separated by ','\n"
+    "                                     for setting separate depths per GOP layer\n"
+    "                                     (values can be omitted to use the first\n"
+    "                                     value for the respective layer).\n"
+    "      --ml-pu-depth-intra      : Predict the pu-depth-intra using machine\n"
+    "                                 learning trees, overrides the\n"
+    "                                 --pu-depth-intra parameter. [disabled]\n"
     "      --tr-depth-intra <int>   : Transform split depth for intra blocks [0]\n"
     "      --(no-)bipred            : Bi-prediction [disabled]\n"
     "      --cu-split-termination <string> : CU split search termination [zero]\n"
@@ -531,6 +573,13 @@
     "                                   - tiles: Put tiles in independent slices.\n"
     "                                   - wpp: Put rows in dependent slices.\n"
     "                                   - tiles+wpp: Do both.\n"
+    "      --partial-coding <x-offset>!<y-offset>!<slice-width>!<slice-height>\n"
+    "                               : Encode partial frame.\n"
+    "                                 Parts must be merged to form a valid bitstream.\n"
+    "                                 X and Y are CTU offsets.\n"
+    "                                 Slice width and height must be divisible by CTU\n"
+    "                                 in pixels unless it is the last CTU row/column.\n"
+    "                                 This parameter is used by kvaShare.\n"
     "\n"
     /* Word wrap to this width to stay under 80 characters (including ") *************/
     "Video Usability Information:\n"
@@ -564,13 +613,16 @@
 void print_frame_info(const kvz_frame_info *const info,
                       const double frame_psnr[3],
                       const uint32_t bytes,
-                      const bool print_psnr)
+                      const bool print_psnr,
+                      const double avg_qp)
 {
-  fprintf(stderr, "POC %4d QP %2d (%c-frame) %10d bits",
+  fprintf(stderr, "POC %4d QP %2d AVG QP %.1f (%c-frame) %10d bits",
           info->poc,
           info->qp,
+          avg_qp,
           "BPI"[info->slice_type % 3],
           bytes << 3);
+
   if (print_psnr) {
     fprintf(stderr, " PSNR Y %2.4f U %2.4f V %2.4f",
             frame_psnr[0], frame_psnr[1], frame_psnr[2]);
kvazaar-1.3.0.tar.gz/src/cli.h -> kvazaar-2.0.0.tar.gz/src/cli.h
Changed
@@ -58,6 +58,7 @@ void print_frame_info(const kvz_frame_info *const info, const double frame_psnr[3], const uint32_t bytes, - const bool print_psnr); + const bool print_psnr, + const double avg_qp); #endif
kvazaar-2.0.0.tar.gz/src/constraint.c
Added
@@ -0,0 +1,59 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "constraint.h" + + /** + * \brief Allocate the constraint_t structure. + * + * \param state encoder state + * \return the pointer of constraint_t structure + */ +void * kvz_init_constraint(encoder_state_t* state, const encoder_control_t * const encoder) { + constraint_t* constr = NULL; + // Allocate the constraint_t strucutre + constr = MALLOC(constraint_t, 1); + if (!constr) { + fprintf(stderr, "Memory allocation failed!\n"); + assert(0); + } + + // Allocate the ml_intra_ctu_pred_t structure + constr->ml_intra_depth_ctu = NULL; + if (encoder->cfg.ml_pu_depth_intra) // TODO: Change this by a new param !! + { + constr->ml_intra_depth_ctu = kvz_init_ml_intra_depth_const(); + } + return constr; +} + +/** + * \brief Deallocate the constraint_t structure. 
+ * + * \param state encoder state + */ +void kvz_constraint_free(encoder_state_t* state) { + constraint_t* constr = state->constraint; + if (constr->ml_intra_depth_ctu) + { + kvz_end_ml_intra_depth_const(constr->ml_intra_depth_ctu); + } + FREE_POINTER(constr); +}
kvazaar-2.0.0.tar.gz/src/constraint.h
Added
@@ -0,0 +1,40 @@ +#ifndef CONSTRAINT_H_ +#define CONSTRAINT_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_intra_cu_depth_pred.h" +#include "encoderstate.h" + + + /* Constraint structure: + * Each field corresponds to a constraint technique. The encoder tests if the constraint + * pointer is allocated to apply the technique. + */ +typedef struct { + // Structure used for the CTU depth prediction using Machine Learning in All Intra + ml_intra_ctu_pred_t * ml_intra_depth_ctu; +} constraint_t; + + +void * kvz_init_constraint(encoder_state_t* state, const encoder_control_t * const); +void kvz_constraint_free(encoder_state_t* state); + +#endif \ No newline at end of file
kvazaar-1.3.0.tar.gz/src/encmain.c -> kvazaar-2.0.0.tar.gz/src/encmain.c
Changed
@@ -305,6 +305,10 @@ } while (picture_written); } +static double calc_avg_qp(uint64_t qp_sum, uint32_t frames_done) +{ + return (double)qp_sum / (double)frames_done; +} /** * \brief Program main function. @@ -432,6 +436,7 @@ uint64_t bitstream_length = 0; uint32_t frames_done = 0; double psnr_sum[3] = { 0.0, 0.0, 0.0 }; + uint64_t qp_sum = 0; // how many bits have been written this second? used for checking if framerate exceeds level's limits uint64_t bits_this_second = 0; @@ -597,12 +602,15 @@ opts->config->height); } + qp_sum += info_out.qp; frames_done += 1; + psnr_sum[0] += frame_psnr[0]; psnr_sum[1] += frame_psnr[1]; psnr_sum[2] += frame_psnr[2]; - print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr); + print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr, + calc_avg_qp(qp_sum, frames_done)); } api->picture_free(cur_in_img); @@ -632,12 +640,38 @@ fprintf(stderr, " Total CPU time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC); { + const double mega = (double)(1 << 20); + double encoding_time = ( (double)(encoding_end_cpu_time - encoding_start_cpu_time) ) / (double) CLOCKS_PER_SEC; double wall_time = KVZ_CLOCK_T_AS_DOUBLE(encoding_end_real_time) - KVZ_CLOCK_T_AS_DOUBLE(encoding_start_real_time); - fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); + + double encoding_cpu = 100.0 * encoding_time / wall_time; + double encoding_fps = (double)frames_done / wall_time; + + double n_bits = (double)(bitstream_length * 8); + double sf_num = (double)encoder->cfg.framerate_num; + double sf_den = (double)encoder->cfg.framerate_denom; + double sequence_fps = sf_num / sf_den; + + double sequence_t = (double)frames_done / sequence_fps; + double bitrate_bps = (double)n_bits / sequence_t; + double bitrate_mbps = bitrate_bps / mega; + + double avg_qp = calc_avg_qp(qp_sum, frames_done); + +#ifdef _WIN32 + if (encoding_cpu > 100.0) { + encoding_cpu = 100.0; + } +#endif + fprintf(stderr, " Encoding time: %.3f s.\n", 
encoding_time); fprintf(stderr, " Encoding wall time: %.3f s.\n", wall_time); - fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_time/wall_time*100.f); - fprintf(stderr, " FPS: %.2f\n", ((double)frames_done)/wall_time); + + fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_cpu); + fprintf(stderr, " FPS: %.2f\n", encoding_fps); + + fprintf(stderr, " Bitrate: %.3f Mbps\n", bitrate_mbps); + fprintf(stderr, " AVG QP: %.1f\n", avg_qp); } pthread_join(input_thread, NULL); }
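The new end-of-run summary in encmain.c reduces to a few simple formulas: the cumulative average QP is the running QP sum over frames done, and the reported bitrate is total bits over the sequence duration implied by the configured frame rate (not wall-clock time). A minimal self-contained sketch of the same arithmetic; `calc_avg_qp` matches the helper added in the hunk, while `bitrate_mbps` is an illustrative name, not a kvazaar function:

```c
#include <assert.h>
#include <stdint.h>

/* Cumulative average QP, as printed per frame and in the final summary. */
static double calc_avg_qp(uint64_t qp_sum, uint32_t frames_done)
{
    return (double)qp_sum / (double)frames_done;
}

/* Sequence bitrate in Mbps: total bits divided by the sequence duration
 * derived from the configured frame rate (frames / fps), then scaled by 2^20. */
static double bitrate_mbps(uint64_t bitstream_bytes, uint32_t frames_done,
                           double fps_num, double fps_den)
{
    double n_bits = (double)bitstream_bytes * 8.0;
    double sequence_t = (double)frames_done / (fps_num / fps_den);
    return n_bits / sequence_t / (double)(1 << 20);
}
```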
kvazaar-1.3.0.tar.gz/src/encode_coding_tree.c -> kvazaar-2.0.0.tar.gz/src/encode_coding_tree.c
Changed
@@ -269,6 +269,8 @@ if (state->must_code_qp_delta) { const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); const int qp_delta = cur_cu->qp - qp_pred; + assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding."); + assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25]."); // This range applies only to 8-bit encoding const int qp_delta_abs = ABS(qp_delta); cabac_data_t* cabac = &state->cabac;
kvazaar-1.3.0.tar.gz/src/encoder.c -> kvazaar-2.0.0.tar.gz/src/encoder.c
Changed
@@ -27,7 +27,9 @@ #include <stdlib.h> #include "cfg.h" +#include "gop.h" #include "strategyselector.h" +#include "kvz_math.h" /** @@ -233,10 +235,26 @@ if (encoder->cfg.gop_len > 0) { if (encoder->cfg.gop_lowdelay) { - kvz_config_process_lp_gop(&encoder->cfg); + if (encoder->cfg.gop_len == 4 && encoder->cfg.ref_frames == 4) { + memcpy(encoder->cfg.gop, kvz_gop_lowdelay4, sizeof(kvz_gop_lowdelay4)); + } else { + kvz_config_process_lp_gop(&encoder->cfg); + } } + } + + if( encoder->cfg.intra_qp_offset_auto ) { + encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? -kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1 : 0; + } + + // Disable GOP and QP offset for all-intra coding + if (encoder->cfg.intra_period == 1) { + encoder->cfg.gop_len = 0; + encoder->cfg.intra_qp_offset = 0; } + encoder->poc_lsb_bits = MAX(4, kvz_math_ceil_log2(encoder->cfg.gop_len * 2 + 1)); + encoder->max_inter_ref_lcu.right = 1; encoder->max_inter_ref_lcu.down = 1; @@ -332,7 +350,9 @@ } encoder->target_avg_bpp = encoder->target_avg_bppic / encoder->in.pixels_per_pic; - if (!encoder_control_init_gop_layer_weights(encoder)) { + if (encoder->cfg.target_bitrate > 0 && + !encoder_control_init_gop_layer_weights(encoder)) + { goto init_failed; } @@ -356,7 +376,7 @@ // for SMP and AMP partition units. 
encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) { + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { encoder->max_qp_delta_depth = 0; } else { encoder->max_qp_delta_depth = -1; @@ -592,11 +612,16 @@ #endif //KVZ_DEBUG } - assert(WITHIN(encoder->cfg.pu_depth_inter.min, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)); - assert(WITHIN(encoder->cfg.pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)); - assert(WITHIN(encoder->cfg.pu_depth_intra.min, PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX)); - assert(WITHIN(encoder->cfg.pu_depth_intra.max, PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX)); + for( size_t i = 0; i < KVZ_MAX_GOP_LAYERS; i++ ) + { + if( encoder->cfg.pu_depth_inter.min[i] < 0 || cfg->pu_depth_inter.max[i] < 0 ) continue; + assert( WITHIN( encoder->cfg.pu_depth_inter.min[i], PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX ) ); + assert( WITHIN( encoder->cfg.pu_depth_inter.max[i], PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX ) ); + if( encoder->cfg.pu_depth_intra.min[i] < 0 || cfg->pu_depth_intra.max[i] < 0 ) continue; + assert( WITHIN( encoder->cfg.pu_depth_intra.min[i], PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX ) ); + assert( WITHIN( encoder->cfg.pu_depth_intra.max[i], PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX ) ); + } // Disable in-loop filters, sign hiding and transform skip when using // lossless coding. if (encoder->cfg.lossless) { @@ -722,7 +747,8 @@ * \return 1 on success, 0 on failure. * * Selects appropriate weights for layers according to the target bpp. - * Only GOP structures with exactly four layers are supported. + * Only GOP structures with exactly four layers are supported with the. + * exception of experimental GOP 16. 
*/ static int encoder_control_init_gop_layer_weights(encoder_control_t * const encoder) { @@ -795,10 +821,33 @@ } } break; - + case 5: + if(!encoder->cfg.gop_lowdelay) { + // These are obtained by running HM with RA GOP 16 collecting the ratio of bits spent for each + // layer from the CTC sequences and then fitting power curve + encoder->gop_layer_weights[0] = 13.0060187535 * pow(encoder->target_avg_bpp, -0.3727651453); + encoder->gop_layer_weights[1] = 7.3654107392 * pow(encoder->target_avg_bpp, -0.0854329266); + encoder->gop_layer_weights[2] = 3.6563990701 * pow(encoder->target_avg_bpp, -0.0576990493); + encoder->gop_layer_weights[3] = 2.1486937288 * pow(encoder->target_avg_bpp, -0.0155389471); + encoder->gop_layer_weights[4] = 1; + } + else { + fprintf(stderr, "Unsupported amount of layers (%d) for lowdelay GOP\n", num_layers); + return 0; + } + break; default: - fprintf(stderr, "Unsupported number of GOP layers (%d)\n", num_layers); - return 0; + if (!encoder->cfg.gop_lowdelay && encoder->cfg.gop_len == 16) { + fprintf(stdout, + "Rate control: Using experimental weights for GOP layers (%d)\n", + num_layers); + for (int i = 0; i < MAX_GOP_LAYERS; ++i) { + encoder->gop_layer_weights[i] = (i == 0) ? 10 : 2; + } + } else { + fprintf(stderr, "Unsupported number of GOP layers (%d)\n", num_layers); + return 0; + } } // Normalize weights so that the sum of weights in a GOP is one.
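The encoder.c hunk derives two values from the GOP length: the automatic intra QP offset (`-ceil_log2(gop_len) + 1` for hierarchical GOPs, so e.g. GOP 16 yields -3) and the number of bits used for pic_order_cnt_lsb (`MAX(4, ceil_log2(gop_len * 2 + 1))`). A self-contained sketch of the same arithmetic, where `ceil_log2` is a local stand-in for `kvz_math_ceil_log2`:

```c
#include <assert.h>

/* ceil(log2(x)) for x >= 1; a stand-in for kvz_math_ceil_log2. */
static int ceil_log2(unsigned x)
{
    int bits = 0;
    unsigned v = 1;
    while (v < x) { v <<= 1; ++bits; }
    return bits;
}

/* Automatic --intra-qp-offset: deeper GOP pyramids push intra frames
 * further below the base QP (GOP 8 -> -2, GOP 16 -> -3). */
static int auto_intra_qp_offset(int gop_len)
{
    return gop_len > 1 ? -ceil_log2((unsigned)gop_len) + 1 : 0;
}

/* Bits used for pic_order_cnt_lsb; never fewer than 4, matching the
 * log2_max_pic_order_cnt_lsb_minus4 SPS syntax element. */
static int poc_lsb_bits(int gop_len)
{
    int bits = ceil_log2((unsigned)(gop_len * 2 + 1));
    return bits > 4 ? bits : 4;
}
```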
kvazaar-1.3.0.tar.gz/src/encoder.h -> kvazaar-2.0.0.tar.gz/src/encoder.h
Changed
@@ -55,7 +55,7 @@ int32_t width_in_lcu; int32_t height_in_lcu; int32_t real_width; /*!< \brief real input picture width */ - int32_t real_height; /*!< \brief real input picture width */ + int32_t real_height; /*!< \brief real input picture height */ int64_t pixels_per_pic; int8_t source_scan_type; } in; @@ -133,6 +133,8 @@ int down; } max_inter_ref_lcu; + int32_t poc_lsb_bits; + } encoder_control_t; encoder_control_t* kvz_encoder_control_init(const kvz_config *cfg);
kvazaar-1.3.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-2.0.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -39,6 +39,7 @@ #include "tables.h" #include "threadqueue.h" #include "videoframe.h" +#include "rate_control.h" static void encoder_state_write_bitstream_aud(encoder_state_t * const state) @@ -346,8 +347,14 @@ WRITE_U(stream, 0, 1, "separate_colour_plane_flag"); } - WRITE_UE(stream, encoder->in.width, "pic_width_in_luma_samples"); - WRITE_UE(stream, encoder->in.height, "pic_height_in_luma_samples"); + if (encoder->cfg.partial_coding.fullWidth != 0) { + WRITE_UE(stream, encoder->cfg.partial_coding.fullWidth, "pic_width_in_luma_samples"); + WRITE_UE(stream, encoder->cfg.partial_coding.fullHeight, "pic_height_in_luma_samples"); + } + else { + WRITE_UE(stream, encoder->in.width, "pic_width_in_luma_samples"); + WRITE_UE(stream, encoder->in.height, "pic_height_in_luma_samples"); + } if (encoder->in.width != encoder->in.real_width || encoder->in.height != encoder->in.real_height) { // The standard does not seem to allow setting conf_win values such that @@ -371,18 +378,22 @@ WRITE_UE(stream, encoder->bitdepth-8, "bit_depth_luma_minus8"); WRITE_UE(stream, encoder->bitdepth-8, "bit_depth_chroma_minus8"); - WRITE_UE(stream, 1, "log2_max_pic_order_cnt_lsb_minus4"); + WRITE_UE(stream, encoder->poc_lsb_bits - 4, "log2_max_pic_order_cnt_lsb_minus4"); + WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag"); //for each layer if (encoder->cfg.gop_lowdelay) { - WRITE_UE(stream, encoder->cfg.ref_frames, "sps_max_dec_pic_buffering"); - WRITE_UE(stream, 0, "sps_num_reorder_pics"); + const int dpb = encoder->cfg.ref_frames; + WRITE_UE(stream, dpb - 1, "sps_max_dec_pic_buffering_minus1"); + WRITE_UE(stream, 0, "sps_max_num_reorder_pics"); } else { - WRITE_UE(stream, encoder->cfg.ref_frames + encoder->cfg.gop_len, "sps_max_dec_pic_buffering"); - WRITE_UE(stream, encoder->cfg.gop_len, "sps_num_reorder_pics"); + // Clip to non-negative values to prevent problems with GOP=0 + const int dpb = MIN(16, encoder->cfg.gop_len); + WRITE_UE(stream, MAX(dpb - 1, 0), 
"sps_max_dec_pic_buffering_minus1"); + WRITE_UE(stream, MAX(encoder->cfg.gop_len - 1, 0), "sps_max_num_reorder_pics"); } - WRITE_UE(stream, 0, "sps_max_latency_increase"); + WRITE_UE(stream, 0, "sps_max_latency_increase_plus1"); //end for WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3"); @@ -709,16 +720,18 @@ if (state->frame->pictype != KVZ_NAL_IDR_W_RADL && state->frame->pictype != KVZ_NAL_IDR_N_LP) { + const int poc_lsb = state->frame->poc & ((1 << encoder->poc_lsb_bits) - 1); + WRITE_U(stream, poc_lsb, encoder->poc_lsb_bits, "pic_order_cnt_lsb"); + int last_poc = 0; int poc_shift = 0; - WRITE_U(stream, state->frame->poc&0x1f, 5, "pic_order_cnt_lsb"); - WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag"); - WRITE_UE(stream, ref_negative, "num_negative_pics"); - WRITE_UE(stream, ref_positive, "num_positive_pics"); - for (j = 0; j < ref_negative; j++) { + WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag"); + WRITE_UE(stream, ref_negative, "num_negative_pics"); + WRITE_UE(stream, ref_positive, "num_positive_pics"); + for (j = 0; j < ref_negative; j++) { int8_t delta_poc = 0; - + if (encoder->cfg.gop_len) { int8_t found = 0; do { @@ -832,6 +845,11 @@ printf("=========== Slice ===========\n"); #endif + if (encoder->cfg.partial_coding.fullWidth != 0) { + state->slice->start_in_rs = encoder->cfg.partial_coding.startCTU_x + + CEILDIV(encoder->cfg.partial_coding.fullWidth, 64) * encoder->cfg.partial_coding.startCTU_y; + } + bool first_slice_segment_in_pic = (state->slice->start_in_rs == 0); if ((state->encoder_control->cfg.slices & KVZ_SLICES_WPP) && state->wfrow->lcu_offset_y > 0) @@ -854,6 +872,9 @@ } int lcu_cnt = encoder->in.width_in_lcu * encoder->in.height_in_lcu; + if (encoder->cfg.partial_coding.fullWidth != 0) { + lcu_cnt = CEILDIV(encoder->cfg.partial_coding.fullWidth, 64) * CEILDIV(encoder->cfg.partial_coding.fullHeight, 64); + } int num_bits = kvz_math_ceil_log2(lcu_cnt); int slice_start_rs = state->slice->start_in_rs; if 
(state->encoder_control->cfg.slices & KVZ_SLICES_WPP) { @@ -1043,8 +1064,11 @@ state->frame->total_bits_coded = state->previous_encoder_state->frame->total_bits_coded; } state->frame->total_bits_coded += newpos - curpos; + if(state->encoder_control->cfg.rc_algorithm == KVZ_OBA) { + kvz_update_after_picture(state); + } - state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded; + state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded; state->frame->cur_gop_bits_coded += newpos - curpos; }
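In the slice-header hunk above, the hard-coded 5-bit POC field (`poc & 0x1f`) is replaced by a field sized from the GOP length. The value written is simply the POC modulo 2^poc_lsb_bits; a small sketch of that masking:

```c
#include <assert.h>
#include <stdint.h>

/* pic_order_cnt_lsb as written in the slice header: the low poc_lsb_bits
 * bits of the picture order count. */
static uint32_t poc_lsb(uint32_t poc, int poc_lsb_bits)
{
    return poc & ((1u << poc_lsb_bits) - 1);
}
```

With only 5 bits, POC 33 would wrap to 1; with the 6 bits selected for GOP 16, it is preserved, which is why the field width now tracks the GOP length.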
kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-2.0.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -34,6 +34,7 @@ #include "kvazaar.h" #include "threadqueue.h" #include "videoframe.h" +#include "rate_control.h" static int encoder_state_config_frame_init(encoder_state_t * const state) { @@ -46,15 +47,39 @@ state->frame->num = 0; state->frame->poc = 0; state->frame->total_bits_coded = 0; + state->frame->cur_frame_bits_coded = 0; state->frame->cur_gop_bits_coded = 0; state->frame->prepared = 0; state->frame->done = 1; + state->frame->rc_alpha = 3.2003; state->frame->rc_beta = -1.367; + state->frame->icost = 0; const encoder_control_t * const encoder = state->encoder_control; const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu; - state->frame->lcu_stats = MALLOC(lcu_stats_t, num_lcus); + state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t)); + state->frame->aq_offsets = MALLOC(double, num_lcus); + + for (int y = 0; y < encoder->in.height_in_lcu; y++) { + for (int x = 0; x < encoder->in.width_in_lcu; x++) { + int temp = MIN(encoder->cfg.width - x * 64, 64) * MIN(encoder->cfg.height - y * 64, 64); + state->frame->lcu_stats[x + y * encoder->in.width_in_lcu].pixels = temp; + } + } + + state->frame->c_para = malloc(sizeof(double) * num_lcus); + if(state->frame->c_para == NULL) { + return 0; + } + state->frame->k_para = malloc(sizeof(double) * num_lcus); + if (state->frame->k_para == NULL) { + return 0; + } + + pthread_mutex_init(&state->frame->rc_lock, NULL); + + state->frame->new_ratecontrol = kvz_get_rc_data(NULL); return 1; } @@ -62,8 +87,13 @@ static void encoder_state_config_frame_finalize(encoder_state_t * const state) { if (state->frame == NULL) return; + pthread_mutex_destroy(&state->frame->rc_lock); + if (state->frame->c_para) FREE_POINTER(state->frame->c_para); + if (state->frame->k_para) FREE_POINTER(state->frame->k_para); + kvz_image_list_destroy(state->frame->ref); FREE_POINTER(state->frame->lcu_stats); + FREE_POINTER(state->frame->aq_offsets); } static int encoder_state_config_tile_init(encoder_state_t * const state, @@ 
-348,7 +378,9 @@ if (!child_state->slice) child_state->slice = parent_state->slice; if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow; } - + // Intialization of the constraint structure + child_state->constraint = kvz_init_constraint(child_state->constraint, child_state->encoder_control); + kvz_bitstream_init(&child_state->stream); // Set CABAC output bitstream @@ -681,7 +713,7 @@ for (i = 0; state->children[i].encoder_control; ++i) { kvz_encoder_state_finalize(&state->children[i]); } - + FREE_POINTER(state->children); } @@ -706,6 +738,11 @@ FREE_POINTER(state->frame); } + if (state->constraint) { + // End of the constraint structure + kvz_constraint_free(state); + } + kvz_bitstream_finalize(&state->stream); kvz_threadqueue_free_job(&state->tqj_recon_done);
kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.h -> kvazaar-2.0.0.tar.gz/src/encoder_state-ctors_dtors.h
Changed
@@ -27,7 +27,8 @@ */ #include "global.h" // IWYU pragma: keep - +#include "ml_intra_cu_depth_pred.h" +#include "constraint.h" // Forward declare because including the header would lead to a cyclic // dependency.
kvazaar-1.3.0.tar.gz/src/encoderstate.c -> kvazaar-2.0.0.tar.gz/src/encoderstate.c
Changed
@@ -37,6 +37,8 @@ #include "tables.h" #include "threadqueue.h" +#include "strategies/strategies-picture.h" + int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -616,7 +618,17 @@ const encoder_control_t * const encoder = state->encoder_control; videoframe_t* const frame = state->tile->frame; - kvz_set_lcu_lambda_and_qp(state, lcu->position); + switch (encoder->cfg.rc_algorithm) { + case KVZ_NO_RC: + case KVZ_LAMBDA: + kvz_set_lcu_lambda_and_qp(state, lcu->position); + break; + case KVZ_OBA: + kvz_set_ctu_qp_lambda(state, lcu->position); + break; + default: + assert(0); + } lcu_coeff_t coeff; state->coeff = &coeff; @@ -702,9 +714,27 @@ } } + pthread_mutex_lock(&state->frame->rc_lock); const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits; + state->frame->cur_frame_bits_coded += bits; + // This variable is used differently by intra and inter frames and shouldn't + // be touched in intra frames here + state->frame->remaining_weight -= !state->frame->is_irap ? 
+ kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->original_weight : + 0; + pthread_mutex_unlock(&state->frame->rc_lock); kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->bits = bits; + uint8_t not_skip = false; + for(int y = 0; y < 64 && !not_skip; y+=8) { + for(int x = 0; x < 64 && !not_skip; x+=8) { + not_skip |= !kvz_cu_array_at_const(state->tile->frame->cu_array, + lcu->position_px.x + x, + lcu->position_px.y + y)->skipped; + } + } + kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->skipped = !not_skip; + //Wavefronts need the context to be copied to the next row if (state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && lcu->index == 1) { int j; @@ -803,6 +833,11 @@ } kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]); + //TODO: Preparation for the lock free implementation of the new rc + if (ref_state->frame->slicetype == KVZ_SLICE_I && ref_state->frame->num != 0 && state->encoder_control->cfg.owf > 1 && true) { + kvz_threadqueue_job_dep_add(job[0], ref_state->previous_encoder_state->tile->wf_jobs[dep_lcu->id]); + } + // Very spesific bug that happens when owf length is longer than the // gop length. Takes care of that. if(!state->encoder_control->cfg.gop_lowdelay && @@ -1163,6 +1198,12 @@ kvz_threadqueue_free_job(&state->tqj_bitstream_written); kvz_threadqueue_free_job(&state->tqj_recon_done); + //Copy the constraint pointer + // TODO: Try to do it in the if (state->is_leaf) + //if (state->parent != NULL) { + // state->constraint = state->parent->constraint; + //} + for (int i = 0; state->children[i].encoder_control; ++i) { encoder_state_init_children(&state->children[i]); } @@ -1184,6 +1225,21 @@ } } +// Check if lcu is edge lcu. 
Return false if frame dimensions are 64 divisible +static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64) +{ + if (xdiv64 && ydiv64) { + return false; + } + int last_row_first_id = (lcus_y - 1) * lcus_x; + if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) { + return true; + } + else { + return false; + } +} + static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) { assert(state->type == ENCODER_STATE_TYPE_MAIN); @@ -1197,11 +1253,108 @@ state->tile->frame->height ); + // Variance adaptive quantization + if (cfg->vaq) { + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + double d = cfg->vaq * 0.1; // Empirically decided constant. Affects delta-QP strength + + // Calculate frame pixel variance + uint32_t len = state->tile->frame->width * state->tile->frame->height; + uint32_t c_len = len / 4; + double frame_var = kvz_pixel_var(state->tile->frame->source->y, len); + if (has_chroma) { + frame_var += kvz_pixel_var(state->tile->frame->source->u, c_len); + frame_var += kvz_pixel_var(state->tile->frame->source->v, c_len); + } + + // Loop through LCUs + // For each LCU calculate: D * (log(LCU pixel variance) - log(frame pixel variance)) + unsigned x_lim = state->tile->frame->width_in_lcu; + unsigned y_lim = state->tile->frame->height_in_lcu; + + unsigned id = 0; + for (int y = 0; y < y_lim; ++y) { + for (int x = 0; x < x_lim; ++x) { + kvz_pixel tmp[LCU_LUMA_SIZE]; + int pxl_x = x * LCU_WIDTH; + int pxl_y = y * LCU_WIDTH; + int x_max = MIN(pxl_x + LCU_WIDTH, frame->width) - pxl_x; + int y_max = MIN(pxl_y + LCU_WIDTH, frame->height) - pxl_y; + + bool xdiv64 = false; + bool ydiv64 = false; + if (frame->width % 64 == 0) xdiv64 = true; + if (frame->height % 64 == 0) ydiv64 = true; + + // Luma variance + if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) { + kvz_pixels_blit(&state->tile->frame->source->y[pxl_x + pxl_y * state->tile->frame->source->stride], 
tmp, + x_max, y_max, state->tile->frame->source->stride, LCU_WIDTH); + } else { + // Extend edge pixels for edge lcus + for (int y = 0; y < LCU_WIDTH; y++) { + for (int x = 0; x < LCU_WIDTH; x++) { + int src_y = CLIP(0, frame->height - 1, pxl_y + y); + int src_x = CLIP(0, frame->width - 1, pxl_x + x); + tmp[y * LCU_WIDTH + x] = state->tile->frame->source->y[src_y * state->tile->frame->source->stride + src_x]; + } + } + } + + double lcu_var = kvz_pixel_var(tmp, LCU_LUMA_SIZE); + + if (has_chroma) { + // Add chroma variance if not monochrome + int32_t c_stride = state->tile->frame->source->stride >> 1; + kvz_pixel chromau_tmp[LCU_CHROMA_SIZE]; + kvz_pixel chromav_tmp[LCU_CHROMA_SIZE]; + int lcu_chroma_width = LCU_WIDTH >> 1; + int c_pxl_x = x * lcu_chroma_width; + int c_pxl_y = y * lcu_chroma_width; + int c_x_max = MIN(c_pxl_x + lcu_chroma_width, frame->width >> 1) - c_pxl_x; + int c_y_max = MIN(c_pxl_y + lcu_chroma_width, frame->height >> 1) - c_pxl_y; + + if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) { + kvz_pixels_blit(&state->tile->frame->source->u[c_pxl_x + c_pxl_y * c_stride], chromau_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width); + kvz_pixels_blit(&state->tile->frame->source->v[c_pxl_x + c_pxl_y * c_stride], chromav_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width); + } + else { + for (int y = 0; y < lcu_chroma_width; y++) { + for (int x = 0; x < lcu_chroma_width; x++) { + int src_y = CLIP(0, (frame->height >> 1) - 1, c_pxl_y + y); + int src_x = CLIP(0, (frame->width >> 1) - 1, c_pxl_x + x); + chromau_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->u[src_y * c_stride + src_x]; + chromav_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->v[src_y * c_stride + src_x]; + } + } + } + lcu_var += kvz_pixel_var(chromau_tmp, LCU_CHROMA_SIZE); + lcu_var += kvz_pixel_var(chromav_tmp, LCU_CHROMA_SIZE); + } + + state->frame->aq_offsets[id] = d * (log(lcu_var) - log(frame_var)); + id++; + } + } + } + // Variance adaptive quantization - END + // 
Use this flag to handle closed gop irap picture selection. // If set to true, irap is already set and we avoid // setting it based on the intra period bool is_closed_normal_gop = false; + encoder_state_t *previous = state->previous_encoder_state; + int owf = MIN(state->encoder_control->cfg.owf, state->frame->num);
kvazaar-1.3.0.tar.gz/src/encoderstate.h -> kvazaar-2.0.0.tar.gz/src/encoderstate.h
Changed
@@ -39,6 +39,7 @@ #include "videoframe.h" #include "extras/crypto.h" +struct kvz_rc_data; typedef enum { ENCODER_STATE_TYPE_INVALID = 'i', @@ -53,9 +54,13 @@ //! \brief Number of bits that were spent uint32_t bits; + uint32_t pixels; + //! \brief Weight of the LCU for rate control double weight; + double original_weight; + //! \brief Lambda value which was used for this LCU double lambda; @@ -64,6 +69,11 @@ //! \brief Rate control beta parameter double rc_beta; + double distortion; + int i_cost; + + int8_t qp; + uint8_t skipped; } lcu_stats_t; @@ -111,6 +121,9 @@ //! Number of bits written in the current GOP. uint64_t cur_gop_bits_coded; + //! Number of bits written in the current frame. + uint64_t cur_frame_bits_coded; + //! Number of bits targeted for the current GOP. double cur_gop_target_bits; @@ -141,11 +154,27 @@ */ lcu_stats_t *lcu_stats; + pthread_mutex_t rc_lock; + + struct kvz_rc_data *new_ratecontrol; + + struct encoder_state_t const *previous_layer_state; + + /** + * \brief Calculated adaptive QP offset for each LCU. + */ + double *aq_offsets; + /** * \brief Whether next NAL is the first NAL in the access unit. */ bool first_nal; + double icost; + double remaining_weight; + double i_bits_left; + double *c_para; + double *k_para; } encoder_state_config_frame_t; typedef struct encoder_state_config_tile_t { @@ -236,7 +265,7 @@ //Pointer to the encoder_state of the previous frame struct encoder_state_t *previous_encoder_state; - + encoder_state_config_frame_t *frame; encoder_state_config_tile_t *tile; encoder_state_config_slice_t *slice; @@ -288,6 +317,11 @@ //Jobs to wait for threadqueue_job_t * tqj_recon_done; //Reconstruction is done threadqueue_job_t * tqj_bitstream_written; //Bitstream is written + + //Constraint structure + void * constraint; + + } encoder_state_t; void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame);
kvazaar-1.3.0.tar.gz/src/global.h -> kvazaar-2.0.0.tar.gz/src/global.h
Changed
@@ -206,7 +206,7 @@ // NOTE: When making a release, check to see if incrementing libversion in // configure.ac is necessary. #ifndef KVZ_VERSION -#define KVZ_VERSION 1.3.0 +#define KVZ_VERSION 2.0.0 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)
kvazaar-2.0.0.tar.gz/src/gop.h
Added
@@ -0,0 +1,400 @@ +#ifndef GOP_H_ +#define GOP_H_ +/***************************************************************************** +* This file is part of Kvazaar HEVC encoder. +* +* Copyright (C) 2018 Tampere University of Technology and others (see +* COPYING file). +* +* Kvazaar is free software: you can redistribute it and/or modify it under +* the terms of the GNU Lesser General Public License as published by the +* Free Software Foundation; either version 2.1 of the License, or (at your +* option) any later version. +* +* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY +* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +* more details. +* +* You should have received a copy of the GNU General Public License along +* with Kvazaar. If not, see <http://www.gnu.org/licenses/>. +****************************************************************************/ + +#include <kvazaar.h> + + +static const kvz_gop_config kvz_gop_lowdelay4[4] = { + { + .poc_offset = 1, + .layer = 1, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -6.5, + .qp_model_scale = 0.2590, + .is_ref = 1, + .ref_neg_count = 4, + .ref_neg = { 1, 5, 9, 13 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 2, + .layer = 1, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -6.5, + .qp_model_scale = 0.2590, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 1, 2, 6, 10 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 3, + .layer = 1, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -6.5, + .qp_model_scale = 0.2590, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 1, 3, 7, 11 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 4, + .layer = 1, + .qp_offset = 1, + .qp_factor = 1.0, + .qp_model_offset = 0.0, + .qp_model_scale = 0.0, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 1, 4, 8, 12 
}, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, +}; + + +static const kvz_gop_config kvz_gop_ra8[8] = { + { + .poc_offset = 8, + .layer = 1, + .qp_offset = 0, + .qp_factor = 1.0, + .qp_model_offset = 0.0, + .qp_model_scale = 0.0, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 8, 12, 16 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 4, + .layer = 2, + .qp_offset = 3, + .qp_factor = 1.0, + .qp_model_offset = -6.25, + .qp_model_scale = 0.25, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 4, 8 }, + .ref_pos_count = 1, + .ref_pos = { 4 }, + }, + { + .poc_offset = 2, + .layer = 3, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -6.25, + .qp_model_scale = 0.25, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 6 }, + .ref_pos_count = 2, + .ref_pos = { 2, 6 }, + }, + { + .poc_offset = 1, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 1, + .ref_neg = { 1 }, + .ref_pos_count = 3, + .ref_pos = { 1, 3, 7 }, + }, + { + .poc_offset = 3, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 3 }, + .ref_pos_count = 2, + .ref_pos = { 1, 5 }, + }, + { + .poc_offset = 6, + .layer = 3, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -6.25, + .qp_model_scale = 0.25, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 6 }, + .ref_pos_count = 1, + .ref_pos = { 2 }, + }, + { + .poc_offset = 5, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 5 }, + .ref_pos_count = 2, + .ref_pos = { 1, 3 }, + }, + { + .poc_offset = 7, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 3, + .ref_neg = { 1, 3, 7 }, + .ref_pos_count = 1, + .ref_pos = 
{ 1 }, + }, +}; + +static const kvz_gop_config kvz_gop_ra16[16] = { + { + .poc_offset = 16, + .layer = 1, + .qp_offset = 1, + .qp_factor = 1.0, + .qp_model_offset = 0.0, + .qp_model_scale = 0.0, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 16, 24, 32 },
kvazaar-1.3.0.tar.gz/src/input_frame_buffer.c -> kvazaar-2.0.0.tar.gz/src/input_frame_buffer.c
Changed
@@ -43,15 +43,18 @@ * * The caller must not modify img_in after calling this function. * - * \param buf an input frame buffer - * \param state a main encoder state - * \param img_in input frame or NULL + * \param buf an input frame buffer + * \param state a main encoder state + * \param img_in input frame or NULL + * \param first_done whether the first frame has been done, + * needed for the OBA rc * \return pointer to the next picture, or NULL if no picture is * available */ kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf, encoder_state_t *const state, - kvz_picture *const img_in) + kvz_picture *const img_in, + int first_done) { const encoder_control_t* const encoder = state->encoder_control; const kvz_config* const cfg = &encoder->cfg; @@ -82,7 +85,7 @@ buf->num_out++; return kvz_image_copy_ref(img_in); } - + if (img_in != NULL) { // Index of the next input picture, in range [-1, +inf). Values // i and j refer to the same indices in buf->pic_buffer iff @@ -140,7 +143,7 @@ dts_out = buf->pts_buffer[gop_buf_size - 1] + buf->delay; gop_offset = 0; // highest quality picture - } else { + } else if(first_done) { gop_offset = (buf->num_out - 1) % cfg->gop_len; // For closed gop, calculate the gop_offset again @@ -183,6 +186,9 @@ dts_out = buf->pts_buffer[dts_idx % gop_buf_size]; } } + else { + return NULL; + } // Index in buf->pic_buffer and buf->pts_buffer. int buf_idx = (idx_out + gop_buf_size) % gop_buf_size;
kvazaar-1.3.0.tar.gz/src/input_frame_buffer.h -> kvazaar-2.0.0.tar.gz/src/input_frame_buffer.h
Changed
@@ -66,6 +66,7 @@
 kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf,
                                     struct encoder_state_t *const state,
-                                    struct kvz_picture *const img_in);
+                                    struct kvz_picture *const img_in,
+                                    int first_done);
 
 #endif // INPUT_FRAME_BUFFER_H_
kvazaar-1.3.0.tar.gz/src/inter.c -> kvazaar-2.0.0.tar.gz/src/inter.c
Changed
@@ -301,15 +301,17 @@ /** * \brief Reconstruct an inter PU using uniprediction. * - * \param state encoder state - * \param ref picture to copy the data from - * \param xpos PU x position - * \param ypos PU y position - * \param width PU width - * \param height PU height - * \param mv_param motion vector - * \param lcu destination lcu - * \param hi_prec_out destination of high precision output, or NULL if not needed + * \param state encoder state + * \param ref picture to copy the data from + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height + * \param mv_param motion vector + * \param lcu destination lcu + * \param hi_prec_out destination of high precision output, or NULL if not needed + * \param predict_luma Enable or disable luma prediction for this call. + * \param predict_chroma Enable or disable chroma prediction for this call. */ static void inter_recon_unipred(const encoder_state_t * const state, const kvz_picture * const ref, @@ -319,7 +321,9 @@ int32_t height, const int16_t mv_param[2], lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) + hi_prec_buf_t *hi_prec_out, + bool predict_luma, + bool predict_chroma) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -340,38 +344,43 @@ const int8_t fractional_luma = ((mv_param[0] & 3) || (mv_param[1] & 3)); // Generate prediction for luma. - if (fractional_luma) { - // With a fractional MV, do interpolation. - if (state->encoder_control->cfg.bipred && hi_prec_out) { - inter_recon_14bit_frac_luma(state, ref, - pu_in_tile.x, pu_in_tile.y, - width, height, - mv_param, hi_prec_out); - } else { - inter_recon_frac_luma(state, ref, - pu_in_tile.x, pu_in_tile.y, - width, height, - mv_param, lcu); + if (predict_luma) { + if (fractional_luma) { + // With a fractional MV, do interpolation. 
+ if (state->encoder_control->cfg.bipred && hi_prec_out) { + inter_recon_14bit_frac_luma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, hi_prec_out); + } + else { + inter_recon_frac_luma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, lcu); + } } - } else { - // With an integer MV, copy pixels directly from the reference. - const int lcu_pu_index = pu_in_lcu.y * LCU_WIDTH + pu_in_lcu.x; - if (mv_is_outside_frame) { - inter_cp_with_ext_border(ref->y, ref->width, - ref->width, ref->height, - &lcu->rec.y[lcu_pu_index], LCU_WIDTH, - width, height, - &mv_in_frame); - } else { - const int frame_mv_index = mv_in_frame.y * ref->width + mv_in_frame.x; - kvz_pixels_blit(&ref->y[frame_mv_index], - &lcu->rec.y[lcu_pu_index], - width, height, - ref->width, LCU_WIDTH); + else { + // With an integer MV, copy pixels directly from the reference. + const int lcu_pu_index = pu_in_lcu.y * LCU_WIDTH + pu_in_lcu.x; + if (mv_is_outside_frame) { + inter_cp_with_ext_border(ref->y, ref->width, + ref->width, ref->height, + &lcu->rec.y[lcu_pu_index], LCU_WIDTH, + width, height, + &mv_in_frame); + } + else { + const int frame_mv_index = mv_in_frame.y * ref->width + mv_in_frame.x; + kvz_pixels_blit(&ref->y[frame_mv_index], + &lcu->rec.y[lcu_pu_index], + width, height, + ref->width, LCU_WIDTH); + } } } - if (state->encoder_control->chroma_format == KVZ_CSP_400) { + if (!predict_chroma) { return; } @@ -422,15 +431,17 @@ /** * \brief Reconstruct bi-pred inter PU * - * \param state encoder state - * \param ref1 reference picture to copy the data from - * \param ref2 other reference picture to copy the data from - * \param xpos PU x position - * \param ypos PU y position - * \param width PU width - * \param height PU height - * \param mv_param motion vectors - * \param lcu destination lcu + * \param state encoder state + * \param ref1 reference picture to copy the data from + * \param ref2 other reference picture to copy the data from + * \param xpos PU 
x position + * \param ypos PU y position + * \param width PU width + * \param height PU height + * \param mv_param motion vectors + * \param lcu destination lcu + * \param predict_luma Enable or disable luma prediction for this call. + * \param predict_chroma Enable or disable chroma prediction for this call. */ void kvz_inter_recon_bipred(const encoder_state_t * const state, const kvz_picture * ref1, @@ -440,7 +451,9 @@ int32_t width, int32_t height, int16_t mv_param[2][2], - lcu_t* lcu) + lcu_t* lcu, + bool predict_luma, + bool predict_chroma) { kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; @@ -459,7 +472,8 @@ //Reconstruct both predictors - inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); + inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0, + predict_luma, predict_chroma); if (!hi_prec_luma_rec0){ memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); // copy to temp_lcu_y } @@ -467,10 +481,15 @@ memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_u memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_v } - inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); + inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1, + predict_luma, predict_chroma); // After reconstruction, merge the predictors by taking an average of each pixel - kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); + kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, + hi_prec_chroma_rec0, hi_prec_chroma_rec1, + height, width, ypos, xpos, + high_precision_rec0, high_precision_rec1, + lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v, 
predict_luma, predict_chroma); if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0); if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); @@ -488,54 +507,87 @@ * \param x x-coordinate of the CU in pixels * \param y y-coordinate of the CU in pixels * \param width CU width + * \param predict_luma Enable or disable luma prediction for this call. + * \param predict_chroma Enable or disable chroma prediction for this call. */ void kvz_inter_recon_cu(const encoder_state_t * const state, lcu_t *lcu, int32_t x, int32_t y, - int32_t width) + int32_t width, + bool predict_luma, + bool predict_chroma) { cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - const int num_pu = kvz_part_mode_num_parts[cu->part_size]; for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cu->part_size, width, x, i); - const int pu_y = PU_GET_Y(cu->part_size, width, y, i); - const int pu_w = PU_GET_W(cu->part_size, width, i); - const int pu_h = PU_GET_H(cu->part_size, width, i);
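The hunk above threads the new `predict_luma`/`predict_chroma` flags through bi-prediction and ends by blending the two uni-directional predictions; per the comment in the diff, the merge "takes an average of each pixel". A minimal standalone sketch of that blend step (the helper name is ours; the real `kvz_inter_recon_bipred_blend` additionally handles the 14-bit high-precision buffers used with fractional motion vectors):

```c
#include <stdint.h>

typedef uint8_t kvz_pixel; /* kvazaar's 8-bit pixel type */

/* Illustrative blend: after both uni-directional predictions have been
 * reconstructed, bi-prediction merges them with a rounded per-pixel
 * average. This is a sketch of the final step only, not kvazaar's
 * kvz_inter_recon_bipred_blend. */
static void bipred_blend_sketch(const kvz_pixel *pred0,
                                const kvz_pixel *pred1,
                                kvz_pixel *out,
                                int width, int height, int stride)
{
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int i = y * stride + x;
      /* Rounded mean of the two predictors. */
      out[i] = (kvz_pixel)((pred0[i] + pred1[i] + 1) >> 1);
    }
  }
}
```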
kvazaar-1.3.0.tar.gz/src/inter.h -> kvazaar-2.0.0.tar.gz/src/inter.h
Changed
@@ -44,7 +44,18 @@
                         lcu_t *lcu,
                         int32_t x,
                         int32_t y,
-                        int32_t width);
+                        int32_t width,
+                        bool predict_luma,
+                        bool predict_chroma);
+
+void kvz_inter_pred_pu(const encoder_state_t * const state,
+                       lcu_t *lcu,
+                       int32_t x,
+                       int32_t y,
+                       int32_t width,
+                       bool predict_luma,
+                       bool predict_chroma,
+                       int i_pu);
 
 void kvz_inter_recon_bipred(const encoder_state_t * const state,
                             const kvz_picture * ref1,
@@ -54,7 +65,9 @@
                             int32_t width,
                             int32_t height,
                             int16_t mv_param[2][2],
-                            lcu_t* lcu);
+                            lcu_t* lcu,
+                            bool predict_luma,
+                            bool predict_chroma);
 
 void kvz_inter_get_mv_cand(const encoder_state_t * const state,
kvazaar-1.3.0.tar.gz/src/intra.c -> kvazaar-2.0.0.tar.gz/src/intra.c
Changed
@@ -237,47 +237,6 @@ } -/** -* \brief Generage intra DC prediction with post filtering applied. -* \param log2_width Log2 of width, range 2..5. -* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. -* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. -* \param dst Buffer of size width*width. -*/ -static void intra_pred_filtered_dc( - const int_fast8_t log2_width, - const kvz_pixel *const ref_top, - const kvz_pixel *const ref_left, - kvz_pixel *const out_block) -{ - assert(log2_width >= 2 && log2_width <= 5); - - const int_fast8_t width = 1 << log2_width; - - int_fast16_t sum = 0; - for (int_fast8_t i = 0; i < width; ++i) { - sum += ref_top[i + 1]; - sum += ref_left[i + 1]; - } - - const kvz_pixel dc_val = (sum + width) >> (log2_width + 1); - - // Filter top-left with ([1 2 1] / 4) - out_block[0] = (ref_left[1] + 2 * dc_val + ref_top[1] + 2) / 4; - - // Filter rest of the boundary with ([1 3] / 4) - for (int_fast8_t x = 1; x < width; ++x) { - out_block[x] = (ref_top[x + 1] + 3 * dc_val + 2) / 4; - } - for (int_fast8_t y = 1; y < width; ++y) { - out_block[y * width] = (ref_left[y + 1] + 3 * dc_val + 2) / 4; - for (int_fast8_t x = 1; x < width; ++x) { - out_block[y * width + x] = dc_val; - } - } -} - - void kvz_intra_predict( kvz_intra_references *refs, int_fast8_t log2_width, @@ -314,7 +273,7 @@ } else if (mode == 1) { // Do extra post filtering for edge pixels of luma DC mode. 
if (color == COLOR_Y && width < 32) { - intra_pred_filtered_dc(log2_width, used_ref->top, used_ref->left, dst); + kvz_intra_pred_filtered_dc(log2_width, used_ref->top, used_ref->left, dst); } else { intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst); } @@ -665,7 +624,18 @@ cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } + // Reset CBFs because CBFs might have been set + // for depth earlier + if (mode_luma >= 0) { + cbf_clear(&cur_cu->cbf, depth, COLOR_Y); + } + if (mode_chroma >= 0) { + cbf_clear(&cur_cu->cbf, depth, COLOR_U); + cbf_clear(&cur_cu->cbf, depth, COLOR_V); + } + if (depth == 0 || cur_cu->tr_depth > depth) { + const int offset = width / 2; const int32_t x2 = x + offset; const int32_t y2 = y + offset; @@ -682,7 +652,7 @@ LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, }; - if (mode_luma != -1 && depth < MAX_DEPTH) { + if (mode_luma != -1 && depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); } if (mode_chroma != -1 && depth <= MAX_DEPTH) { @@ -701,6 +671,6 @@ intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_V); } - kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu); + kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false); } }
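For reference, the DC-with-post-filtering logic that this hunk deletes from intra.c (2.0.0 calls it through the strategy layer as `kvz_intra_pred_filtered_dc` instead) can be restated as a standalone function. The arithmetic below is taken directly from the removed body; only the local type alias and function name differ:

```c
#include <assert.h>
#include <stdint.h>

typedef uint8_t kvz_pixel; /* kvazaar's 8-bit pixel type */

/* Intra DC prediction with edge filtering, restated from the function
 * removed in this diff. ref_top / ref_left point to the -1 index of the
 * reference rows (length width*2+1), as in the original. */
static void filtered_dc(int log2_width, const kvz_pixel *ref_top,
                        const kvz_pixel *ref_left, kvz_pixel *out)
{
  assert(log2_width >= 2 && log2_width <= 5);
  const int width = 1 << log2_width;

  /* DC value: rounded mean of the top and left reference pixels. */
  int sum = 0;
  for (int i = 0; i < width; ++i) {
    sum += ref_top[i + 1];
    sum += ref_left[i + 1];
  }
  const kvz_pixel dc = (kvz_pixel)((sum + width) >> (log2_width + 1));

  /* Top-left corner is filtered with ([1 2 1] / 4). */
  out[0] = (ref_left[1] + 2 * dc + ref_top[1] + 2) / 4;

  /* First row and first column are filtered with ([1 3] / 4);
   * all remaining pixels are plain DC. */
  for (int x = 1; x < width; ++x) {
    out[x] = (ref_top[x + 1] + 3 * dc + 2) / 4;
  }
  for (int y = 1; y < width; ++y) {
    out[y * width] = (ref_left[y + 1] + 3 * dc + 2) / 4;
    for (int x = 1; x < width; ++x) {
      out[y * width + x] = dc;
    }
  }
}
```

As the condition in the hunk shows, this filtered variant is applied only to luma DC blocks narrower than 32 pixels.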
kvazaar-1.3.0.tar.gz/src/kvazaar.c -> kvazaar-2.0.0.tar.gz/src/kvazaar.c
Changed
@@ -38,6 +38,7 @@ #include "strategyselector.h" #include "threadqueue.h" #include "videoframe.h" +#include "rate_control.h" static void kvazaar_close(kvz_encoder *encoder) @@ -53,7 +54,8 @@ kvz_picture *pic = NULL; while ((pic = kvz_encoder_feed_frame(&encoder->input_buffer, &encoder->states[0], - NULL)) != NULL) { + NULL, + 1)) != NULL) { kvz_image_free(pic); pic = NULL; } @@ -64,6 +66,7 @@ } FREE_POINTER(encoder->states); + kvz_free_rc_data(); // Discard const from the pointer. kvz_encoder_control_free((void*) encoder->control); encoder->control = NULL; @@ -99,6 +102,11 @@ encoder->frames_started = 0; encoder->frames_done = 0; + // Assure that the rc data allocation was successful + if(!kvz_get_rc_data(encoder->control)) { + goto kvazaar_open_failure; + } + kvz_init_input_frame_buffer(&encoder->input_buffer); encoder->states = calloc(encoder->num_encoder_states, sizeof(encoder_state_t)); @@ -108,7 +116,6 @@ for (unsigned i = 0; i < encoder->num_encoder_states; ++i) { encoder->states[i].encoder_control = encoder->control; - if (!kvz_encoder_state_init(&encoder->states[i], NULL)) { goto kvazaar_open_failure; } @@ -246,7 +253,10 @@ CHECKPOINT_MARK("read source frame: %d", state->frame->num + enc->control->cfg.seek); } - kvz_picture* frame = kvz_encoder_feed_frame(&enc->input_buffer, state, pic_in); + kvz_picture* frame = kvz_encoder_feed_frame( + &enc->input_buffer, state, pic_in, + enc->frames_done || state->encoder_control->cfg.rc_algorithm != KVZ_OBA + ); if (frame) { assert(state->frame->num == enc->frames_started); // Start encoding. 
@@ -265,8 +275,9 @@ } encoder_state_t *output_state = &enc->states[enc->out_state_num]; - if (!output_state->frame->done && - (pic_in == NULL || enc->cur_state_num == enc->out_state_num)) { + if ((!output_state->frame->done && + (pic_in == NULL || enc->cur_state_num == enc->out_state_num)) || + (state->frame->num == 0 && state->encoder_control->cfg.rc_algorithm == KVZ_OBA)) { kvz_threadqueue_waitfor(enc->control->threadqueue, output_state->tqj_bitstream_written); // The job pointer must be set to NULL here since it won't be usable after
kvazaar-1.3.0.tar.gz/src/kvazaar.h -> kvazaar-2.0.0.tar.gz/src/kvazaar.h
Changed
@@ -64,6 +64,11 @@ */ #define KVZ_MAX_GOP_LENGTH 32 + /** + * Maximum amount of GoP layers. + */ +#define KVZ_MAX_GOP_LAYERS 6 + /** * Size of data chunks. */ @@ -213,6 +218,12 @@ KVZ_SCALING_LIST_DEFAULT = 2, }; +enum kvz_rc_algorithm +{ + KVZ_NO_RC = 0, + KVZ_LAMBDA = 1, + KVZ_OBA = 2, +}; // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -229,6 +240,8 @@ int8_t ref_pos[16]; /*!< \brief reference picture offset list */ int8_t ref_neg_count;/*!< \brief Reference picture count */ int8_t ref_neg[16]; /*!< \brief reference picture offset list */ + double qp_model_offset; + double qp_model_scale; } kvz_gop_config; /** @@ -306,8 +319,8 @@ int32_t cpuid; struct { - int32_t min; - int32_t max; + int32_t min[KVZ_MAX_GOP_LAYERS]; + int32_t max[KVZ_MAX_GOP_LAYERS]; } pu_depth_inter, pu_depth_intra; int32_t add_encoder_info; @@ -372,6 +385,11 @@ /** \brief Maximum steps that hexagonal and diagonal motion estimation can use. -1 to disable */ uint32_t me_max_steps; + /** \brief Offset to add to QP for intra frames */ + int8_t intra_qp_offset; + /** \brief Select intra QP Offset based on GOP length */ + uint8_t intra_qp_offset_auto; + /** \brief Minimum QP that uses CABAC for residual cost instead of a fast estimate. */ int8_t fast_residual_cost_limit; @@ -381,6 +399,8 @@ /** \brief Flag to enable/disable open GOP configuration */ int8_t open_gop; + int32_t vaq; /** \brief Enable variance adaptive quantization*/ + /** \brief Type of scaling lists to use */ int8_t scaling_list; @@ -390,6 +410,30 @@ /** \brief Enable Early Skip Mode Decision */ uint8_t early_skip; + /** \brief Enable Machine learning CU depth prediction for Intra encoding. 
*/ + uint8_t ml_pu_depth_intra; + + /** \brief Used for partial frame encoding*/ + struct { + uint8_t startCTU_x; + uint8_t startCTU_y; + uint16_t fullWidth; + uint16_t fullHeight; + } partial_coding; + + /** \brief Always consider CU without any quantized residual */ + uint8_t zero_coeff_rdo; + + /** \brief Currently unused parameter for OBA rc */ + int8_t frame_allocation; + + /** \brief used rc scheme, 0 for QP */ + int8_t rc_algorithm; + + /** \brief whether to use hadamard based bit allocation for intra frames or not */ + uint8_t intra_bit_allocation; + + uint8_t clip_neighbour; } kvz_config; /**
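Among the new config fields, `rc_algorithm` selects one of the `kvz_rc_algorithm` values added earlier in this diff. A hypothetical mapping from the `--rc-algorithm` option value to the enum — the changelog confirms the string `oba`; the other option strings and the helper itself are assumptions, and kvazaar's real parsing lives in its option-handling code:

```c
#include <string.h>

/* Mirrors the enum added to kvazaar.h in this diff. */
enum kvz_rc_algorithm {
  KVZ_NO_RC  = 0,
  KVZ_LAMBDA = 1,
  KVZ_OBA    = 2,
};

/* Hypothetical option parser: "oba" is documented in the changelog;
 * the "lambda" spelling is a guess for the older rc scheme. Unknown
 * values fall back to no rate control (QP mode). */
static enum kvz_rc_algorithm parse_rc_algorithm(const char *name)
{
  if (strcmp(name, "oba") == 0)    return KVZ_OBA;
  if (strcmp(name, "lambda") == 0) return KVZ_LAMBDA;
  return KVZ_NO_RC;
}
```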
kvazaar-2.0.0.tar.gz/src/ml_classifier_intra_depth_pred.c
Added
@@ -0,0 +1,808 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_classifier_intra_depth_pred.h" + + +int tree_predict_merge_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 140.3129) + { + if (p_features->var_of_sub_var <= 569.6553) + { + if (p_features->merge_variance <= 20.8854) + { + *p_nb_iter = 19428.0; + *p_nb_bad = 1740.0; + return -1.0000; + } + else if (p_features->sub_variance_0 <= 9.1015) + { + if (p_features->merge_variance <= 39.132) + { + *p_nb_iter = 1166.0; + *p_nb_bad = 358.0; + return -1.0000; + } + else { + *p_nb_iter = 1049.0; + *p_nb_bad = 392.0; + return 1.0000; + } + } + else { + *p_nb_iter = 9371.0; + *p_nb_bad = 1805.0; + return -1.0000; + } + } + else if (p_features->sub_variance_2 <= 23.3193) + { + *p_nb_iter = 1059.0; + *p_nb_bad = 329.0; + return 1.0000; + } + else if (p_features->sub_variance_1 <= 30.7348) + { + *p_nb_iter = 1042.0; + *p_nb_bad = 395.0; + return 1.0000; + } + else { + *p_nb_iter = 1756.0; + *p_nb_bad = 588.0; + return -1.0000; + } + } + else 
if (p_features->merge_variance <= 857.8047) + { + if (p_features->var_of_sub_var <= 66593.5553) + { + if (p_features->sub_variance_0 <= 12.1697) + { + *p_nb_iter = 2006.0; + *p_nb_bad = 374.0; + return 1.0000; + } + else if (p_features->neigh_variance_C <= 646.8204) + { + if (p_features->neigh_variance_A <= 664.7609) + { + if (p_features->neigh_variance_B <= 571.2004) + { + if (p_features->var_of_sub_mean <= 4.1069) + { + *p_nb_iter = 1208.0; + *p_nb_bad = 399.0; + return 1.0000; + } + else if (p_features->var_of_sub_var <= 11832.6635) + { + *p_nb_iter = 8701.0; + *p_nb_bad = 3037.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 142.298) + { + *p_nb_iter = 1025.0; + *p_nb_bad = 290.0; + return 1.0000; + } + else if (p_features->variance <= 394.4839) + { + *p_nb_iter = 1156.0; + *p_nb_bad = 489.0; + return 1.0000; + } + else { + *p_nb_iter = 1150.0; + *p_nb_bad = 503.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1777.0; + *p_nb_bad = 558.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1587.0; + *p_nb_bad = 411.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1980.0; + *p_nb_bad = 474.0; + return 1.0000; + } + } + else { + *p_nb_iter = 3613.0; + *p_nb_bad = 475.0; + return 1.0000; + } + } + else { + *p_nb_iter = 20926.0; + *p_nb_bad = 1873.0; + return 1.0000; + } +} + + + +int tree_predict_merge_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 119.4611) + { + if (p_features->var_of_sub_var <= 1078.0638) + { + if (p_features->neigh_variance_B <= 70.2189) + { + *p_nb_iter = 29253.0; + *p_nb_bad = 3837.0; + return -1.0000; + } + else if (p_features->variance <= 20.8711) + { + *p_nb_iter = 1292.0; + *p_nb_bad = 458.0; + return 2.0000; + } + else { + *p_nb_iter = 1707.0; + *p_nb_bad = 399.0; + return -1.0000; + } + } + else if (p_features->var_of_sub_var <= 3300.4034) + { + *p_nb_iter = 1554.0; + *p_nb_bad = 675.0; + return -1.0000; + } + else { + *p_nb_iter = 1540.0; + *p_nb_bad = 
429.0; + return 2.0000; + } + } + else if (p_features->merge_variance <= 696.1989) + { + if (p_features->var_of_sub_var <= 31803.3242) + { + if (p_features->sub_variance_2 <= 10.3845) + { + *p_nb_iter = 3473.0; + *p_nb_bad = 768.0; + return 2.0000; + } + else if (p_features->neigh_variance_C <= 571.5329)
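Each of these machine-generated trees returns a class label and reports, through `p_nb_iter` and `p_nb_bad`, how many training samples reached the chosen leaf and how many of those were misclassified. A caller could turn those counters into a confidence score; a sketch (the helper name and its use as a confidence measure are our assumptions, only the counter semantics come from the leaves above):

```c
/* Confidence of a decision-tree leaf: fraction of training samples at
 * the leaf that were classified correctly. nb_iter / nb_bad correspond
 * to the values written via p_nb_iter / p_nb_bad by the trees above. */
static double leaf_confidence(double nb_iter, double nb_bad)
{
  return nb_iter > 0.0 ? 1.0 - nb_bad / nb_iter : 0.0;
}
```

For instance, the first leaf of `tree_predict_merge_depth_1` (19428 samples, 1740 misclassified) would score roughly 0.91.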
kvazaar-2.0.0.tar.gz/src/ml_classifier_intra_depth_pred.h
Added
@@ -0,0 +1,38 @@
+#ifndef ML_CLASSIFIER_INTRA_DEPTH_PRED
+#define ML_CLASSIFIER_INTRA_DEPTH_PRED
+
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "ml_intra_cu_depth_pred.h"
+
+
+int tree_predict_merge_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+int tree_predict_merge_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+int tree_predict_merge_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+int tree_predict_merge_depth_4(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+
+
+int tree_predict_split_depth_0(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+int tree_predict_split_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+int tree_predict_split_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+int tree_predict_split_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad);
+
+#endif
\ No newline at end of file
kvazaar-2.0.0.tar.gz/src/ml_intra_cu_depth_pred.c
Added
@@ -0,0 +1,1744 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_intra_cu_depth_pred.h" + + +static int tree_predict_merge_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 140.3129) + { + if (p_features->var_of_sub_var <= 569.6553) + { + if (p_features->merge_variance <= 20.8854) + { + *p_nb_iter = 19428.0; + *p_nb_bad = 1740.0; + return -1.0000; + } + else if (p_features->sub_variance_0 <= 9.1015) + { + if (p_features->merge_variance <= 39.132) + { + *p_nb_iter = 1166.0; + *p_nb_bad = 358.0; + return -1.0000; + } + else { + *p_nb_iter = 1049.0; + *p_nb_bad = 392.0; + return 1.0000; + } + } + else { + *p_nb_iter = 9371.0; + *p_nb_bad = 1805.0; + return -1.0000; + } + } + else if (p_features->sub_variance_2 <= 23.3193) + { + *p_nb_iter = 1059.0; + *p_nb_bad = 329.0; + return 1.0000; + } + else if (p_features->sub_variance_1 <= 30.7348) + { + *p_nb_iter = 1042.0; + *p_nb_bad = 395.0; + return 1.0000; + } + else { + *p_nb_iter = 1756.0; + *p_nb_bad = 588.0; + return -1.0000; + } + } + else 
if (p_features->merge_variance <= 857.8047) + { + if (p_features->var_of_sub_var <= 66593.5553) + { + if (p_features->sub_variance_0 <= 12.1697) + { + *p_nb_iter = 2006.0; + *p_nb_bad = 374.0; + return 1.0000; + } + else if (p_features->neigh_variance_C <= 646.8204) + { + if (p_features->neigh_variance_A <= 664.7609) + { + if (p_features->neigh_variance_B <= 571.2004) + { + if (p_features->var_of_sub_mean <= 4.1069) + { + *p_nb_iter = 1208.0; + *p_nb_bad = 399.0; + return 1.0000; + } + else if (p_features->var_of_sub_var <= 11832.6635) + { + *p_nb_iter = 8701.0; + *p_nb_bad = 3037.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 142.298) + { + *p_nb_iter = 1025.0; + *p_nb_bad = 290.0; + return 1.0000; + } + else if (p_features->variance <= 394.4839) + { + *p_nb_iter = 1156.0; + *p_nb_bad = 489.0; + return 1.0000; + } + else { + *p_nb_iter = 1150.0; + *p_nb_bad = 503.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1777.0; + *p_nb_bad = 558.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1587.0; + *p_nb_bad = 411.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1980.0; + *p_nb_bad = 474.0; + return 1.0000; + } + } + else { + *p_nb_iter = 3613.0; + *p_nb_bad = 475.0; + return 1.0000; + } + } + else { + *p_nb_iter = 20926.0; + *p_nb_bad = 1873.0; + return 1.0000; + } +} + + + +static int tree_predict_merge_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 119.4611) + { + if (p_features->var_of_sub_var <= 1078.0638) + { + if (p_features->neigh_variance_B <= 70.2189) + { + *p_nb_iter = 29253.0; + *p_nb_bad = 3837.0; + return -1.0000; + } + else if (p_features->variance <= 20.8711) + { + *p_nb_iter = 1292.0; + *p_nb_bad = 458.0; + return 2.0000; + } + else { + *p_nb_iter = 1707.0; + *p_nb_bad = 399.0; + return -1.0000; + } + } + else if (p_features->var_of_sub_var <= 3300.4034) + { + *p_nb_iter = 1554.0; + *p_nb_bad = 675.0; + return -1.0000; + } + else { + *p_nb_iter = 1540.0; + 
*p_nb_bad = 429.0; + return 2.0000; + } + } + else if (p_features->merge_variance <= 696.1989) + { + if (p_features->var_of_sub_var <= 31803.3242) + { + if (p_features->sub_variance_2 <= 10.3845) + { + *p_nb_iter = 3473.0; + *p_nb_bad = 768.0; + return 2.0000; + } + else if (p_features->neigh_variance_C <= 571.5329)
kvazaar-2.0.0.tar.gz/src/ml_intra_cu_depth_pred.h
Added
@@ -0,0 +1,90 @@ +#ifndef ML_INTRA_CU_DEPTH_PRED_H_ +#define ML_INTRA_CU_DEPTH_PRED_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include <stdio.h> +#include "global.h" // IWYU pragma: keep + + + + +#define LCU_DEPTH_MAT_SIZE 64 +#define RESTRAINED_FLAG 1 + +#define pow2(x) ((x)*(x)) +#define CR_XMAX(x_px, block_size, width) (MIN((x_px) + (block_size), (width)) - (x_px)) +#define CR_YMAX(y_px, block_size, height) (MIN((y_px) + (block_size), (height)) - (y_px)) +#define CR_GET_X_LCU(lcu_id, nb_lcu_width) (((lcu_id) % (nb_lcu_width)) << 6) +#define CR_GET_Y_LCU(lcu_id, nb_lcu_width) (((lcu_id) / (nb_lcu_width)) << 6) +#define CR_GET_CU_D3(x, y, depth) ((x)*(1 << (3-depth)) + ((y) << (6 - depth))) +#define CR_GET_CU_D4(x, y, depth) ((x)*(1 << (4-depth)) + ((y) << (8 - depth))) +#define CR_GET_DEPTH_MIN(x, y, depth_min_mat) *(depth_min_mat + (x >> 3) + ((y >> 3) << 3)) +#define CR_GET_DEPTH_MAX(x, y, depth_max_mat) *(depth_max_mat + (x >> 3) + ((y >> 3) << 3)) + +typedef struct { + int32_t x; + int32_t y; +}vect_2D; + + + // Structure used for the CTU depth 
prediction using Machine Learning + // in All Intra +typedef struct { + /*!< Number of depth to add to the QT prediction in ''one-shot'' */ + int8_t i_nb_addDepth; + /*!< Apply an extra Upper Expansion in the upper_depth */ + bool b_extra_up_exp; + /*!< Matrix used to store the upper and lower QT prediction*/ + uint8_t* _mat_upper_depth; + uint8_t* _mat_lower_depth; +} ml_intra_ctu_pred_t; + + + +/* + * brief generic structure used for the features + * + */ +typedef struct { + double variance; + double merge_variance; + double sub_variance_0; + double sub_variance_1; + double sub_variance_2; + double sub_variance_3; + double neigh_variance_A; + double neigh_variance_B; + double neigh_variance_C; + double var_of_sub_mean; + int qp; + //int NB_pixels; + double var_of_sub_var; +}features_s; + + +typedef int (*tree_predict)(features_s*, double*, double*); + +ml_intra_ctu_pred_t* kvz_init_ml_intra_depth_const(void); +void kvz_end_ml_intra_depth_const(ml_intra_ctu_pred_t * ml_intra_depth_ctu); + +void kvz_lcu_luma_depth_pred(ml_intra_ctu_pred_t* ml_intra_depth_ctu, uint8_t* luma_px, int8_t qp); + +#endif \ No newline at end of file
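The new header addresses LCUs (64×64 coding tree units) by a linear index; `CR_GET_X_LCU`/`CR_GET_Y_LCU` recover the top-left pixel coordinates with a modulo/division against the frame width in LCUs, then a left shift by 6 (×64). A Python check of that arithmetic:

```python
LCU_WIDTH = 64  # an LCU is 64x64 luma pixels

def lcu_pixel_coords(lcu_id, nb_lcu_width):
    """Python equivalent of CR_GET_X_LCU / CR_GET_Y_LCU:
    linear LCU index -> top-left pixel coordinates."""
    x = (lcu_id % nb_lcu_width) << 6   # column * 64
    y = (lcu_id // nb_lcu_width) << 6  # row * 64
    return x, y

# A 1920-pixel-wide frame has 30 LCU columns; LCU #31 sits at row 1, column 1.
assert lcu_pixel_coords(31, 30) == (64, 64)
```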
kvazaar-1.3.0.tar.gz/src/rate_control.c -> kvazaar-2.0.0.tar.gz/src/rate_control.c
Changed
@@ -24,11 +24,15 @@ #include "encoder.h" #include "kvazaar.h" +#include "pthread.h" static const int SMOOTHING_WINDOW = 40; static const double MIN_LAMBDA = 0.1; static const double MAX_LAMBDA = 10000; +#define BETA1 1.2517 + +static kvz_rc_data *data; /** * \brief Clip lambda value to a valid range. @@ -38,6 +42,73 @@ return CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda); } +kvz_rc_data * kvz_get_rc_data(const encoder_control_t * const encoder) { + if (data != NULL || encoder == NULL) return data; + + data = calloc(1, sizeof(kvz_rc_data)); + + if (data == NULL) return NULL; + if (pthread_mutex_init(&data->ck_frame_lock, NULL) != 0) return NULL; + if (pthread_mutex_init(&data->lambda_lock, NULL) != 0) return NULL; + if (pthread_mutex_init(&data->intra_lock, NULL) != 0) return NULL; + for (int (i) = 0; (i) < KVZ_MAX_GOP_LAYERS; ++(i)) { + if (pthread_rwlock_init(&data->ck_ctu_lock[i], NULL) != 0) return NULL; + } + + const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu; + + for (int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) { + data->c_para[i] = malloc(sizeof(double) * num_lcus); + if (data->c_para[i] == NULL) return NULL; + + data->k_para[i] = malloc(sizeof(double) * num_lcus); + if (data->k_para[i] == NULL) return NULL; + + data->pic_c_para[i] = 5.0; + data->pic_k_para[i] = -0.1; + + for (int j = 0; j < num_lcus; j++) { + data->c_para[i][j] = 5.0; + data->k_para[i][j] = -0.1; + } + } + data->intra_bpp = calloc(num_lcus, sizeof(double)); + if (data->intra_bpp == NULL) return NULL; + data->intra_dis = calloc(num_lcus, sizeof(double)); + if (data->intra_dis == NULL) return NULL; + + memset(data->previous_lambdas, 0, sizeof(data->previous_lambdas)); + + data->previous_frame_lambda = 0.0; + + data->intra_pic_bpp = 0.0; + data->intra_pic_distortion = 0.0; + + data->intra_alpha = 6.7542000000000000; + data->intra_beta = 1.7860000000000000; + return data; +} + +void kvz_free_rc_data() { + if (data == NULL) return; + + pthread_mutex_destroy(&data->ck_frame_lock); + 
pthread_mutex_destroy(&data->lambda_lock); + pthread_mutex_destroy(&data->intra_lock); + for (int i = 0; i < KVZ_MAX_GOP_LAYERS; ++i) { + pthread_rwlock_destroy(&data->ck_ctu_lock[i]); + } + + if (data->intra_bpp) FREE_POINTER(data->intra_bpp); + if (data->intra_dis) FREE_POINTER(data->intra_dis); + for (int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) { + if (data->c_para[i]) FREE_POINTER(data->c_para[i]); + if (data->k_para[i]) FREE_POINTER(data->k_para[i]); + } + FREE_POINTER(data); +} + + /** * \brief Update alpha and beta parameters. * @@ -95,6 +166,96 @@ return MAX(200, gop_target_bits); } +static int xCalcHADs8x8_ISlice(kvz_pixel * piOrg, int y, int iStrideOrg) +{ + piOrg += y * iStrideOrg; + int i, j; + int diff[64], m1[8][8], m2[8][8], m3[8][8], iSumHad = 0; + + for (int k = 0; k < 64; k += 8) { + diff[k + 0] = piOrg[0]; + diff[k + 1] = piOrg[1]; + diff[k + 2] = piOrg[2]; + diff[k + 3] = piOrg[3]; + diff[k + 4] = piOrg[4]; + diff[k + 5] = piOrg[5]; + diff[k + 6] = piOrg[6]; + diff[k + 7] = piOrg[7]; + + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) { + int jj = j << 3; + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + 
//vertical + for (i = 0; i < 8; i++) { + m3[0][i] = m2[0][i] + m2[4][i]; + m3[1][i] = m2[1][i] + m2[5][i]; + m3[2][i] = m2[2][i] + m2[6][i]; + m3[3][i] = m2[3][i] + m2[7][i]; + m3[4][i] = m2[0][i] - m2[4][i]; + m3[5][i] = m2[1][i] - m2[5][i]; + m3[6][i] = m2[2][i] - m2[6][i]; + m3[7][i] = m2[3][i] - m2[7][i]; + + m1[0][i] = m3[0][i] + m3[2][i]; + m1[1][i] = m3[1][i] + m3[3][i]; + m1[2][i] = m3[0][i] - m3[2][i]; + m1[3][i] = m3[1][i] - m3[3][i]; + m1[4][i] = m3[4][i] + m3[6][i]; + m1[5][i] = m3[5][i] + m3[7][i]; + m1[6][i] = m3[4][i] - m3[6][i]; + m1[7][i] = m3[5][i] - m3[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + iSumHad += abs(m2[i][j]); + } + } + iSumHad -= abs(m2[0][0]); + iSumHad = (iSumHad + 2) >> 2; + return(iSumHad); +} + /** * Estimate number of bits used for headers of the current picture. * \param state the main encoder state @@ -155,6 +316,29 @@ state->previous_encoder_state->frame->cur_gop_target_bits; } + if (state->frame->is_irap && encoder->cfg.intra_bit_allocation) { + int total_cost = 0; + for (int y = 0; y < encoder->cfg.height; y += 8) { + for (int x = 0; x < encoder->cfg.width; x += 8) { + int cost = xCalcHADs8x8_ISlice(state->tile->frame->source->y + x, y, state->tile->frame->source->stride); + total_cost += cost; + kvz_get_lcu_stats(state, x / 64, y / 64)->i_cost += cost; + } + }
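`xCalcHADs8x8_ISlice` (following the HM reference encoder) estimates per-8×8-block activity for intra bit allocation: an 8×8 Hadamard transform of the raw pixels via butterfly passes, then the sum of absolute coefficients minus the DC term, rounded with `(sum + 2) >> 2`. A scalar Python sketch using a fast Walsh–Hadamard transform — the coefficient ordering differs from the butterfly code above, but the absolute-sum and the DC term are the same, so the result matches:

```python
def fwht(v):
    """Unnormalized fast Walsh-Hadamard transform of a length-2^k list."""
    v = list(v)
    h = 1
    while h < len(v):
        for i in range(0, len(v), h * 2):
            for j in range(i, i + h):
                a, b = v[j], v[j + h]
                v[j], v[j + h] = a + b, a - b
        h *= 2
    return v

def had8x8_islice_cost(block):
    """block: 8x8 list of pixel values.  Mirrors the arithmetic of
    xCalcHADs8x8_ISlice: 2-D Hadamard, sum of |coeff| minus the DC
    coefficient, then (sum + 2) >> 2."""
    rows = [fwht(r) for r in block]                                  # horizontal pass
    cols = [fwht([rows[y][x] for y in range(8)]) for x in range(8)]  # vertical pass
    total = sum(abs(c) for col in cols for c in col)
    total -= abs(cols[0][0])                                         # drop the DC term
    return (total + 2) >> 2
```

Flat blocks cost zero, so the rate control can allocate fewer intra bits to them.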
kvazaar-1.3.0.tar.gz/src/rate_control.h -> kvazaar-2.0.0.tar.gz/src/rate_control.h
Changed
@@ -29,10 +29,39 @@ #include "global.h" // IWYU pragma: keep #include "encoderstate.h" +#include "pthread.h" + +typedef struct kvz_rc_data { + double *c_para[KVZ_MAX_GOP_LAYERS]; + double *k_para[KVZ_MAX_GOP_LAYERS]; + double pic_c_para[KVZ_MAX_GOP_LAYERS]; + double pic_k_para[KVZ_MAX_GOP_LAYERS]; + double previous_lambdas[KVZ_MAX_GOP_LAYERS + 1]; + double previous_frame_lambda; + double *intra_bpp; + double *intra_dis; + double intra_pic_distortion; + double intra_pic_bpp; + + double intra_alpha; + double intra_beta; + + pthread_rwlock_t ck_ctu_lock[KVZ_MAX_GOP_LAYERS]; + pthread_mutex_t ck_frame_lock; + pthread_mutex_t lambda_lock; + pthread_mutex_t intra_lock; +} kvz_rc_data; + +kvz_rc_data * kvz_get_rc_data(const encoder_control_t * const encoder); +void kvz_free_rc_data(); void kvz_set_picture_lambda_and_qp(encoder_state_t * const state); void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, vector2d_t pos); +void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos); +void kvz_update_after_picture(encoder_state_t * const state); +void kvz_estimate_pic_lambda(encoder_state_t * const state); + #endif // RATE_CONTROL_H_
kvazaar-1.3.0.tar.gz/src/sao.c -> kvazaar-2.0.0.tar.gz/src/sao.c
Changed
@@ -157,29 +157,36 @@ return mode_bits; } - /** * \brief calculate an array of intensity correlations for each intensity value */ +// NOTE: There's also an AVX2 variant of this in strategies/avx2/sao-avx2.c. +// It has to be separate, because it returns the offset array in different +// format (an array of YMM vectors). void kvz_calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i) { - int val; - int values = (1<<encoder->bitdepth); - int shift = encoder->bitdepth-5; - int band_pos = (color_i == COLOR_V) ? 1 : 0; + int32_t val; + const int32_t values = (1<<encoder->bitdepth); + const int32_t shift = encoder->bitdepth-5; + const int32_t band_pos = (color_i == COLOR_V) ? 1 : 0; + const int32_t cur_bp = sao->band_position[band_pos]; // Loop through all intensity values and construct an offset array for (val = 0; val < values; val++) { - int cur_band = val>>shift; - if (cur_band >= sao->band_position[band_pos] && cur_band < sao->band_position[band_pos] + 4) { - offset[val] = CLIP(0, values - 1, val + sao->offsets[cur_band - sao->band_position[band_pos] + 1 + 5 * band_pos]); + int32_t cur_band = val >> shift; + int32_t cb_minus_cbp = cur_band - cur_bp; + + if (cb_minus_cbp >= 0 && cb_minus_cbp <= 3) { + uint32_t offset_id = cb_minus_cbp + 1 + 5 * band_pos; + int32_t val_unclipped = val + sao->offsets[offset_id]; + offset[val] = CLIP(0, values - 1, val_unclipped); + } else { offset[val] = val; } } } - /** * \param orig_data Original pixel data. 64x64 for luma, 32x32 for chroma. * \param rec_data Reconstructed pixel data. 64x64 for luma, 32x32 for chroma. 
@@ -254,8 +261,11 @@ //Loop pixels and take top 5 bits to classify different bands for (y = 0; y < block_height; ++y) { for (x = 0; x < block_width; ++x) { - sao_bands[0][rec_data[y * block_width + x]>>shift] += orig_data[y * block_width + x] - rec_data[y * block_width + x]; - sao_bands[1][rec_data[y * block_width + x]>>shift]++; + int32_t curr_pos = y * block_width + x; + + kvz_pixel sb_index = rec_data[curr_pos] >> shift; + sao_bands[0][sb_index] += orig_data[curr_pos] - rec_data[curr_pos]; + sao_bands[1][sb_index]++; } } }
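The refactored `kvz_calc_sao_offset_array` builds a lookup table over every intensity value: a pixel's band is its top five bits, and only the four bands starting at `band_position` receive an offset (clipped to the valid pixel range); every other value maps to itself. A scalar Python sketch of the same table construction, with the per-component `band_pos` index folded out for brevity:

```python
def clip(lo, hi, v):
    return max(lo, min(hi, v))

def calc_sao_offset_array(bitdepth, band_position, offsets):
    """Scalar version of kvz_calc_sao_offset_array for one color plane.
    offsets[1..4] carry the four band offsets, matching the 1-based
    indexing used in the C code."""
    values = 1 << bitdepth
    shift = bitdepth - 5            # keep the top 5 bits -> 32 bands
    table = []
    for val in range(values):
        band = val >> shift
        rel = band - band_position  # cb_minus_cbp in the C code
        if 0 <= rel <= 3:
            table.append(clip(0, values - 1, val + offsets[rel + 1]))
        else:
            table.append(val)       # outside the 4-band window: identity
    return table

# 8-bit content, bands 10..13 adjusted: values 80..111 are affected.
table = calc_sao_offset_array(8, 10, [0, 3, -2, 0, 5])
```

Precomputing the table lets the filter loop replace each pixel with a single lookup.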
kvazaar-1.3.0.tar.gz/src/search.c -> kvazaar-2.0.0.tar.gz/src/search.c
Changed
@@ -455,6 +455,11 @@ uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; + struct { + int32_t min; + int32_t max; + } pu_depth_inter, pu_depth_intra; + lcu_t *const lcu = &work_tree[depth]; int x_local = SUB_SCU(x); @@ -466,6 +471,21 @@ return 0; } + int gop_layer = ctrl->cfg.gop_len != 0 ? ctrl->cfg.gop[state->frame->gop_offset].layer - 1 : 0; + + // Assign correct depth limit + constraint_t* constr = state->constraint; + if(constr->ml_intra_depth_ctu) { + pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8]; + pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8]; + } + else { + pu_depth_intra.min = ctrl->cfg.pu_depth_intra.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.min[gop_layer] : ctrl->cfg.pu_depth_intra.min[0]; + pu_depth_intra.max = ctrl->cfg.pu_depth_intra.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.max[gop_layer] : ctrl->cfg.pu_depth_intra.max[0]; + } + pu_depth_inter.min = ctrl->cfg.pu_depth_inter.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.min[gop_layer] : ctrl->cfg.pu_depth_inter.min[0]; + pu_depth_inter.max = ctrl->cfg.pu_depth_inter.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.max[gop_layer] : ctrl->cfg.pu_depth_inter.max[0]; + cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); // Assign correct depth cur_cu->depth = depth > MAX_DEPTH ? 
MAX_DEPTH : depth; @@ -479,12 +499,12 @@ if (x + cu_width <= frame->width && y + cu_width <= frame->height) { - int cu_width_inter_min = LCU_WIDTH >> ctrl->cfg.pu_depth_inter.max; + int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = state->frame->slicetype != KVZ_SLICE_I && depth <= MAX_DEPTH && ( - WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || + WITHIN(depth, pu_depth_inter.min, pu_depth_inter.max) || // When the split was forced because the CTU is partially outside the // frame, we permit inter coding even if pu_depth_inter would // otherwise forbid it. @@ -520,11 +540,11 @@ const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; for (int i = first_mode; i <= last_mode; ++i) { kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1], - &mode_cost, &mode_bitcost); + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; inter_bitcost = mode_bitcost; @@ -543,9 +563,9 @@ && cost / (cu_width * cu_width) < INTRA_THRESHOLD) || (ctrl->cfg.early_skip && cur_cu->skipped); - int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max; + int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - WITHIN(depth, ctrl->cfg.pu_depth_intra.min, ctrl->cfg.pu_depth_intra.max) || + WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. 
@@ -604,20 +624,21 @@ } kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); - kvz_inter_recon_cu(state, lcu, x, y, cu_width); + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_inter_recon_cu(state, lcu, x, y, cu_width, true, has_chroma); - if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { //Calculate cost for zero coeffs inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; } - const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_quantize_lcu_residual(state, true, has_chroma, x, y, depth, NULL, - lcu); + lcu, + false); int cbf = cbf_is_set_any(cur_cu->cbf, depth); @@ -650,7 +671,7 @@ cost += mode_bits * state->lambda; - if (inter_zero_coeff_cost <= cost) { + if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; // Restore saved pixels from lower level of the working tree. @@ -677,9 +698,9 @@ // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. cur_cu->type == CU_NOTSET || - depth < ctrl->cfg.pu_depth_intra.max || + depth < pu_depth_intra.max || (state->frame->slicetype != KVZ_SLICE_I && - depth < ctrl->cfg.pu_depth_inter.max); + depth < pu_depth_inter.max); // Recursively split all the way to max search depth. if (can_split_cu) { @@ -937,11 +958,21 @@ work_tree[depth] = work_tree[0]; } + // If the ML depth prediction is enabled, + // generate the depth prediction interval + // for the current lcu + constraint_t* constr = state->constraint; + if (constr->ml_intra_depth_ctu) { + kvz_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree[0].ref.y, state->qp); + } + // Start search from depth 0. double cost = search_cu(state, x, y, 0, work_tree); // Save squared cost for rate control. 
- kvz_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight = cost * cost; + if(state->encoder_control->cfg.rc_algorithm == KVZ_LAMBDA) { + kvz_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight = cost * cost; + } // The best decisions through out the LCU got propagated back to depth 0, // so copy those back to the frame.
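In the search.c changes above, PU depth limits become per-GOP-layer, with layer 0's value serving as the fallback whenever a layer-specific limit is unset (encoded as a negative number). The selection logic reduces to:

```python
def pick_depth_limit(per_layer_limits, gop_layer):
    """Mirror of the pu_depth_{intra,inter} fallback in search_cu():
    a negative entry means 'not configured for this layer', in which
    case the layer-0 value applies."""
    v = per_layer_limits[gop_layer]
    return v if v >= 0 else per_layer_limits[0]

# Layer 2 is unconfigured (-1), so it falls back to layer 0's limit.
assert pick_depth_limit([1, 2, -1, 3], 2) == 1
assert pick_depth_limit([1, 2, -1, 3], 3) == 3
```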
kvazaar-1.3.0.tar.gz/src/search.h -> kvazaar-2.0.0.tar.gz/src/search.h
Changed
@@ -30,6 +30,7 @@ #include "encoderstate.h" #include "global.h" // IWYU pragma: keep #include "image.h" +#include "constraint.h" void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
kvazaar-1.3.0.tar.gz/src/search_inter.c -> kvazaar-2.0.0.tar.gz/src/search_inter.c
Changed
@@ -1135,15 +1135,56 @@ if (src.malloc_used) free(src.buffer); } +/** +* \brief Calculate the scaled MV +*/ +static INLINE int16_t get_scaled_mv(int16_t mv, int scale) +{ + int32_t scaled = scale * mv; + return CLIP(-32768, 32767, (scaled + 127 + (scaled < 0)) >> 8); +} +/** +* \brief Scale the MV according to the POC difference +* +* \param current_poc POC of current frame +* \param current_ref_poc POC of reference frame +* \param neighbor_poc POC of neighbor frame +* \param neighbor_ref_poc POC of neighbors reference frame +* \param mv_cand MV candidates to scale +*/ +static void apply_mv_scaling(int32_t current_poc, + int32_t current_ref_poc, + int32_t neighbor_poc, + int32_t neighbor_ref_poc, + vector2d_t* mv_cand) +{ + int32_t diff_current = current_poc - current_ref_poc; + int32_t diff_neighbor = neighbor_poc - neighbor_ref_poc; + + if (diff_current == diff_neighbor) return; + if (diff_neighbor == 0) return; + + diff_current = CLIP(-128, 127, diff_current); + diff_neighbor = CLIP(-128, 127, diff_neighbor); + + int scale = CLIP(-4096, 4095, + (diff_current * ((0x4000 + (abs(diff_neighbor) >> 1)) / diff_neighbor) + 32) >> 6); + + mv_cand->x = get_scaled_mv(mv_cand->x, scale); + mv_cand->y = get_scaled_mv(mv_cand->y, scale); +} + /** * \brief Perform inter search for a single reference frame. 
*/ static void search_pu_inter_ref(inter_search_info_t *info, - int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost) + int depth, + lcu_t *lcu, cu_info_t *cur_cu, + double *inter_cost, + uint32_t *inter_bitcost, + double *best_LX_cost, + cu_info_t *unipred_LX) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1153,20 +1194,20 @@ int8_t LX_idx; // max value of LX_idx plus one const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0], - info->state->frame->ref_LX_size[1]); + info->state->frame->ref_LX_size[1]); for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++) { // check if ref_idx is in L0 if (LX_idx < info->state->frame->ref_LX_size[0] && - info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { + info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { ref_list = 0; break; } // check if ref_idx is in L1 if (LX_idx < info->state->frame->ref_LX_size[1] && - info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { + info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { ref_list = 1; break; } @@ -1194,22 +1235,57 @@ cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; vector2d_t mv = { 0, 0 }; - { - // Take starting point for MV search from previous frame. - // When temporal motion vector candidates are added, there is probably - // no point to this anymore, but for now it helps. - const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); - const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); - const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; - const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); - if (ref_cu->type == CU_INTER) { - if (ref_cu->inter.mv_dir & 1) { - mv.x = ref_cu->inter.mv[0][0]; - mv.y = ref_cu->inter.mv[0][1]; - } else { - mv.x = ref_cu->inter.mv[1][0]; - mv.y = ref_cu->inter.mv[1][1]; + + // Take starting point for MV search from previous frame. 
+ // When temporal motion vector candidates are added, there is probably + // no point to this anymore, but for now it helps. + const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); + const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); + const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; + const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); + if (ref_cu->type == CU_INTER) { + if (ref_cu->inter.mv_dir & 1) { + mv.x = ref_cu->inter.mv[0][0]; + mv.y = ref_cu->inter.mv[0][1]; + } + else { + mv.x = ref_cu->inter.mv[1][0]; + mv.y = ref_cu->inter.mv[1][1]; + } + // Apply mv scaling if neighbor poc is available + if (info->state->frame->ref_LX_size[ref_list] > 0) { + // When there are reference pictures from the future (POC > current POC) + // in L0 or L1, the primary list for the colocated PU is the inverse of + // collocated_from_l0_flag. Otherwise it is equal to reflist. + // + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = ref_list; + for (int i = 0; i < info->state->frame->ref->used_size; i++) { + if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { + col_list = 1; + break; + } } + if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { + // Use the other list if the colocated PU does not have a MV for the + // primary list. 
+ col_list = 1 - col_list; + } + + uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; + // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument + apply_mv_scaling( + info->state->frame->poc, + info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], + info->state->frame->ref->pocs[neighbor_poc_index], + info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ + info->state->frame->ref->ref_LXs[neighbor_poc_index] + [col_list] + [ref_cu->inter.mv_ref[col_list]] + ], + &mv + ); } } @@ -1303,6 +1379,23 @@ *inter_cost = info->best_cost; *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; } + + + // Update best unipreds for biprediction + if (info->best_cost < best_LX_cost[ref_list]) { + bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); + if (valid_mv) { + // Map reference index to L0/L1 pictures + unipred_LX[ref_list].inter.mv_dir = ref_list + 1; + unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; + unipred_LX[ref_list].inter.mv[ref_list][0] = (int16_t)mv.x; + unipred_LX[ref_list].inter.mv[ref_list][1] = (int16_t)mv.y; + + CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand); + + best_LX_cost[ref_list] = info->best_cost; + } + } } @@ -1365,7 +1458,9 @@ width, height, mv, - lcu); + lcu, + true, + false); const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &frame->source->y[x + y * frame->source->width]; @@ -1442,6 +1537,37 @@ } /** + * \brief Check if an identical merge candidate exists in a list + * + * \param all_cand Full list of available merge candidates
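The new `apply_mv_scaling` stretches a colocated neighbor's motion vector by the ratio of POC distances, using the fixed-point recipe familiar from HEVC temporal MV prediction: both distances are clipped to [-128, 127], their ratio becomes a fixed-point `scale` clipped to [-4096, 4095], and the product is rounded back with `(scaled + 127 + (scaled < 0)) >> 8`. A scalar Python sketch of the two functions shown above (note that C integer division truncates toward zero, hence the `int(a / b)`):

```python
def clip(lo, hi, v):
    return max(lo, min(hi, v))

def get_scaled_mv(mv, scale):
    scaled = scale * mv
    return clip(-32768, 32767, (scaled + 127 + (1 if scaled < 0 else 0)) >> 8)

def apply_mv_scaling(cur_poc, cur_ref_poc, nb_poc, nb_ref_poc, mv):
    """(mvx, mvy) scaled by the ratio of POC distances, as in
    apply_mv_scaling() from search_inter.c."""
    d_cur = cur_poc - cur_ref_poc
    d_nb = nb_poc - nb_ref_poc
    if d_cur == d_nb or d_nb == 0:
        return mv                     # same distance or degenerate: unchanged
    d_cur = clip(-128, 127, d_cur)
    d_nb = clip(-128, 127, d_nb)
    factor = int((0x4000 + (abs(d_nb) >> 1)) / d_nb)  # truncating, like C's '/'
    scale = clip(-4096, 4095, (d_cur * factor + 32) >> 6)
    return (get_scaled_mv(mv[0], scale), get_scaled_mv(mv[1], scale))

# Current reference is twice as far as the neighbor's: the MV roughly doubles.
assert apply_mv_scaling(8, 4, 8, 6, (10, -6)) == (20, -12)
```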
kvazaar-1.3.0.tar.gz/src/search_intra.c -> kvazaar-2.0.0.tar.gz/src/search_intra.c
Changed
@@ -406,8 +406,9 @@ kvz_pixels_blit(orig, orig_block, width, width, origstride, width); int8_t modes_selected = 0; - unsigned min_cost = UINT_MAX; - unsigned max_cost = 0; + // Note: get_cost and get_cost_dual may return negative costs. + int32_t min_cost = INT_MAX; + int32_t max_cost = INT_MIN; // Initial offset decides how many modes are tried before moving on to the // recursive search.
kvazaar-1.3.0.tar.gz/src/strategies/altivec/picture-altivec.c -> kvazaar-2.0.0.tar.gz/src/strategies/altivec/picture-altivec.c
Changed
@@ -21,7 +21,9 @@ #include "strategies/altivec/picture-altivec.h" #if COMPILE_POWERPC_ALTIVEC +#undef bool #include <altivec.h> +#define bool _Bool #include <stdlib.h> #include "kvazaar.h"
kvazaar-1.3.0.tar.gz/src/strategies/avx2/avx2_common_functions.h -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/avx2_common_functions.h
Changed
@@ -3,6 +3,30 @@ #include <immintrin.h> +// The calling convention used by MSVC on 32-bit builds will essentially +// disallow functions to have more than 3 XMM/YMM parameters, because it +// will not provide more than 8-byte param alignment, and only the first +// three vector params will be carried in SIMD registers. Now the +// vectorcall convention could probably be problematic in globally visible +// funcitons, but likely not in static ones. +#if defined _MSC_VER && defined _WIN32 && !defined _WIN64 + #define FIX_W32 __vectorcall +#else + #define FIX_W32 +#endif + +// Non-inline functions defined in this header are likely to trigger a +// warning for each module including this header that does NOT use them, +// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW). +// Tell 'em we actually want to do that, it's not an accident. +#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__ + #define FIX_UNUSED __attribute__((unused)) +#else + #define FIX_UNUSED +#endif + +#define FIX_NOINLINE FIX_W32 FIX_UNUSED + /* * Reorder coefficients from raster to scan order * Fun fact: Once upon a time, doing this in a loop looked like this: @@ -111,4 +135,19 @@ *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1; } +static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v) +{ + __m256i sum1 = v; + __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sum3 = _mm256_add_epi32 (sum1, sum2); + __m256i sum4 = _mm256_shuffle_epi32 (sum3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sum5 = _mm256_add_epi32 (sum3, sum4); + __m256i sum6 = _mm256_shuffle_epi32 (sum5, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i sum7 = _mm256_add_epi32 (sum5, sum6); + + __m128i sum8 = _mm256_castsi256_si128 (sum7); + int32_t sum9 = _mm_cvtsi128_si32 (sum8); + return sum9; +} + #endif
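The new `hsum_8x32b` helper reduces eight 32-bit lanes to one scalar via a permute/add ladder. Modeling the lane shuffles in plain Python (a `__m256i` becomes an 8-element list) makes it easy to check that the sequence really sums all eight lanes — the shuffle selectors below are the `(d, c, b, a)` tuples produced by `_MM_SHUFFLE`:

```python
def permute4x64(v, sel):
    """_mm256_permute4x64_epi64 on a vector modeled as 8 x 32-bit lanes:
    sel = (d, c, b, a) picks 64-bit quadwords a, b, c, d for positions 0..3."""
    d, c, b, a = sel
    q = [v[0:2], v[2:4], v[4:6], v[6:8]]
    return q[a] + q[b] + q[c] + q[d]

def shuffle_epi32(v, sel):
    """_mm256_shuffle_epi32: the same 32-bit shuffle applied to each
    128-bit half independently."""
    d, c, b, a = sel
    out = []
    for half in (v[0:4], v[4:8]):
        out += [half[a], half[b], half[c], half[d]]
    return out

def hsum_8x32b(v):
    s2 = permute4x64(v, (1, 0, 3, 2))          # swap 128-bit halves
    s3 = [x + y for x, y in zip(v, s2)]        # lane i + lane i+4
    s4 = shuffle_epi32(s3, (1, 0, 3, 2))
    s5 = [x + y for x, y in zip(s3, s4)]
    s6 = shuffle_epi32(s5, (2, 3, 0, 1))
    s7 = [x + y for x, y in zip(s5, s6)]       # every lane now holds the total
    return s7[0]   # _mm256_castsi256_si128 + _mm_cvtsi128_si32

assert hsum_8x32b([1, 2, 3, 4, 5, 6, 7, 8]) == 36
```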
kvazaar-1.3.0.tar.gz/src/strategies/avx2/dct-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/dct-avx2.c
Changed
@@ -47,262 +47,834 @@ * \brief AVX2 transformations. */ +static INLINE __m256i swap_lanes(__m256i v) +{ + return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2)); +} + +static INLINE __m256i truncate(__m256i v, __m256i debias, int32_t shift) +{ + __m256i truncable = _mm256_add_epi32 (v, debias); + return _mm256_srai_epi32(truncable, shift); +} + // 4x4 matrix multiplication with value clipping. // Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses, // destination for the result and the shift value for clipping. -static void mul_clip_matrix_4x4_avx2(const int16_t *left, const int16_t *right, int16_t *dst, int32_t shift) +static __m256i mul_clip_matrix_4x4_avx2(const __m256i left, const __m256i right, int shift) { - __m256i b[2], a, result, even[2], odd[2]; + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); - const int32_t add = 1 << (shift - 1); + __m256i right_los = _mm256_permute4x64_epi64(right, _MM_SHUFFLE(2, 0, 2, 0)); + __m256i right_his = _mm256_permute4x64_epi64(right, _MM_SHUFFLE(3, 1, 3, 1)); - a = _mm256_loadu_si256((__m256i*) left); - b[0] = _mm256_loadu_si256((__m256i*) right); + __m256i right_cols_up = _mm256_unpacklo_epi16(right_los, right_his); + __m256i right_cols_dn = _mm256_unpackhi_epi16(right_los, right_his); - // Interleave values in both 128-bit lanes - b[0] = _mm256_unpacklo_epi16(b[0], _mm256_srli_si256(b[0], 8)); - b[1] = _mm256_permute2x128_si256(b[0], b[0], 1 + 16); - b[0] = _mm256_permute2x128_si256(b[0], b[0], 0); + __m256i left_slice1 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i left_slice2 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i left_slice3 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i left_slice4 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(3, 3, 3, 3)); - // Fill both 128-lanes with the first pair of 16-bit factors in the lane. 
- even[0] = _mm256_shuffle_epi32(a, 0); - odd[0] = _mm256_shuffle_epi32(a, 1 + 4 + 16 + 64); + __m256i prod1 = _mm256_madd_epi16(left_slice1, right_cols_up); + __m256i prod2 = _mm256_madd_epi16(left_slice2, right_cols_dn); + __m256i prod3 = _mm256_madd_epi16(left_slice3, right_cols_up); + __m256i prod4 = _mm256_madd_epi16(left_slice4, right_cols_dn); - // Multiply packed elements and sum pairs. Input 16-bit output 32-bit. - even[0] = _mm256_madd_epi16(even[0], b[0]); - odd[0] = _mm256_madd_epi16(odd[0], b[1]); + __m256i rows_up = _mm256_add_epi32(prod1, prod2); + __m256i rows_dn = _mm256_add_epi32(prod3, prod4); - // Add the halves of the dot product and - // round. - result = _mm256_add_epi32(even[0], odd[0]); - result = _mm256_add_epi32(result, _mm256_set1_epi32(add)); - result = _mm256_srai_epi32(result, shift); + __m256i rows_up_tr = truncate(rows_up, debias, shift); + __m256i rows_dn_tr = truncate(rows_dn, debias, shift); - //Repeat for the remaining parts - even[1] = _mm256_shuffle_epi32(a, 2 + 8 + 32 + 128); - odd[1] = _mm256_shuffle_epi32(a, 3 + 12 + 48 + 192); + __m256i result = _mm256_packs_epi32(rows_up_tr, rows_dn_tr); + return result; +} - even[1] = _mm256_madd_epi16(even[1], b[0]); - odd[1] = _mm256_madd_epi16(odd[1], b[1]); +static void matrix_dst_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = kvz_g_convert_to_bit[4] + 1 + (bitdepth - 8); + int32_t shift_2nd = kvz_g_convert_to_bit[4] + 8; + const int16_t *tdst = &kvz_g_dst_4_t[0][0]; + const int16_t *dst = &kvz_g_dst_4 [0][0]; - odd[1] = _mm256_add_epi32(even[1], odd[1]); - odd[1] = _mm256_add_epi32(odd[1], _mm256_set1_epi32(add)); - odd[1] = _mm256_srai_epi32(odd[1], shift); + __m256i tdst_v = _mm256_load_si256((const __m256i *) tdst); + __m256i dst_v = _mm256_load_si256((const __m256i *) dst); + __m256i in_v = _mm256_load_si256((const __m256i *)input); - // Truncate to 16-bit values - result = _mm256_packs_epi32(result, odd[1]); + __m256i tmp = 
mul_clip_matrix_4x4_avx2(in_v, tdst_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(dst_v, tmp, shift_2nd); - _mm256_storeu_si256((__m256i*)dst, result); + _mm256_store_si256((__m256i *)output, result); +} + +static void matrix_idst_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = 7; + int32_t shift_2nd = 12 - (bitdepth - 8); + + const int16_t *tdst = &kvz_g_dst_4_t[0][0]; + const int16_t *dst = &kvz_g_dst_4 [0][0]; + + __m256i tdst_v = _mm256_load_si256((const __m256i *)tdst); + __m256i dst_v = _mm256_load_si256((const __m256i *) dst); + __m256i in_v = _mm256_load_si256((const __m256i *)input); + + __m256i tmp = mul_clip_matrix_4x4_avx2(tdst_v, in_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(tmp, dst_v, shift_2nd); + + _mm256_store_si256((__m256i *)output, result); +} + +static void matrix_dct_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = kvz_g_convert_to_bit[4] + 1 + (bitdepth - 8); + int32_t shift_2nd = kvz_g_convert_to_bit[4] + 8; + const int16_t *tdct = &kvz_g_dct_4_t[0][0]; + const int16_t *dct = &kvz_g_dct_4 [0][0]; + + __m256i tdct_v = _mm256_load_si256((const __m256i *) tdct); + __m256i dct_v = _mm256_load_si256((const __m256i *) dct); + __m256i in_v = _mm256_load_si256((const __m256i *)input); + + __m256i tmp = mul_clip_matrix_4x4_avx2(in_v, tdct_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(dct_v, tmp, shift_2nd); + + _mm256_store_si256((__m256i *)output, result); +} + +static void matrix_idct_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = 7; + int32_t shift_2nd = 12 - (bitdepth - 8); + + const int16_t *tdct = &kvz_g_dct_4_t[0][0]; + const int16_t *dct = &kvz_g_dct_4 [0][0]; + + __m256i tdct_v = _mm256_load_si256((const __m256i *)tdct); + __m256i dct_v = _mm256_load_si256((const __m256i *) dct); + __m256i in_v = _mm256_load_si256((const __m256i *)input); + + __m256i tmp = 
mul_clip_matrix_4x4_avx2(tdct_v, in_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(tmp, dct_v, shift_2nd); + + _mm256_store_si256((__m256i *)output, result); } -// 8x8 matrix multiplication with value clipping. -// Parameters: Two 8x8 matrices containing 16-bit values in consecutive addresses, -// destination for the result and the shift value for clipping. -// static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift) { - int i, j; - __m256i b[2], accu[8], even[2], odd[2]; + const __m256i transp_mask = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15)); + + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + __m256i left_dr[4] = { + _mm256_load_si256((const __m256i *)left + 0), + _mm256_load_si256((const __m256i *)left + 1), + _mm256_load_si256((const __m256i *)left + 2), + _mm256_load_si256((const __m256i *)left + 3), + }; + __m256i right_dr[4] = { + _mm256_load_si256((const __m256i *)right + 0), + _mm256_load_si256((const __m256i *)right + 1), + _mm256_load_si256((const __m256i *)right + 2), + _mm256_load_si256((const __m256i *)right + 3), + }; + + __m256i rdrs_rearr[8]; + + // Rearrange right matrix + for (int32_t dry = 0; dry < 4; dry++) { + __m256i rdr = right_dr[dry]; + __m256i rdr_los = _mm256_permute4x64_epi64(rdr, _MM_SHUFFLE(2, 0, 2, 0)); + __m256i rdr_his = _mm256_permute4x64_epi64(rdr, _MM_SHUFFLE(3, 1, 3, 1)); + + __m256i rdr_lo_rearr = _mm256_shuffle_epi8(rdr_los, transp_mask); + __m256i rdr_hi_rearr = _mm256_shuffle_epi8(rdr_his, transp_mask); + + rdrs_rearr[dry * 2 + 0] = rdr_lo_rearr; + rdrs_rearr[dry * 2 + 1] = rdr_hi_rearr; + } + + // Double-Row Y for destination matrix + for (int32_t dry = 0; dry < 4; dry++) { + __m256i ldr = left_dr[dry]; - const int32_t add = 1 << (shift - 1); + __m256i ldr_slice12 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i ldr_slice34 = 
_mm256_shuffle_epi32(ldr, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i ldr_slice56 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i ldr_slice78 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(3, 3, 3, 3));
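Every transform wrapper in the hunk above funnels through mul_clip_matrix_4x4_avx2 with a stage-specific shift. As a reading aid, here is a scalar sketch of that helper's contract (the function name is mine, not from the diff): a 4x4 product accumulated in 32 bits like _mm256_madd_epi16, rounded by the bias, shifted, then saturated to int16_t the way _mm256_packs_epi32 saturates its inputs.

```c
#include <stdint.h>

// Scalar sketch (name mine) of what mul_clip_matrix_4x4_avx2 computes:
// each output element is a dot product of a left row and a right column,
// rounded by "add", shifted right, and clamped to the int16_t range.
static void mul_clip_matrix_4x4_scalar(const int16_t *left,
                                       const int16_t *right,
                                       int16_t *dst,
                                       int32_t shift)
{
  const int32_t add = 1 << (shift - 1);
  for (int y = 0; y < 4; y++) {
    for (int x = 0; x < 4; x++) {
      int32_t sum = 0;
      for (int k = 0; k < 4; k++) {
        sum += (int32_t)left[y * 4 + k] * (int32_t)right[k * 4 + x];
      }
      sum = (sum + add) >> shift;
      if (sum > INT16_MAX) sum = INT16_MAX;  // saturate like packs_epi32
      if (sum < INT16_MIN) sum = INT16_MIN;
      dst[y * 4 + x] = (int16_t)sum;
    }
  }
}
```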
kvazaar-1.3.0.tar.gz/src/strategies/avx2/intra-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/intra-avx2.c
Changed
@@ -26,6 +26,7 @@ #include "kvazaar.h" #include "strategyselector.h" +#include "strategies/missing-intel-intrinsics.h" /** @@ -416,7 +417,7 @@ tmp_ref[x + width] = ref_main[x]; } // Get a pointer to block index 0 in tmp_ref. - ref_main = &tmp_ref[width]; + ref_main = tmp_ref + width; // Extend the side reference to the negative indices of main reference. int_fast32_t col_sample_disp = 128; // rounding for the ">> 8" @@ -453,7 +454,6 @@ } } - /** * \brief Generate planar prediction. * \param log2_width Log2 of width, range 2..5. @@ -500,19 +500,411 @@ _mm_storel_epi64((__m128i*)&(dst[y * width + x]), chunk); } } - } else { - // Unoptimized version for reference. - for (int y = 0; y < width; ++y) { - for (int x = 0; x < width; ++x) { - int_fast16_t hor = (width - 1 - x) * ref_left[y + 1] + (x + 1) * top_right; - int_fast16_t ver = (width - 1 - y) * ref_top[x + 1] + (y + 1) * bottom_left; - dst[y * width + x] = (ver + hor + width) >> (log2_width + 1); - } + // Only if log2_width == 2 <=> width == 4 + assert(width == 4); + const __m128i rl_shufmask = _mm_setr_epi32(0x04040404, 0x05050505, + 0x06060606, 0x07070707); + + const __m128i xp1 = _mm_set1_epi32 (0x04030201); + const __m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask); + + const __m128i rdist = _mm_set1_epi32 (0x00010203); + const __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask); + + const __m128i wid16 = _mm_set1_epi16 (width); + const __m128i tr = _mm_set1_epi8 (top_right); + const __m128i bl = _mm_set1_epi8 (bottom_left); + + uint32_t rt14 = *(const uint32_t *)(ref_top + 1); + uint32_t rl14 = *(const uint32_t *)(ref_left + 1); + uint64_t rt14_64 = (uint64_t)rt14; + uint64_t rl14_64 = (uint64_t)rl14; + uint64_t rtl14 = rt14_64 | (rl14_64 << 32); + + __m128i rtl_v = _mm_cvtsi64_si128 (rtl14); + __m128i rt = _mm_broadcastd_epi32(rtl_v); + __m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask); + + __m128i rtrl_l = _mm_unpacklo_epi8 (rt, rl); + __m128i rtrl_h = _mm_unpackhi_epi8 (rt, rl); + + __m128i bdrd_l = 
_mm_unpacklo_epi8 (bdist, rdist); + __m128i bdrd_h = _mm_unpackhi_epi8 (bdist, rdist); + + __m128i hvs_lo = _mm_maddubs_epi16 (rtrl_l, bdrd_l); + __m128i hvs_hi = _mm_maddubs_epi16 (rtrl_h, bdrd_h); + + __m128i xp1yp1_l = _mm_unpacklo_epi8 (xp1, yp1); + __m128i xp1yp1_h = _mm_unpackhi_epi8 (xp1, yp1); + __m128i trbl_lh = _mm_unpacklo_epi8 (tr, bl); + + __m128i addend_l = _mm_maddubs_epi16 (trbl_lh, xp1yp1_l); + __m128i addend_h = _mm_maddubs_epi16 (trbl_lh, xp1yp1_h); + + addend_l = _mm_add_epi16 (addend_l, wid16); + addend_h = _mm_add_epi16 (addend_h, wid16); + + __m128i sum_l = _mm_add_epi16 (hvs_lo, addend_l); + __m128i sum_h = _mm_add_epi16 (hvs_hi, addend_h); + + // Shift right by log2_width + 1 + __m128i sum_l_t = _mm_srli_epi16 (sum_l, 3); + __m128i sum_h_t = _mm_srli_epi16 (sum_h, 3); + __m128i result = _mm_packus_epi16 (sum_l_t, sum_h_t); + _mm_storeu_si128((__m128i *)dst, result); + } +} + +// Calculate the DC value for a 4x4 block. The algorithm uses slightly +// different addends, multipliers etc for different pixels in the block, +// but for a fixed-size implementation one vector wide, all the weights, +// addends etc can be preinitialized for each position. +static void pred_filtered_dc_4x4(const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) +{ + const uint32_t rt_u32 = *(const uint32_t *)(ref_top + 1); + const uint32_t rl_u32 = *(const uint32_t *)(ref_left + 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i twos = _mm_set1_epi8(2); + + // Hack. Move 4 u8's to bit positions 0, 64, 128 and 192 in two regs, to + // expand them to 16 bits sort of "for free". Set highest bits on all the + // other bytes in vectors to zero those bits in the result vector. 
+ const __m128i rl_shuf_lo = _mm_setr_epi32(0x80808000, 0x80808080, + 0x80808001, 0x80808080); + const __m128i rl_shuf_hi = _mm_add_epi8 (rl_shuf_lo, twos); + + // Every second multiplier is 1, because we want maddubs to calculate + // a + bc = 1 * a + bc (actually 2 + bc). We need to fill a vector with + // ((u8)2)'s for other stuff anyway, so that can also be used here. + const __m128i mult_lo = _mm_setr_epi32(0x01030102, 0x01030103, + 0x01040103, 0x01040104); + const __m128i mult_hi = _mm_setr_epi32(0x01040103, 0x01040104, + 0x01040103, 0x01040104); + __m128i four = _mm_cvtsi32_si128 (4); + __m128i rt = _mm_cvtsi32_si128 (rt_u32); + __m128i rl = _mm_cvtsi32_si128 (rl_u32); + __m128i rtrl = _mm_unpacklo_epi32 (rt, rl); + + __m128i sad0 = _mm_sad_epu8 (rtrl, zero); + __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad2 = _mm_add_epi64 (sad0, sad1); + __m128i sad3 = _mm_add_epi64 (sad2, four); + + __m128i dc_64 = _mm_srli_epi64 (sad3, 3); + __m128i dc_8 = _mm_broadcastb_epi8(dc_64); + + __m128i rl_lo = _mm_shuffle_epi8 (rl, rl_shuf_lo); + __m128i rl_hi = _mm_shuffle_epi8 (rl, rl_shuf_hi); + + __m128i rt_lo = _mm_unpacklo_epi8 (rt, zero); + __m128i rt_hi = zero; + + __m128i dc_addend = _mm_unpacklo_epi8(dc_8, twos); + + __m128i dc_multd_lo = _mm_maddubs_epi16(dc_addend, mult_lo); + __m128i dc_multd_hi = _mm_maddubs_epi16(dc_addend, mult_hi); + + __m128i rl_rt_lo = _mm_add_epi16 (rl_lo, rt_lo); + __m128i rl_rt_hi = _mm_add_epi16 (rl_hi, rt_hi); + + __m128i res_lo = _mm_add_epi16 (dc_multd_lo, rl_rt_lo); + __m128i res_hi = _mm_add_epi16 (dc_multd_hi, rl_rt_hi); + + res_lo = _mm_srli_epi16 (res_lo, 2); + res_hi = _mm_srli_epi16 (res_hi, 2); + + __m128i final = _mm_packus_epi16 (res_lo, res_hi); + _mm_storeu_si128((__m128i *)out_block, final); +} + +static void pred_filtered_dc_8x8(const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) +{ + const uint64_t rt_u64 = *(const uint64_t *)(ref_top + 1); + const uint64_t rl_u64 = 
*(const uint64_t *)(ref_left + 1); + + const __m128i zero128 = _mm_setzero_si128(); + const __m256i twos = _mm256_set1_epi8(2); + + // DC multiplier is 2 at (0, 0), 3 at (*, 0) and (0, *), and 4 at (*, *). + // There is a constant addend of 2 on each pixel, use values from the twos + // register and multipliers of 1 for that, to use maddubs for an (a*b)+c + // operation. + const __m256i mult_up_lo = _mm256_setr_epi32(0x01030102, 0x01030103, + 0x01030103, 0x01030103, + 0x01040103, 0x01040104, + 0x01040104, 0x01040104); + + // The 6 lowest rows have same multipliers, also the DC values and addends + // are the same so this works for all of those + const __m256i mult_rest = _mm256_permute4x64_epi64(mult_up_lo, _MM_SHUFFLE(3, 2, 3, 2)); + + // Every 8-pixel row starts with the next pixel of ref_left. Along with + // doing the shuffling, also expand u8->u16, ie. move bytes 0 and 1 from + // ref_left to bit positions 0 and 128 in rl_up_lo, 2 and 3 to rl_up_hi, + // etc. The places to be zeroed out are 0x80 instead of the usual 0xff, + // because this allows us to form new masks on the fly by adding 0x02-bytes + // to this mask and still retain the highest bits as 1 where things should + // be zeroed out. + const __m256i rl_shuf_up_lo = _mm256_setr_epi32(0x80808000, 0x80808080, + 0x80808080, 0x80808080, + 0x80808001, 0x80808080, + 0x80808080, 0x80808080); + // And don't waste memory or architectural regs, hope these instructions + // will be placed in between the shuffles by the compiler to only use one + // register for the shufmasks, and executed way ahead of time because their + // regs can be renamed. + const __m256i rl_shuf_up_hi = _mm256_add_epi8 (rl_shuf_up_lo, twos); + const __m256i rl_shuf_dn_lo = _mm256_add_epi8 (rl_shuf_up_hi, twos); + const __m256i rl_shuf_dn_hi = _mm256_add_epi8 (rl_shuf_dn_lo, twos); +
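The removed reference loop in this hunk spells out what the vectorized 4x4 planar path computes; keeping it here as a standalone scalar sketch makes the weights easier to follow (argument names are mine):

```c
#include <stdint.h>

// Scalar planar prediction for a width x width block, matching the
// reference loop the hunk above replaces: each pixel blends the top and
// left references with the top-right and bottom-left corner samples.
static void intra_pred_planar_scalar(int log2_width,
                                     const uint8_t *ref_top,
                                     const uint8_t *ref_left,
                                     uint8_t *dst)
{
  const int width = 1 << log2_width;
  const uint8_t top_right   = ref_top[width + 1];
  const uint8_t bottom_left = ref_left[width + 1];
  for (int y = 0; y < width; ++y) {
    for (int x = 0; x < width; ++x) {
      int hor = (width - 1 - x) * ref_left[y + 1] + (x + 1) * top_right;
      int ver = (width - 1 - y) * ref_top[x + 1] + (y + 1) * bottom_left;
      dst[y * width + x] = (uint8_t)((ver + hor + width) >> (log2_width + 1));
    }
  }
}
```

With constant references the prediction collapses to that constant, which makes a convenient sanity check.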
kvazaar-1.3.0.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/picture-avx2.c
Changed
@@ -756,251 +756,6 @@ } } -static void inter_recon_bipred_no_mov_avx2( - const int height, - const int width, - const int ypos, - const int xpos, - const hi_prec_buf_t*high_precision_rec0, - const hi_prec_buf_t*high_precision_rec1, - lcu_t* lcu, - kvz_pixel* temp_lcu_y, - kvz_pixel* temp_lcu_u, - kvz_pixel* temp_lcu_v) { - - // This function is used only when kvazaar can't find any movement from the current block - int y_in_lcu, x_in_lcu; - __m256i sample0_epi8, sample1_epi8, temp_y_epi8; - int32_t * pointer = 0; - - for (int temp_y = 0; temp_y < height; temp_y += 1) { - y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - - for (int temp_x = 0; temp_x < width; temp_x += 32) { - - x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - - switch (width) - { - - case 4: - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); - - break; - - case 8: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 64-bits from vector to memory - _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); - - break; - - case 12: - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, 
sample1_epi8); - - // Store 64-bits from vector to memory - _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); - - x_in_lcu = ((xpos + temp_x + 8) & ((LCU_WIDTH)-1)); - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); - break; - - - case 16: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 128-bit to memory - _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); - - break; - - case 32: - - sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - - // Store 256-bit integers to memory - _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_y_epi8); - break; - - default: - // If width is something strange size, use this - for (int temp_i = 0; temp_i < width; ++temp_i) { - x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); - - int sample0_y = (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int sample1_y = (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = 
(kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y) >> 1); - } - - - } - - if (temp_x < width >> 1 && temp_y < height >> 1) { - y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); - x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - - __m256i temp_u_epi8; - __m256i temp_v_epi8; - - - switch (width) - { - - case 8: - - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); - - pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); - - break; - - case 12: - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = 
_mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); - - pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); - - // This is used only with odd shaped objects - for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { - int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); - - int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); - } - - break; - - case 16: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 64-bit integer into memory - _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); - - // Store 64-bit integer into memory - _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); - - break; - - case 32: - - 
sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
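The no-motion bipred path removed above leans entirely on _mm256_avg_epu8, i.e. an unsigned rounding average. Its per-pixel scalar equivalent (function name mine) is:

```c
#include <stdint.h>

// Scalar equivalent of one _mm256_avg_epu8 lane: (a + b + 1) >> 1,
// computed in 16 bits so the +1 rounding bias cannot overflow.
static uint8_t avg_epu8_scalar(uint8_t a, uint8_t b)
{
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}
```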
kvazaar-1.3.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -621,6 +621,7 @@
 * \param pred_in      Predicted pixels.
 * \param rec_out      Reconstructed pixels.
 * \param coeff_out    Coefficients used for reconstruction of rec_out.
+* \param early_skip   if this is used for early skip, bypass IT and IQ
 *
 * \returns  Whether coeff_out contains any non-zero coefficients.
 */
@@ -629,11 +630,12 @@
                           const coeff_scan_order_t scan_order,
                           const int use_trskip,
                           const int in_stride, const int out_stride,
                           const kvz_pixel *const ref_in,
                           const kvz_pixel *const pred_in,
-                          kvz_pixel *rec_out, coeff_t *coeff_out)
+                          kvz_pixel *rec_out, coeff_t *coeff_out,
+                          bool early_skip)
 {
   // Temporary arrays to pass data to and from kvz_quant and transform functions.
-  int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
-  coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];

   int has_coeffs = 0;

@@ -673,7 +675,7 @@

   // Do the inverse quantization and transformation and the reconstruction to
   // rec_out.
-  if (has_coeffs) {
+  if (has_coeffs && !early_skip) {
     // Get quantized residual. (coeff_out -> coeff -> residual)
     kvz_dequant(state, coeff_out, coeff, width, width,
                 (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)),
                 cur_cu->type);
kvazaar-1.3.0.tar.gz/src/strategies/avx2/sao-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/sao-avx2.c
Changed
@@ -22,7 +22,12 @@ #if COMPILE_INTEL_AVX2 #include <immintrin.h> +#include <nmmintrin.h> +// Use a couple generic functions from here as a worst-case fallback +#include "strategies/generic/sao_shared_generics.h" +#include "strategies/avx2/avx2_common_functions.h" +#include "strategies/missing-intel-intrinsics.h" #include "cu.h" #include "encoder.h" #include "encoderstate.h" @@ -30,324 +35,853 @@ #include "sao.h" #include "strategyselector.h" - // These optimizations are based heavily on sao-generic.c. // Might be useful to check that if (when) this file // is difficult to understand. - -static INLINE __m128i load_6_pixels(const kvz_pixel* data) +// Do the SIGN3 operation for the difference a-b +static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b) { - return _mm_insert_epi16(_mm_cvtsi32_si128(*(int32_t*)&(data[0])), *(int16_t*)&(data[4]), 2); + // Subtract 0x80 from unsigneds to compare them as signed + const __m256i epu2epi = _mm256_set1_epi8 (0x80); + const __m256i ones = _mm256_set1_epi8 (0x01); + + __m256i a_signed = _mm256_sub_epi8 (a, epu2epi); + __m256i b_signed = _mm256_sub_epi8 (b, epu2epi); + + __m256i diff = _mm256_subs_epi8 (a_signed, b_signed); + return _mm256_sign_epi8 (ones, diff); } -static INLINE __m256i load_5_offsets(const int* offsets) +// Mapping of edge_idx values to eo-classes, 32x8b at once +static __m256i FIX_W32 calc_eo_cat(const __m256i a, + const __m256i b, + const __m256i c) { - return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*) offsets)), _mm_insert_epi32(_mm_setzero_si128(), offsets[4], 0), 1); + const __m256i twos = _mm256_set1_epi8 (0x02); + const __m256i idx_to_cat = _mm256_setr_epi64x(0x0403000201, 0, + 0x0403000201, 0); + + __m256i c_a_sign = sign3_diff_epu8 (c, a); + __m256i c_b_sign = sign3_diff_epu8 (c, b); + + __m256i signsum = _mm256_add_epi8 (c_a_sign, c_b_sign); + __m256i eo_idx = _mm256_add_epi8 (signsum, twos); + + return _mm256_shuffle_epi8(idx_to_cat, eo_idx); } +static 
INLINE __m256i srli_epi8(const __m256i v, + const uint32_t shift) +{ + const uint8_t hibit_mask = 0xff >> shift; + const __m256i hibit_mask_256 = _mm256_set1_epi8(hibit_mask); + + __m256i v_shifted = _mm256_srli_epi32(v, shift); + __m256i v_masked = _mm256_and_si256 (v_shifted, hibit_mask_256); -static __m128i sao_calc_eo_cat_avx2(__m128i* a, __m128i* b, __m128i* c) + return v_masked; +} + +static INLINE void cvt_epu8_epi16(const __m256i v, + __m256i *res_lo, + __m256i *res_hi) { - __m128i v_eo_idx = _mm_set1_epi16(2); - __m128i v_a = _mm_cvtepu8_epi16(*a); - __m128i v_c = _mm_cvtepu8_epi16(*c); - __m128i v_b = _mm_cvtepu8_epi16(*b); - - __m128i temp_a = _mm_sign_epi16(_mm_set1_epi16(1), _mm_sub_epi16(v_c, v_a)); - __m128i temp_b = _mm_sign_epi16(_mm_set1_epi16(1), _mm_sub_epi16(v_c, v_b)); - v_eo_idx = _mm_add_epi16(v_eo_idx, temp_a); - v_eo_idx = _mm_add_epi16(v_eo_idx, temp_b); - - v_eo_idx = _mm_packus_epi16(v_eo_idx, v_eo_idx); - __m128i v_cat_lookup = _mm_setr_epi8(1,2,0,3,4,0,0,0,0,0,0,0,0,0,0,0); - __m128i v_cat = _mm_shuffle_epi8(v_cat_lookup, v_eo_idx); - - - return v_cat; + const __m256i zero = _mm256_setzero_si256(); + *res_lo = _mm256_unpacklo_epi8(v, zero); + *res_hi = _mm256_unpackhi_epi8(v, zero); } +static INLINE void cvt_epi8_epi16(const __m256i v, + __m256i *res_lo, + __m256i *res_hi) +{ + const __m256i zero = _mm256_setzero_si256(); + __m256i signs = _mm256_cmpgt_epi8 (zero, v); + *res_lo = _mm256_unpacklo_epi8(v, signs); + *res_hi = _mm256_unpackhi_epi8(v, signs); +} -static int sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int eo_class, - int offsets[NUM_SAO_EDGE_CATEGORIES]) +static INLINE void diff_epi8_epi16(const __m256i a, + const __m256i b, + __m256i *res_lo, + __m256i *res_hi) { - int y, x; - int sum = 0; - vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + const __m256i invmask = 
_mm256_set1_epi16(0xff01); - __m256i v_accum = { 0 }; + __m256i composite_lo = _mm256_unpacklo_epi8(a, b); + __m256i composite_hi = _mm256_unpackhi_epi8(a, b); - for (y = 1; y < block_height - 1; ++y) { + *res_lo = _mm256_maddubs_epi16(composite_lo, invmask); + *res_hi = _mm256_maddubs_epi16(composite_hi, invmask); +} - for (x = 1; x < block_width - 8; x+=8) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; +// Convert a byte-addressed mask for VPSHUFB into two word-addressed ones, for +// example: +// 7 3 6 2 5 1 4 0 => e f 6 7 c d 4 5 a b 2 3 8 9 0 1 +static INLINE void cvt_shufmask_epi8_epi16(const __m256i v, + __m256i *res_lo, + __m256i *res_hi) +{ + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi8(1); + + // There's no 8-bit shift, so highest bit could bleed into neighboring byte + // if set. To avoid it, reset all sign bits with max. The only valid input + // values for v are [0, 7] anyway and invalid places should be masked out by + // caller, so it doesn't matter that we turn negative bytes into garbage. + __m256i v_nonnegs = _mm256_max_epi8 (zero, v); + __m256i v_lobytes = _mm256_slli_epi32(v_nonnegs, 1); + __m256i v_hibytes = _mm256_add_epi8 (v_lobytes, ones); + + *res_lo = _mm256_unpacklo_epi8(v_lobytes, v_hibytes); + *res_hi = _mm256_unpackhi_epi8(v_lobytes, v_hibytes); +} - __m128i v_c_data = _mm_loadl_epi64((__m128i*)c_data); - __m128i v_a = _mm_loadl_epi64((__m128i*)(&c_data[a_ofs.y * block_width + a_ofs.x])); - __m128i v_c = v_c_data; - __m128i v_b = _mm_loadl_epi64((__m128i*)(&c_data[b_ofs.y * block_width + b_ofs.x])); +// Check if all 4 dwords of v are in [-128, 127] and can be truncated to +// 8 bits each. 
Returns -1 if everything is fine +static INLINE uint16_t epi32v_fits_in_epi8s(const __m128i v) +{ + // Compare most significant 25 bits of SAO bands to the sign bit to assert + // that the i32's are between -128 and 127 (only comparing 24 would fail to + // detect values of 128...255) + __m128i v_ms25b = _mm_srai_epi32 (v, 7); + __m128i v_signs = _mm_srai_epi32 (v, 31); + __m128i ok_i32s = _mm_cmpeq_epi32 (v_ms25b, v_signs); + return _mm_movemask_epi8(ok_i32s); +} - __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); +static INLINE __m128i truncate_epi32_epi8(const __m128i v) +{ + // LSBs of each dword, the values values must fit in 8 bits anyway for + // what this intended for (use epi32v_fits_in_epi8s to check if needed) + const __m128i trunc_shufmask = _mm_set1_epi32 (0x0c080400); + __m128i sbs_8 = _mm_shuffle_epi8(v, trunc_shufmask); + return sbs_8; +} - __m256i v_offset = load_5_offsets(offsets); - v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat); - - __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x]))); - v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); - __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); - __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); - v_accum = _mm256_add_epi32(v_accum, v_temp_sum); - } +// Read 0-3 bytes (pixels) into uint32_t +static INLINE uint32_t load_border_bytes(const uint8_t *buf, + const int32_t start_pos, + const int32_t width_rest) +{ + uint32_t last_dword = 0; + for (int32_t i = 0; i < width_rest; i++) {
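sign3_diff_epu8 and calc_eo_cat above vectorize the standard SAO edge classification; the idx_to_cat constant 0x0403000201 is the byte table {1, 2, 0, 3, 4}. A scalar sketch of the same mapping (helper names mine):

```c
#include <stdint.h>

// sign3: -1, 0 or +1 depending on the sign of v.
static int sign3(int v) { return (v > 0) - (v < 0); }

// Scalar counterpart of calc_eo_cat: sign(c-a) + sign(c-b) + 2 yields an
// edge index in [0, 4], mapped to an SAO edge-offset category exactly
// like the 0x0403000201 shuffle constant in the hunk above.
static int sao_eo_cat(uint8_t a, uint8_t b, uint8_t c)
{
  static const int idx_to_cat[5] = { 1, 2, 0, 3, 4 };
  return idx_to_cat[sign3((int)c - (int)a) + sign3((int)c - (int)b) + 2];
}
```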
kvazaar-1.3.0.tar.gz/src/strategies/generic/dct-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/dct-generic.c
Changed
@@ -23,7 +23,7 @@ #include "strategyselector.h" #include "tables.h" -const int16_t kvz_g_dst_4[4][4] = +ALIGNED(32) const int16_t kvz_g_dst_4[4][4] = { { 29, 55, 74, 84 }, { 74, 74, 0, -74 }, @@ -31,7 +31,7 @@ { 55, -84, 74, -29 } }; -const int16_t kvz_g_dct_4[4][4] = +ALIGNED(32) const int16_t kvz_g_dct_4[4][4] = { { 64, 64, 64, 64 }, { 83, 36, -36, -83 }, @@ -39,7 +39,7 @@ { 36, -83, 83, -36 } }; -const int16_t kvz_g_dct_8[8][8] = +ALIGNED(64) const int16_t kvz_g_dct_8[8][8] = { { 64, 64, 64, 64, 64, 64, 64, 64 }, { 89, 75, 50, 18, -18, -50, -75, -89 }, @@ -51,7 +51,7 @@ { 18, -50, 75, -89, 89, -75, 50, -18 } }; -const int16_t kvz_g_dct_16[16][16] = +ALIGNED(64) const int16_t kvz_g_dct_16[16][16] = { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 }, @@ -71,7 +71,7 @@ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 } }; -const int16_t kvz_g_dct_32[32][32] = +ALIGNED(64) const int16_t kvz_g_dct_32[32][32] = { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, @@ -107,7 +107,7 @@ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 } }; -const int16_t kvz_g_dst_4_t[4][4] = +ALIGNED(32) const int16_t kvz_g_dst_4_t[4][4] = { { 29, 74, 84, 55 }, { 55, 74, -29, -84 }, @@ -115,7 +115,7 @@ { 84, -74, 55, -29 } }; -const int16_t kvz_g_dct_4_t[4][4] = +ALIGNED(32) const int16_t kvz_g_dct_4_t[4][4] = { { 64, 83, 64, 36, }, { 64, 36, -64, -83, }, @@ -123,7 +123,7 @@ { 64, -83, 64, -36 } }; -const int16_t kvz_g_dct_8_t[8][8] = +ALIGNED(64) const int16_t kvz_g_dct_8_t[8][8] = { { 64, 89, 83, 75, 64, 50, 36, 18, }, { 64, 75, 36, -18, -64, -89, -83, -50, 
}, @@ -135,7 +135,7 @@ { 64, -89, 83, -75, 64, -50, 36, -18 } }; -const int16_t kvz_g_dct_16_t[16][16] = +ALIGNED(64) const int16_t kvz_g_dct_16_t[16][16] = { { 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, }, { 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, }, @@ -155,7 +155,7 @@ { 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 } }; -const int16_t kvz_g_dct_32_t[32][32] = +ALIGNED(64) const int16_t kvz_g_dct_32_t[32][32] = { { 64, 90, 90, 90, 89, 88, 87, 85, 83, 82, 80, 78, 75, 73, 70, 67, 64, 61, 57, 54, 50, 46, 43, 38, 36, 31, 25, 22, 18, 13, 9, 4, }, { 64, 90, 87, 82, 75, 67, 57, 46, 36, 22, 9, -4, -18, -31, -43, -54, -64, -73, -80, -85, -89, -90, -90, -88, -83, -78, -70, -61, -50, -38, -25, -13, },
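The `*_t` tables that gain ALIGNED attributes here are the transposes of the corresponding forward tables, which is what lets the AVX2 code pick either operand order per transform stage. A quick self-contained check using local copies of the 4x4 DCT values listed in the hunk above:

```c
#include <stdint.h>

// Local copies of kvz_g_dct_4 and kvz_g_dct_4_t as listed in the diff.
static const int16_t dct4[4][4] = {
  { 64,  64,  64,  64 },
  { 83,  36, -36, -83 },
  { 64, -64, -64,  64 },
  { 36, -83,  83, -36 },
};
static const int16_t dct4_t[4][4] = {
  { 64,  83,  64,  36 },
  { 64,  36, -64, -83 },
  { 64, -36, -64,  83 },
  { 64, -83,  64, -36 },
};

// Returns 1 when b is the transpose of a.
static int is_transpose_4x4(const int16_t a[4][4], const int16_t b[4][4])
{
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++)
      if (a[y][x] != b[x][y]) return 0;
  return 1;
}
```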
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c
Changed
@@ -227,16 +227,16 @@ } } if (be_valid && sign_hidden) { - coeff_signs = coeff_signs >> 1; - if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); - } + coeff_signs = coeff_signs >> 1; + if (!cabac->only_count) + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); + } CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); } else { if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); } @@ -247,12 +247,12 @@ int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; if (abs_coeff[idx] >= base_level) { - if (!cabac->only_count) { - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) - kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); - else - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - } else + if (!cabac->only_count) { + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) + kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); + else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + } else kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); if (abs_coeff[idx] > 3 * (1 << go_rice_param)) {
kvazaar-1.3.0.tar.gz/src/strategies/generic/intra-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/intra-generic.c
Changed
@@ -188,12 +188,54 @@ #endif } +/** +* \brief Generage intra DC prediction with post filtering applied. +* \param log2_width Log2 of width, range 2..5. +* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. +* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. +* \param dst Buffer of size width*width. +*/ +static void kvz_intra_pred_filtered_dc_generic( + const int_fast8_t log2_width, + const kvz_pixel *const ref_top, + const kvz_pixel *const ref_left, + kvz_pixel *const out_block) +{ + assert(log2_width >= 2 && log2_width <= 5); + + const int_fast8_t width = 1 << log2_width; + + int_fast16_t sum = 0; + for (int_fast8_t i = 0; i < width; ++i) { + sum += ref_top[i + 1]; + sum += ref_left[i + 1]; + } + + const kvz_pixel dc_val = (sum + width) >> (log2_width + 1); + + // Filter top-left with ([1 2 1] / 4) + out_block[0] = (ref_left[1] + 2 * dc_val + ref_top[1] + 2) / 4; + + // Filter rest of the boundary with ([1 3] / 4) + for (int_fast8_t x = 1; x < width; ++x) { + out_block[x] = (ref_top[x + 1] + 3 * dc_val + 2) / 4; + } + for (int_fast8_t y = 1; y < width; ++y) { + out_block[y * width] = (ref_left[y + 1] + 3 * dc_val + 2) / 4; + for (int_fast8_t x = 1; x < width; ++x) { + out_block[y * width + x] = dc_val; + } + } +} + + int kvz_strategy_register_intra_generic(void* opaque, uint8_t bitdepth) { bool success = true; success &= kvz_strategyselector_register(opaque, "angular_pred", "generic", 0, &kvz_angular_pred_generic); success &= kvz_strategyselector_register(opaque, "intra_pred_planar", "generic", 0, &kvz_intra_pred_planar_generic); + success &= kvz_strategyselector_register(opaque, "intra_pred_filtered_dc", "generic", 0, &kvz_intra_pred_filtered_dc_generic); return success; }
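The new `kvz_intra_pred_filtered_dc_generic` above averages the reference samples into a DC value and then smooths the block's top and left edges. A self-contained scalar sketch of the same arithmetic, fixed to a 4x4 block and 8-bit pixels for illustration (the function name and `uint8_t` types are assumptions of this sketch, not kvazaar API):

```c
#include <stdint.h>

/* Sketch of HEVC filtered DC prediction for a 4x4 block, mirroring the
 * arithmetic of kvz_intra_pred_filtered_dc_generic. As in kvazaar, ref_top
 * and ref_left point at index -1 of the reference rows, so ref_top[1] is the
 * first sample above the block. */
void filtered_dc_4x4(const uint8_t *ref_top, const uint8_t *ref_left,
                     uint8_t *out /* 16 pixels, row-major */)
{
  const int width = 4, log2_width = 2;

  int sum = 0;
  for (int i = 0; i < width; ++i) {
    sum += ref_top[i + 1];
    sum += ref_left[i + 1];
  }
  const uint8_t dc = (uint8_t)((sum + width) >> (log2_width + 1));

  /* Top-left corner: [1 2 1] / 4 filter across both references. */
  out[0] = (uint8_t)((ref_left[1] + 2 * dc + ref_top[1] + 2) / 4);

  /* Remaining edge pixels: [1 3] / 4 filter against the DC value. */
  for (int x = 1; x < width; ++x)
    out[x] = (uint8_t)((ref_top[x + 1] + 3 * dc + 2) / 4);
  for (int y = 1; y < width; ++y) {
    out[y * width] = (uint8_t)((ref_left[y + 1] + 3 * dc + 2) / 4);
    for (int x = 1; x < width; ++x)
      out[y * width + x] = dc;  /* interior pixels are plain DC */
  }
}
```

For uniform references the whole block collapses to the DC value; with differing top/left references only the filtered edge rows deviate from it.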
kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/picture-generic.c
Changed
@@ -536,54 +536,58 @@ } static void inter_recon_bipred_generic(const int hi_prec_luma_rec0, - const int hi_prec_luma_rec1, - const int hi_prec_chroma_rec0, - const int hi_prec_chroma_rec1, - int32_t height, - int32_t width, - int32_t ypos, - int32_t xpos, - const hi_prec_buf_t*high_precision_rec0, - const hi_prec_buf_t*high_precision_rec1, - lcu_t* lcu, - kvz_pixel* temp_lcu_y, - kvz_pixel* temp_lcu_u, - kvz_pixel* temp_lcu_v) { - - int shift = 15 - KVZ_BIT_DEPTH; - int offset = 1 << (shift - 1); - - int y_in_lcu; - int x_in_lcu; - - //After reconstruction, merge the predictors by taking an average of each pixel - for (int temp_y = 0; temp_y < height; ++temp_y) { - - - for (int temp_x = 0; temp_x < width; ++temp_x) { - y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - - int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); - - if (temp_x < width >> 1 && temp_y < height >> 1) { - - y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); - x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - - int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - - int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); - } - } - } + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int32_t height, + int32_t width, + int32_t ypos, + int32_t xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v, + bool predict_luma, + bool predict_chroma) { + + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + + int y_in_lcu; + int x_in_lcu; + + //After reconstruction, merge the predictors by taking an average of each pixel + for (int temp_y = 0; temp_y < height; ++temp_y) { + + + for (int temp_x = 0; temp_x < width; ++temp_x) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + if (predict_luma) { + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + + if (predict_chroma && (temp_x < width >> 1 && temp_y < height >> 1)) { + + y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } + } } @@ -671,6 +675,32 @@ return result; } +// Calculate pixel value variance. Takes in arrays of kvz_pixel +static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len) +{ + double var = 0; + double arr_mean = 0; + + // Calculate array mean + int i = 0; + double sum = 0; + + for (; i < len; ++i) { + sum += arr[i]; + } + arr_mean = sum / (double)len; + + // Calculate array variance + for (i = 0; i < len; ++i) { + double tmp = (double)arr[i] - arr_mean; + var += tmp*tmp; + } + + var /= len; + + return var; +} + int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -710,5 +740,7 @@ success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic); success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic); + success &= kvz_strategyselector_register(opaque, "pixel_var", "generic", 0, &pixel_var_generic); + return success; }
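The `pixel_var_generic` function added above is the generic backend of the new `pixel_var` strategy used by variance adaptive quantization (`--vaq`). A self-contained scalar sketch of the same computation, with `uint8_t` standing in for `kvz_pixel` (an assumption of this sketch; kvazaar's pixel type depends on bit depth):

```c
#include <stdint.h>

/* Population variance of a pixel array, following pixel_var_generic:
 * mean first, then the mean of squared deviations. */
double pixel_var_sketch(const uint8_t *arr, uint32_t len)
{
  double sum = 0.0;
  for (uint32_t i = 0; i < len; ++i) sum += arr[i];
  const double mean = sum / (double)len;

  double var = 0.0;
  for (uint32_t i = 0; i < len; ++i) {
    const double d = (double)arr[i] - mean;
    var += d * d;
  }
  return var / (double)len;  /* divide by N: population, not sample, variance */
}
```

The AVX2 strategy mentioned in the changelog computes the same quantity; only the summation is vectorized.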
kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -178,6 +178,7 @@ * \param pred_in Predicted pixels. * \param rec_out Reconstructed pixels. * \param coeff_out Coefficients used for reconstruction of rec_out. +* \param early_skip if this is used for early skip, bypass IT and IQ * * \returns Whether coeff_out contains any non-zero coefficients. */ @@ -186,11 +187,12 @@ const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out) + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip) { // Temporary arrays to pass data to and from kvz_quant and transform functions. - int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; - coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; + ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; int has_coeffs = 0; @@ -241,7 +243,7 @@ // Do the inverse quantization and transformation and the reconstruction to // rec_out. - if (has_coeffs) { + if (has_coeffs && !early_skip) { int y, x; // Get quantized residual. (coeff_out -> coeff -> residual)
kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.h -> kvazaar-2.0.0.tar.gz/src/strategies/generic/quant-generic.h
Changed
@@ -1,48 +1,49 @@ -#ifndef STRATEGIES_QUANT_GENERIC_H_ -#define STRATEGIES_QUANT_GENERIC_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (C) 2013-2015 Tampere University of Technology and others (see - * COPYING file). - * - * Kvazaar is free software: you can redistribute it and/or modify it under - * the terms of the GNU Lesser General Public License as published by the - * Free Software Foundation; either version 2.1 of the License, or (at your - * option) any later version. - * - * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along - * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Generic C implementations of optimized functions. - */ - -#include "cu.h" -#include "encoderstate.h" -#include "global.h" // IWYU pragma: keep -#include "kvazaar.h" -#include "tables.h" - -#define QUANT_SHIFT 14 - -int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth); -void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, int8_t type, int8_t scan_idx, int8_t block_type); - -int kvz_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, - const coeff_scan_order_t scan_order, const int use_trskip, - const int in_stride, const int out_stride, - const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out); - -#endif //STRATEGIES_QUANT_GENERIC_H_ +#ifndef STRATEGIES_QUANT_GENERIC_H_ +#define STRATEGIES_QUANT_GENERIC_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. + */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "tables.h" + +#define QUANT_SHIFT 14 + +int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth); +void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, + int32_t height, int8_t type, int8_t scan_idx, int8_t block_type); + +int kvz_quantize_residual_generic(encoder_state_t *const state, + const cu_info_t *const cur_cu, const int width, const color_t color, + const coeff_scan_order_t scan_order, const int use_trskip, + const int in_stride, const int out_stride, + const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip); + +#endif //STRATEGIES_QUANT_GENERIC_H_
kvazaar-1.3.0.tar.gz/src/strategies/generic/sao-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/sao-generic.c
Changed
@@ -19,6 +19,7 @@ ****************************************************************************/ #include "strategies/generic/sao-generic.h" +#include "strategies/generic/sao_shared_generics.h" #include "cu.h" #include "encoder.h" @@ -28,51 +29,6 @@ #include "strategyselector.h" -// Mapping of edge_idx values to eo-classes. -static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c) -{ - // Mapping relationships between a, b and c to eo_idx. - static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 }; - - int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); - - return sao_eo_idx_to_eo_category[eo_idx]; -} - - -static int sao_edge_ddistortion_generic(const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int eo_class, - int offsets[NUM_SAO_EDGE_CATEGORIES]) -{ - int y, x; - int sum = 0; - vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; - - for (y = 1; y < block_height - 1; ++y) { - for (x = 1; x < block_width - 1; ++x) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; - kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x]; - kvz_pixel c = c_data[0]; - kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x]; - - int offset = offsets[sao_calc_eo_cat(a, b, c)]; - - if (offset != 0) { - int diff = orig_data[y * block_width + x] - c; - // Offset is applied to reconstruction, so it is subtracted from diff. - sum += (diff - offset) * (diff - offset) - diff * diff; - } - } - } - - return sum; -} - - /** * \param orig_data Original pixel data. 64x64 for luma, 32x32 for chroma. * \param rec_data Reconstructed pixel data. 64x64 for luma, 32x32 for chroma. @@ -93,6 +49,9 @@ // Don't sample the edge pixels because this function doesn't have access to // their neighbours. + + + for (y = 1; y < block_height - 1; ++y) { for (x = 1; x < block_width - 1; ++x) { const kvz_pixel *c_data = &rec_data[y * block_width + x]; @@ -152,36 +111,6 @@ } -static int sao_band_ddistortion_generic(const encoder_state_t * const state, - const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int band_pos, - int sao_bands[4]) -{ - int y, x; - int shift = state->encoder_control->bitdepth-5; - int sum = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - int band = (rec_data[y * block_width + x] >> shift) - band_pos; - int offset = 0; - if (band >= 0 && band < 4) { - offset = sao_bands[band]; - } - if (offset != 0) { - int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x]; - // Offset is applied to reconstruction, so it is subtracted from diff. - sum += (diff - offset) * (diff - offset) - diff * diff; - } - } - } - - return sum; -} - int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth) {
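The `sao_calc_eo_cat` helper being moved out of this file classifies each pixel against its two neighbours along the edge-offset direction. A minimal sketch of that classification, with `sign3()` standing in for kvazaar's `SIGN3` macro (the function names here are illustrative):

```c
/* SAO edge-offset classification: compare pixel c against neighbours a and b
 * along the EO class direction, then remap the 0..4 index so that category 0
 * means "no edge" (neither a local extremum nor a concave/convex edge). */
int sign3(int v) { return (v > 0) - (v < 0); }

int eo_category(int a, int b, int c)
{
  static const int eo_idx_to_category[] = { 1, 2, 0, 3, 4 };
  int eo_idx = 2 + sign3(c - a) + sign3(c - b);
  return eo_idx_to_category[eo_idx];
}
```

Local minima map to category 1, local maxima to 4, half-edges to 2 and 3, and flat or monotone neighbourhoods to 0, which receives no offset.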
kvazaar-2.0.0.tar.gz/src/strategies/generic/sao_shared_generics.h
Added
@@ -0,0 +1,97 @@ +#ifndef SAO_BAND_DDISTORTION_H_ +#define SAO_BAND_DDISTORTION_H_ + +// #include "encoder.h" +#include "encoderstate.h" +#include "kvazaar.h" +#include "sao.h" + +// Mapping of edge_idx values to eo-classes. +static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c) +{ + // Mapping relationships between a, b and c to eo_idx. + static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 }; + + int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); + + return sao_eo_idx_to_eo_category[eo_idx]; +} + +static int sao_edge_ddistortion_generic(const kvz_pixel *orig_data, + const kvz_pixel *rec_data, + int32_t block_width, + int32_t block_height, + int32_t eo_class, + const int32_t offsets[NUM_SAO_EDGE_CATEGORIES]) +{ + int y, x; + int32_t sum = 0; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + + for (y = 1; y < block_height - 1; y++) { + for (x = 1; x < block_width - 1; x++) { + uint32_t c_pos = y * block_width + x; + uint32_t a_pos = (y + a_ofs.y) * block_width + x + a_ofs.x; + uint32_t b_pos = (y + b_ofs.y) * block_width + x + b_ofs.x; + + uint8_t a = rec_data[a_pos]; + uint8_t b = rec_data[b_pos]; + uint8_t c = rec_data[c_pos]; + uint8_t orig = orig_data[c_pos]; + + int32_t eo_cat = sao_calc_eo_cat(a, b, c); + int32_t offset = offsets[eo_cat]; + + if (offset != 0) { + int32_t diff = orig - c; + int32_t delta = diff - offset; + int32_t curr = delta * delta - diff * diff; + + sum += curr; + } + } + } + return sum; +} + +static int sao_band_ddistortion_generic(const encoder_state_t * const state, + const kvz_pixel *orig_data, + const kvz_pixel *rec_data, + int block_width, + int block_height, + int band_pos, + const int sao_bands[4]) +{ + int y, x; + int shift = state->encoder_control->bitdepth-5; + int sum = 0; + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + const int32_t curr_pos = y * block_width + x; + + kvz_pixel rec = rec_data[curr_pos]; + kvz_pixel orig = orig_data[curr_pos]; + + int32_t band = (rec >> shift) - band_pos; + int32_t offset = 0; + if (band >= 0 && band <= 3) { + offset = sao_bands[band]; + } + // Offset is applied to reconstruction, so it is subtracted from diff. + + int32_t diff = orig - rec; + int32_t delta = diff - offset; + + int32_t dmask = (offset == 0) ? -1 : 0; + diff &= ~dmask; + delta &= ~dmask; + + sum += delta * delta - diff * diff; + } + } + + return sum; +} + +#endif
kvazaar-1.3.0.tar.gz/src/strategies/missing-intel-intrinsics.h -> kvazaar-2.0.0.tar.gz/src/strategies/missing-intel-intrinsics.h
Changed
@@ -18,4 +18,18 @@ #endif // __andn_u32 #endif // _andn_u32 +// Some Visual Studio headers apparently lack these pseudoinstructions +#if COMPILE_INTEL_AVX2 + #ifndef _mm256_bsrli_epi128 + #define _mm256_bsrli_epi128(a, imm8) _mm256_srli_si256((a), (imm8)) + #endif + #ifndef _mm256_insert_epi32 + #define _mm256_insert_epi32(a, i, index) (_mm256_blend_epi32((a), _mm256_set1_epi32(i), (1 << (index)))) + #endif + + #ifndef _mm256_extract_epi32 + #define _mm256_extract_epi32(a, index) (_mm_extract_epi32(_mm256_extracti128_si256((a), (index) >> 2), (index) & 3)) + #endif +#endif + #endif
kvazaar-1.3.0.tar.gz/src/strategies/strategies-intra.c -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-intra.c
Changed
@@ -28,6 +28,7 @@ // Define function pointers. angular_pred_func *kvz_angular_pred; intra_pred_planar_func *kvz_intra_pred_planar; +intra_pred_filtered_dc_func *kvz_intra_pred_filtered_dc; int kvz_strategy_register_intra(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-1.3.0.tar.gz/src/strategies/strategies-intra.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-intra.h
Changed
@@ -43,9 +43,16 @@ const kvz_pixel *const ref_left, kvz_pixel *const dst); +typedef void (intra_pred_filtered_dc_func)( + const int_fast8_t log2_width, + const kvz_pixel *const ref_top, + const kvz_pixel *const ref_left, + kvz_pixel *const out_block); + // Declare function pointers. extern angular_pred_func * kvz_angular_pred; extern intra_pred_planar_func * kvz_intra_pred_planar; +extern intra_pred_filtered_dc_func * kvz_intra_pred_filtered_dc; int kvz_strategy_register_intra(void* opaque, uint8_t bitdepth); @@ -53,6 +60,7 @@ #define STRATEGIES_INTRA_EXPORTS \ {"angular_pred", (void**) &kvz_angular_pred}, \ {"intra_pred_planar", (void**) &kvz_intra_pred_planar}, \ + {"intra_pred_filtered_dc", (void**) &kvz_intra_pred_filtered_dc}, \
kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -67,6 +67,8 @@ ver_sad_func *kvz_ver_sad = 0; hor_sad_func *kvz_hor_sad = 0; +pixel_var_func *kvz_pixel_var = 0; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-picture.h
Changed
@@ -27,6 +27,7 @@ */ #include "global.h" // IWYU pragma: keep +#include "inter.h" #include "kvazaar.h" #include "encoderstate.h" #include "strategies/optimized_sad_func_ptr_t.h" @@ -121,21 +122,23 @@ uint32_t ref_stride, uint32_t left, uint32_t right); typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0, - const int hi_prec_luma_rec1, - const int hi_prec_chroma_rec0, - const int hi_prec_chroma_rec1, - int height, - int width, - int ypos, - int xpos, - const hi_prec_buf_t*high_precision_rec0, - const hi_prec_buf_t*high_precision_rec1, - lcu_t* lcu, - kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], - kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], - kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]); - - + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int height, + int width, + int ypos, + int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], + kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], + kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C], + bool predict_luma, + bool predict_chroma); + +typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len); // Declare function pointers. extern reg_sad_func * kvz_reg_sad; @@ -175,6 +178,8 @@ extern ver_sad_func *kvz_ver_sad; extern hor_sad_func *kvz_hor_sad; +extern pixel_var_func *kvz_pixel_var; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth); cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n); cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n); @@ -210,6 +215,7 @@ {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \ {"ver_sad", (void**) &kvz_ver_sad}, \ {"hor_sad", (void**) &kvz_hor_sad}, \ + {"pixel_var", (void**) &kvz_pixel_var}, \
kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -41,7 +41,8 @@ const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out); + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip); typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, int8_t type, int8_t block_type); typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp);
kvazaar-1.3.0.tar.gz/src/strategies/strategies-sao.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-sao.h
Changed
@@ -51,7 +51,7 @@ typedef int (sao_band_ddistortion_func)(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data, int block_width, int block_height, - int band_pos, int sao_bands[4]); + int band_pos, const int sao_bands[4]); // Declare function pointers. extern sao_edge_ddistortion_func * kvz_sao_edge_ddistortion;
kvazaar-1.3.0.tar.gz/src/strategyselector.c -> kvazaar-2.0.0.tar.gz/src/strategyselector.c
Changed
@@ -103,115 +103,115 @@ //We can free the structure now, as all strategies are statically set to pointers if (strategies.allocated) { - //Also check what optimizations are available and what are in use - //SIMD optimizations available - bool strategies_available = false; - fprintf(stderr, "Available: "); - if (kvz_g_strategies_available.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); - strategies_available = true; - } - if (kvz_g_strategies_available.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); - strategies_available = true; - } - if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); - strategies_available = true; - } - //If there is no strategies available - if (!strategies_available){ - fprintf(stderr, "no SIMD optimizations"); - } - fprintf(stderr, "\n"); - - //SIMD optimizations in use - bool strategies_in_use = false; - fprintf(stderr, "In use: "); - if (kvz_g_strategies_in_use.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); - strategies_in_use = true; - } - //If there is no strategies in use - if (!strategies_in_use){ - fprintf(stderr, "no SIMD optimizations"); - } - fprintf(stderr, "\n"); - - //Free memory - free(strategies.strategies); + //Also check what optimizations are available and what are in use + //SIMD optimizations available + bool strategies_available = false; + fprintf(stderr, "Available: "); + if (kvz_g_strategies_available.intel_flags.avx != 0){ + fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.avx2 != 0){ + fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.mmx != 0) { + fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse != 0) { + fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse2 != 0) { + fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse3 != 0) { + fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse41 != 0) { + fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse42 != 0) { + fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { + fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); + strategies_available = true; + } + if (kvz_g_strategies_available.arm_flags.neon != 0) { + fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); + strategies_available = true; + } + if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { + fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); + strategies_available = true; + } + //If there is no strategies available + if (!strategies_available){ + fprintf(stderr, "no SIMD optimizations"); + } + fprintf(stderr, "\n"); + + //SIMD optimizations in use + bool strategies_in_use = false; + fprintf(stderr, "In use: "); + if (kvz_g_strategies_in_use.intel_flags.avx != 0){ + fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ + fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { + fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse != 0) { + fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { + fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { + fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { + fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) { + fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42);
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/pthread.h -> kvazaar-2.0.0.tar.gz/src/threadwrapper/include/pthread.h
Changed
@@ -23,12 +23,14 @@ typedef void* pthread_cond_t;
 typedef void* pthread_cond_t;
 typedef void* pthread_mutex_t;
+typedef void* pthread_rwlock_t;
 typedef void* pthread_t;
 typedef void*(voidp_voidp_func)(void*);
 
 typedef void pthread_attr_t;
 typedef void pthread_condattr_t;
 typedef void pthread_mutexattr_t;
+typedef void pthread_rwlockattr_t;
 
 // Parameter names that have been commented away do nothing,
 // as they are always null when the functions are used in Kvazaar.
@@ -48,6 +50,12 @@ int pthread_mutex_lock(pthread_mutex_t* mutex);
 int pthread_mutex_unlock(pthread_mutex_t* mutex);
 
+int pthread_rwlock_init(pthread_rwlock_t* lock, const pthread_rwlockattr_t * /*attr*/);
+int pthread_rwlock_destroy(pthread_rwlock_t *rwlock);
+int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock);
+int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock);
+int pthread_rwlock_unlock(pthread_rwlock_t *rwlock);
+
 #ifdef __cplusplus
 }
 #endif
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/pthread.cpp -> kvazaar-2.0.0.tar.gz/src/threadwrapper/src/pthread.cpp
Changed
@@ -17,8 +17,13 @@
 #include "pthread.h"
 #include <condition_variable>
 #include <mutex>
+#include <shared_mutex>
 #include <thread>
 
+typedef struct {
+  std::shared_mutex *lock;
+  bool write_lock;
+} rw_lock_internal;
 
 int pthread_cond_broadcast(pthread_cond_t* cond)
 {
   static_cast<std::condition_variable*>(*cond)->notify_all();
@@ -86,3 +91,43 @@
   static_cast<std::mutex*>(*mutex)->unlock();
   return 0;
 }
+
+int pthread_rwlock_init(pthread_rwlock_t * lock, const pthread_rwlockattr_t *)
+{
+  *lock = new rw_lock_internal;
+  static_cast<rw_lock_internal*>(*lock)->lock = new std::shared_mutex;
+  static_cast<rw_lock_internal*>(*lock)->write_lock = false;
+  return 0;
+}
+
+int pthread_rwlock_destroy(pthread_rwlock_t* rwlock)
+{
+  delete static_cast<rw_lock_internal*>(*rwlock)->lock;
+  delete static_cast<rw_lock_internal*>(*rwlock);
+  return 0;
+}
+
+int pthread_rwlock_rdlock(pthread_rwlock_t* rwlock)
+{
+  static_cast<rw_lock_internal*>(*rwlock)->lock->lock_shared();
+  return 0;
+}
+
+int pthread_rwlock_wrlock(pthread_rwlock_t* rwlock)
+{
+  static_cast<rw_lock_internal*>(*rwlock)->lock->lock();
+  static_cast<rw_lock_internal*>(*rwlock)->write_lock = true;
+  return 0;
+}
+
+int pthread_rwlock_unlock(pthread_rwlock_t* rwlock)
+{
+  if (static_cast<rw_lock_internal*>(*rwlock)->write_lock) {
+    static_cast<rw_lock_internal*>(*rwlock)->write_lock = false;
+    static_cast<rw_lock_internal*>(*rwlock)->lock->unlock();
+  }
+  else {
+    static_cast<rw_lock_internal*>(*rwlock)->lock->unlock_shared();
+  }
+  return 0;
+}
kvazaar-1.3.0.tar.gz/src/transform.c -> kvazaar-2.0.0.tar.gz/src/transform.c
Changed
@@ -155,7 +155,8 @@
   int32_t j,k;
   for (j = 0; j < block_size; j++) {
     for(k = 0; k < block_size; k ++) {
-      coeff[j * block_size + k] = block[j * block_size + k] << shift;
+      // Casting back and forth to make UBSan not trigger due to left-shifting negatives
+      coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k]) << shift);
     }
   }
 }
@@ -246,14 +247,14 @@
   noskip.has_coeffs = kvz_quantize_residual(
       state, cur_cu, width, color, scan_order,
       0, in_stride, 4,
-      ref_in, pred_in, noskip.rec, noskip.coeff);
+      ref_in, pred_in, noskip.rec, noskip.coeff, false);
   noskip.cost = kvz_pixels_calc_ssd(ref_in, noskip.rec, in_stride, 4, 4);
   noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost;
 
   skip.has_coeffs = kvz_quantize_residual(
       state, cur_cu, width, color, scan_order,
       1, in_stride, 4,
-      ref_in, pred_in, skip.rec, skip.coeff);
+      ref_in, pred_in, skip.rec, skip.coeff, false);
   skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, 4, 4);
   skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * bit_cost;
 
@@ -277,6 +278,8 @@
 /**
  * Calculate the residual coefficients for a single TU.
+ *
+ * \param early_skip if this is used for early skip, bypass IT and IQ
  */
 static void quantize_tr_residual(encoder_state_t * const state,
                                  const color_t color,
@@ -284,7 +287,8 @@
                                  const int32_t y,
                                  const uint8_t depth,
                                  cu_info_t *cur_pu,
-                                 lcu_t* lcu)
+                                 lcu_t* lcu,
+                                 bool early_skip)
 {
   const kvz_config *cfg = &state->encoder_control->cfg;
   const int32_t shift = color == COLOR_Y ? 0 : 1;
@@ -397,7 +401,8 @@
                        ref,
                        pred,
                        pred,
-                       coeff);
+                       coeff,
+                       early_skip);
   }
 
   if (has_coeffs) {
@@ -411,9 +416,10 @@
  * kvantized residual. Processes the TU tree recursively.
  *
  * Inputs are:
- * - lcu->rec   pixels after prediction for the area
- * - lcu->ref   reference pixels for the area
- * - lcu->cu    for the area
+ * - lcu->rec   pixels after prediction for the area
+ * - lcu->ref   reference pixels for the area
+ * - lcu->cu    for the area
+ * - early_skip if this is used for early skip, bypass IT and IQ
  *
  * Outputs are:
  * - lcu->rec   reconstruction after quantized residual
@@ -428,7 +434,8 @@
                               const int32_t y,
                               const uint8_t depth,
                               cu_info_t *cur_pu,
-                              lcu_t* lcu)
+                              lcu_t* lcu,
+                              bool early_skip)
 {
   const int32_t width = LCU_WIDTH >> depth;
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
@@ -445,16 +452,27 @@
          width == 32 ||
          width == 64);
 
+  // Reset CBFs because CBFs might have been set
+  // for depth earlier
+  if (luma) {
+    cbf_clear(&cur_pu->cbf, depth, COLOR_Y);
+  }
+  if (chroma) {
+    cbf_clear(&cur_pu->cbf, depth, COLOR_U);
+    cbf_clear(&cur_pu->cbf, depth, COLOR_V);
+  }
+
   if (depth == 0 || cur_pu->tr_depth > depth) {
+    // Split transform and increase depth
     const int offset = width / 2;
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    kvz_quantize_lcu_residual(state, luma, chroma, x,  y,  depth + 1, NULL, lcu);
-    kvz_quantize_lcu_residual(state, luma, chroma, x2, y,  depth + 1, NULL, lcu);
-    kvz_quantize_lcu_residual(state, luma, chroma, x,  y2, depth + 1, NULL, lcu);
-    kvz_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu);
+    kvz_quantize_lcu_residual(state, luma, chroma, x,  y,  depth + 1, NULL, lcu, early_skip);
+    kvz_quantize_lcu_residual(state, luma, chroma, x2, y,  depth + 1, NULL, lcu, early_skip);
+    kvz_quantize_lcu_residual(state, luma, chroma, x,  y2, depth + 1, NULL, lcu, early_skip);
+    kvz_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu, early_skip);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
@@ -472,11 +490,11 @@
   } else {
     // Process a leaf TU.
     if (luma) {
-      quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu);
+      quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu, early_skip);
     }
 
     if (chroma) {
-      quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu);
-      quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu);
+      quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip);
+      quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip);
     }
   }
 }
kvazaar-1.3.0.tar.gz/src/transform.h -> kvazaar-2.0.0.tar.gz/src/transform.h
Changed
@@ -60,6 +60,7 @@
                                int32_t y,
                                uint8_t depth,
                                cu_info_t *cur_cu,
-                               lcu_t* lcu);
+                               lcu_t* lcu,
+                               bool early_skip);
 
 #endif
kvazaar-1.3.0.tar.gz/tests/Makefile.am -> kvazaar-2.0.0.tar.gz/tests/Makefile.am
Changed
@@ -11,7 +11,8 @@
 	test_slices.sh \
 	test_smp.sh \
 	test_tools.sh \
-	test_weird_shapes.sh
+	test_weird_shapes.sh \
+	test_pu_depth_constraints.sh
 
 EXTRA_DIST = \
 	test_external_symbols.sh \
@@ -26,6 +27,7 @@
 	test_smp.sh \
 	test_tools.sh \
 	test_weird_shapes.sh \
+	test_pu_depth_constraints.sh \
 	util.sh
 
 check_PROGRAMS = kvazaar_tests
kvazaar-1.3.0.tar.gz/tests/dct_tests.c -> kvazaar-2.0.0.tar.gz/tests/dct_tests.c
Changed
@@ -146,7 +146,7 @@
   if (strcmp(test_env.strategy->type, "fast_forward_dst_4x4") == 0) index = 0;
 
   int16_t *buf = dct_bufs[index];
-  int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 };
+  ALIGNED(32) int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 };
 
   test_env.tested_func(KVZ_BIT_DEPTH, buf, test_result);
 
@@ -163,7 +163,7 @@
   if (strcmp(test_env.strategy->type, "fast_inverse_dst_4x4") == 0) index = 0;
 
   int16_t *buf = dct_bufs[index];
-  int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 };
+  ALIGNED(32) int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 };
 
   test_env.tested_func(KVZ_BIT_DEPTH, buf, test_result);
kvazaar-1.3.0.tar.gz/tests/test_gop.sh -> kvazaar-2.0.0.tar.gz/tests/test_gop.sh
Changed
@@ -13,6 +13,10 @@
 valgrind_test 264x130 10 $common_args --gop=lp-g4d3t1 -p5 --owf=4
 valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=4 --no-open-gop
 valgrind_test 264x130 30 $common_args --gop=8 -p16 --owf=16
+
+valgrind_test 264x130 10 $common_args --gop=16 -p0 --owf=1
+valgrind_test 264x130 10 $common_args --gop=16 -p0 --owf=4
+valgrind_test 264x130 40 $common_args --gop=16 -p32 --owf=0
 # Do more extensive tests in a private gitlab CI runner
 if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 $common_args --gop=8 -p8 --owf=0 --no-open-gop; fi
 if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 40 $common_args --gop=8 -p32 --owf=4 --no-open-gop; fi
kvazaar-1.3.0.tar.gz/tests/test_interlace.sh -> kvazaar-2.0.0.tar.gz/tests/test_interlace.sh
Changed
@@ -3,4 +3,4 @@
 set -eu
 . "${0%/*}/util.sh"
 
-valgrind_test 264x130 10 --source-scan-type=tff -p0 --preset=ultrafast --threads=2 --owf=1 --wpp
+valgrind_test 264x130 10 --source-scan-type=tff -p0 --preset=ultrafast --threads=2 --owf=1 --wpp --gop=0
kvazaar-2.0.0.tar.gz/tests/test_pu_depth_constraints.sh
Added
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# Test pu depth constraints.
+
+set -eu
+. "${0%/*}/util.sh"
+
+common_args='264x130 8 --preset=ultrafast --gop=8'
+
+# Default
+valgrind_test $common_args
+valgrind_test $common_args --pu-depth-inter=1-3
+valgrind_test $common_args --pu-depth-intra=1-3
+valgrind_test $common_args --pu-depth-inter=1-3,2-3
+valgrind_test $common_args --pu-depth-intra=1-3,2-3
+valgrind_test $common_args --pu-depth-inter=,1-3,,,2-3,2-2
+valgrind_test $common_args --pu-depth-intra=,1-3,,,2-3,2-2
+
+# Test invalid input
+encode_test 264x130 1 1 --pu-depth-intra=1-2,,1-3,1-3,,,1-1
+encode_test 264x130 1 1 --pu-depth-inter=1-2,,1-3,1-3,,,1-1
+
kvazaar-1.3.0.tar.gz/tests/test_rate_control.sh -> kvazaar-2.0.0.tar.gz/tests/test_rate_control.sh
Changed
@@ -5,3 +5,8 @@
 valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3
 
 if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 --bitrate=100000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=2 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop 8 --rc-algorithm oba --no-intra-bits --no-clip-neighbour; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop 8 --rc-algorithm oba --intra-bits --clip-neighbour; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop lp-g8d4t1 --rc-algorithm oba --no-intra-bits --no-clip-neighbour; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop lp-g8d4t1 --rc-algorithm oba --intra-bits --clip-neighbour; fi
+
kvazaar-1.3.0.tar.gz/tests/test_tools.sh -> kvazaar-2.0.0.tar.gz/tests/test_tools.sh
Changed
@@ -10,3 +10,6 @@
 valgrind_test $common_args --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3
 valgrind_test $common_args --no-rdoq --no-signhide --subme=0
 valgrind_test $common_args --rdoq --no-deblock --no-sao --subme=0
+valgrind_test $common_args --vaq=8
+valgrind_test $common_args --vaq=8 --bitrate 3500
+valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 3500
kvazaar-2.0.0.tar.gz/tests/tsan_suppressions.txt
Added
@@ -0,0 +1,3 @@
+race:kvz_eight_tap_filter_hor_8x1_avx2
+race:kvz_filter_hpel_blocks_hor_ver_luma_avx2
+race:kvz_eight_tap_filter_hor_avx2
\ No newline at end of file
kvazaar-1.3.0.tar.gz/tests/util.sh -> kvazaar-2.0.0.tar.gz/tests/util.sh
Changed
@@ -45,7 +45,7 @@ # No quotes for $valgrind because it expands to multiple (or zero) # arguments. print_and_run \ - libtool execute $valgrind \ + ../libtool execute $valgrind \ ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@" print_and_run \ @@ -66,7 +66,7 @@ set +e print_and_run \ - libtool execute \ + ../libtool execute \ ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@" actual_status="$?" set -e