Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 9

kvazaar.changes Changed

@@ -1,4 +1,108 @@
 -------------------------------------------------------------------
+Tue Oct  4 07:43:42 UTC 2016 - aloisio@gmx.com
+
+- Update to version 1.0.0
+  New Features
+  * --version
+  * --help
+  * --loop-input
+  * --mv-constraint to constrain motion vectors
+  * --tiles=2x2 as an alternative syntax for uniform tiles
+  * --hash=md5
+  * Print information about what SIMD optimizations are in 
+    use
+  * --mv=full8 --mv=full16 --mv=full32 --mv=full64
+  * --cu-split-termination=zero/off
+  * --crypto for selective encryption of bitstream (for 
+    OpenHEVC)
+  * --me-early-termination=sensitive/on/off for early 
+    termination of motion vector search
+  * Added 4x8 SMP and 4x12 AMP motion partitions
+  * --subme=0/1/2/3/4 for control over complexity of 
+    fractional pixel motion prediction
+  * --lossless for lossless coding
+  * Monochrome coding
+  * --input-format=420/400
+  * --input-bitdepth=8/10
+  * --tmpv for temporal motion vector predictor
+  * --rdoq-skip for not using rdoq for situations where it's 
+    unlikely to improve BDRate
+  * Modified --gop=lp-g4d3r1t1 syntax to not take the 
+    reference frames as a parameter, it's now --gop=lp-g4d3t1.
+  * Enable WPP and multithreading by default, with detection 
+    for number of cores
+  * Update all presets to rate-distortion-complexity 
+    optimized versions. These are based on a search of all 
+    (~ish) possible encoding parameters and bring a huge boost 
+    to both speed and BDRate when encoding with the presets (10x 
+    speed for veryslow, ~1.1x-4x for others, up to 30% improved 
+    BDRate for some presets).
+  * Set default options to match medium with intra period of 
+    64, QP 22 and --gop=lp-g4d3t1
+  * --implicit-rdpcm RExt feature
+  Optimizations
+  * AVX2 version for Sample Adaptive Offset (SAO)
+  * Optimized memory copying
+  * AVX2 versions of filters for fractional pixel motion 
+    estimation
+  * AVX2 version for half pixel chroma sampling for SMP/AMP
+  * AVX2 versions for calculating two or four SATD values at 
+    once for small blocks
+  * Rewrote AVX2 version of fractional pixel motion 
+    compensation
+  * Rewrote motion vector cost calculation. It only got 
+    slightly faster, but BDRate improved a bunch due to the new 
+    implementation being more correct.
+  * Made AVX2 SAD use SSE4.1 for cases where there isn't an 
+    AVX2 implementation, speeding up SMP/AMP.
+  Bugfixes
+  * Fixed a bug in rate control where an int overflowed 
+    after coding 2^31 bits (2Gb)
+  * Fixed non-determinism in tiles
+  * Fixed chroma reconstruction bug in tiles
+  * Fixed a bug with calculating the number of bits used for 
+    intra mode on 4x4 CUs
+  * Stopped checking zero motion vector multiple times in 
+    motion compensation
+  * Fixed possible segfault in motion compensation
+  * Fixed a race condition with OWF and SMP/AMP
+  * Gave pthread_cond_timedwait time in correctly, such that 
+    main thread now sleeps instead of busylooping when it has 
+    nothing to do
+  * Fixed rate control with lp-gop
+  * Fixed full search not taking temporal motion vector into 
+    account
+  * Allow non-gop-length intra period for lp-gop
+  Code / Building / Testing
+  * Moved SAO to its own file
+  * Removed a ton of unnecessary includes
+  * Updated autotools ax_pthread
+  * Added build test for OS-X for Travis
+  * Made tests check for bitstream correctness
+  * Refactored some of the copypasta in motion vector search 
+    starting point selection
+  * Refactored the cu_info_t datastructures to hold 
+    information at a 4x4 resolution needed for AMP and SMP
+  * Changed cu_info_t to use bitfields to negate the effect 
+    of increasing the cu_info_t array by a factor of 4
+  * Moved bitstream generation from encoderstate.c to 
+    encode_coding_tree.c
+  * Renamed encoder_state_t.global to frame, which makes 
+    sense since it holds frame level data, not global data
+  * Rewrote integer vector inter prediction, because it was 
+    so bad
+  * Refactored init_lcu_t
+  * Added more tests for inter SAD
+  * Added speed tests for dual intra SAD functions
+  * Added more realistic speed tests for inter SAD
+  Other
+  * Added a manpage
+  * Added scripts for updating manpage and README based on 
+    --usage.
+  * Added a Dockerfile. Just because.
+  * Added commit date to --version
+
+-------------------------------------------------------------------
 Thu Jan 28 20:07:47 UTC 2016 - aloisio@gmx.com
 
 - Update to version 0.8.3

kvazaar.spec Changed

kvazaar-0.8.3.tar.gz/.travis-install.sh -> kvazaar-1.0.0.tar.gz/.travis-install.sh Changed

kvazaar-0.8.3.tar.gz/.travis-script.sh -> kvazaar-1.0.0.tar.gz/.travis-script.sh Changed

kvazaar-0.8.3.tar.gz/.travis.yml -> kvazaar-1.0.0.tar.gz/.travis.yml Changed

@@ -2,25 +2,81 @@
 
 env:
   global:
-  - KVZ_DISABLE_AVX2=1
   - TEST_DIM=264x130
   - TEST_FRAMES=10
 
+# Use container based infrastructure
 sudo: false
 
+# Use this as the global requirements list for valgrind tests, because those are the most numerous.
+addons:
+  apt:
+    sources:
+    - ubuntu-toolchain-r-test
+    packages:
+    - autoconf
+    - libtool
+    - p7zip-full  # to uncompress our own ffmpeg binary
+    - valgrind
+    - yasm
+
 matrix:
   fast_finish: true
+  allow_failures:
+  - os: osx  # Don't know what's wrong. Something changed in the environment.
   
   include:
     - compiler: clang
+      addons:
+        apt:
+          sources:
+          - ubuntu-toolchain-r-test
+          packages:
+          - autoconf
+          - libtool
+          - yasm
+    
     - compiler: gcc-4.8
+      addons:
+        apt:
+          sources:
+          - ubuntu-toolchain-r-test
+          packages:
+          - autoconf
+          - gcc-4.8
+          - libtool
+          - yasm
+
+    # We have some Mac specific code and Mac sometimes has odd build issues.
+    - os: osx
+      compiler: clang  # gcc is actually clang on Travis OS X
 
     # Check for external symbols without kvz_ prefix.
     - compiler: gcc-4.8
       script:
         - ./autogen.sh
         - ./configure && make
-        - (! nm -go --defined-only src/.libs/libkvazaar.a | grep -v ' kvz_')
+        - (! nm -go --defined-only src/.libs/libkvazaar.a | grep -v ' kvz_') || (echo 'ERROR Only symbols prefixed with kvz_ should be exported from libkvazaar.'; false)
+      addons:
+        apt:
+          sources:
+          - ubuntu-toolchain-r-test
+          packages:
+          - autoconf
+          - gcc-4.8
+          - libtool
+          - yasm
+
+    # Tests trying to use invalid input dimensions
+    - env: EXPECTED_STATUS=1 PARAMS="-i src/kvazaar --input-res=1x65 -o /dev/null"
+      addons:
+        apt:
+          sources:
+          - ubuntu-toolchain-r-test
+          packages:
+          - autoconf
+          - libtool
+          - yasm
 
     # These valgrind tests are slow, so they are performed with the minimum
     # number of small frames and fast settings.
@@ -35,8 +91,8 @@
     - env: VALGRIND_TEST="-p4 -r1 --owf=0 --threads=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
+    - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
+    - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     
     # Tests for rdoq, sao, deblock and signhide and subme.
     - env: VALGRIND_TEST="-p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3"
@@ -60,24 +116,6 @@
     - env: TEST_FRAMES=10 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=4 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     - env: TEST_FRAMES=20 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
 
-    # Tests trying to use invalid input dimensions
-    - env: EXPECTED_STATUS=1 PARAMS="-i src/kvazaar --input-res=1x65 -o /dev/null"
-
-addons:
-  apt:
-    sources:
-    - ubuntu-toolchain-r-test
-
-    packages:
-    - autoconf
-    - gcc
-    - gcc-4.8
-    - libtool
-    - nasm
-    - p7zip-full
-    - valgrind
-    - yasm
-
 install:
   - source .travis-install.sh

kvazaar-1.0.0.tar.gz/Dockerfile Added

@@ -0,0 +1,42 @@
+# A simple Dockerfile for building Kvazaar from the git repository
+# Example build command when in this directory: docker build -t kvazaar .
+#
+# Example usage
+# Run with an input YUV file and output HEVC binary file
+#     docker run -i -a STDIN -a STDOUT kvazaar -i - --input-res=320x240 -o - < testfile_320x240.yuv > out.265
+#
+# Use libav or ffmpeg to input (almost) any format and convert it to YUV420 for kvazaar, audio is disabled
+#
+#     RESOLUTION=`avconv -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
+#     avconv -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
+#  or 
+#     RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
+#     ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
+#
+
+# Use Ubuntu 15.10 as a base for now, it's around 136MB
+FROM ubuntu:15.10
+
+MAINTAINER Marko Viitanen <fador@iki.fi>
+
+    # List of needed packages to be able to build kvazaar with autotools
+    ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf
+    
+    # Run all the commands in one RUN so we don't have any extra history
+    # data in the image.
+    RUN apt-get update \
+    && apt-get install -y $REQUIRED_PACKAGES \
+    && apt-get clean \
+    && git clone --depth=1 git://github.com/ultravideo/kvazaar.git; \
+        cd kvazaar; \
+        ./autogen.sh; \
+        ./configure --disable-shared;\
+        make;\
+        make install; \
+    AUTOINSTALLED_PACKAGES=`apt-mark showauto`; \
+    apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES; \
+        apt-get clean autoclean; \
+        apt-get autoremove -y; \
+        rm -rf /var/lib/{apt,dpkg,cache,log}/
+ENTRYPOINT ["kvazaar"]
+CMD ["--help"]

kvazaar-0.8.3.tar.gz/Makefile.am -> kvazaar-1.0.0.tar.gz/Makefile.am Changed

kvazaar-0.8.3.tar.gz/README.md -> kvazaar-1.0.0.tar.gz/README.md Changed

@@ -1,5 +1,5 @@
-Kvazaar {#mainpage}
-=========
+Kvazaar
+=======
 An open-source HEVC encoder licensed under LGPLv2.1
 
 Join channel #kvazaar_hevc in Freenode IRC network to contact us.
@@ -11,131 +11,157 @@
 
 [![Build Status](https://travis-ci.org/ultravideo/kvazaar.svg?branch=master)](https://travis-ci.org/ultravideo/kvazaar)
 
-##Using Kvazaar
-
-    Usage:
-    kvazaar -i <input> --input-res <width>x<height> -o <output>
-
-    Optional parameters:
-          -n, --frames <integer>     : Number of frames to code [all]
-          --seek <integer>           : First frame to code [0]
-          --input-res <int>x<int>    : Input resolution (width x height) or
-                      auto           : try to detect from file name [auto]
-          --input-fps <num>/<denom>  : Framerate of the input video [25.0]
-          -q, --qp <integer>         : Quantization Parameter [32]
-          -p, --period <integer>     : Period of intra pictures [0]
-                                         0: only first picture is intra
-                                         1: all pictures are intra
-                                         2-N: every Nth picture is intra
-              --vps-period <integer> : Specify how often the video parameter set is
-                                       re-sent. [0]
-                                         0: only send VPS with the first frame
-                                         1: send VPS with every intra frame
-                                         N: send VPS with every Nth intra frame
-          -r, --ref <integer>        : Reference frames, range 1..15 [3]
-              --no-deblock           : Disable deblocking filter
-              --deblock <beta:tc>    : Deblocking filter parameters
-                                       beta and tc range is -6..6 [0:0]
-              --no-sao               : Disable sample adaptive offset
-              --no-rdoq              : Disable RDO quantization
-              --no-signhide          : Disable sign hiding in quantization
-              --smp                  : Enable Symmetric Motion Partition
-              --amp                  : Enable Asymmetric Motion Partition
-              --rd <integer>         : Rate-Distortion Optimization level [1]
-                                         0: no RDO
-                                         1: estimated RDO
-                                         2: full RDO
-              --mv-rdo               : Enable Rate-Distortion Optimized motion vector costs
-              --full-intra-search    : Try all intra modes.
-              --no-transform-skip    : Disable transform skip
-              --aud                  : Use access unit delimiters
-              --cqmfile <string>     : Custom Quantization Matrices from a file
-              --debug <string>       : Output encoders reconstruction.
-              --cpuid <integer>      : Disable runtime cpu optimizations with value 0.
-              --me <string>          : Set integer motion estimation algorithm ["hexbs"]
-                                         "hexbs": Hexagon Based Search (faster)
-                                         "tz":    Test Zone Search (better quality)
-                                         "full":  Full Search (super slow)
-              --subme <integer>      : Set fractional pixel motion estimation level [1].
-                                         0: only integer motion estimation
-                                         1: fractional pixel motion estimation enabled
-              --source-scan-type <string> : Set source scan type ["progressive"].
-                                              "progressive": progressive scan
-                                              "tff": top field first
-                                              "bff": bottom field first
-              --pu-depth-inter <int>-<int> : Range for sizes of inter prediction units to try.
-                                         0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8
-              --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.
-                                         0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4
-              --no-info              : Don't add information about the encoder to settings.
-              --gop <string>         : Definition for GOP [0]
-                                         - 0 disabled
-                                         - 8 B-frame pyramid of length 8
-                                         - lp-gop syntax, defined below (example: g8d4r3t2)
-              --bipred               : Enable bi-prediction search
-              --bitrate <integer>    : Target bitrate. [0]
-                                         0: disable rate-control
-                                         N: target N bits per second
-              --preset <string>      : Use preset. This will override previous options.
-                                         ultrafast, superfast,veryfast, faster,
-                                         fast, medium, slow, slower, veryslow, placebo
-              --no-psnr              : Don't calculate PSNR for frames
-
-      Video Usability Information:
-              --sar <width:height>   : Specify Sample Aspect Ratio
-              --overscan <string>    : Specify crop overscan setting ["undef"]
-                                         - undef, show, crop
-              --videoformat <string> : Specify video format ["undef"]
-                                         - component, pal, ntsc, secam, mac, undef
-              --range <string>       : Specify color range ["tv"]
-                                         - tv, pc
-              --colorprim <string>   : Specify color primaries ["undef"]
-                                         - undef, bt709, bt470m, bt470bg,
-                                           smpte170m, smpte240m, film, bt2020
-              --transfer <string>    : Specify transfer characteristics ["undef"]
-                                         - undef, bt709, bt470m, bt470bg,
-                                           smpte170m, smpte240m, linear, log100,
-                                           log316, iec61966-2-4, bt1361e,
-                                           iec61966-2-1, bt2020-10, bt2020-12
-              --colormatrix <string> : Specify color matrix setting ["undef"]
-                                         - undef, bt709, fcc, bt470bg, smpte170m,
-                                           smpte240m, GBR, YCgCo, bt2020nc, bt2020c
-              --chromaloc <integer>  : Specify chroma sample location (0 to 5) [0]
-
-      Parallel processing:
-              --threads <integer>    : Maximum number of threads to use.
-                                       Disable threads if set to 0.
-
-      Tiles:
-              --tiles-width-split <string>|u<int> :
-                                       Specifies a comma separated list of pixel
-                                       positions of tiles columns separation coordinates.
-                                       Can also be u followed by and a single int n,
-                                       in which case it produces columns of uniform width.
-              --tiles-height-split <string>|u<int> :
-                                       Specifies a comma separated list of pixel
-                                       positions of tiles rows separation coordinates.
-                                       Can also be u followed by and a single int n,
-                                       in which case it produces rows of uniform height.
-
-      Wpp:
-              --wpp                  : Enable wavefront parallel processing
-              --owf <integer>|auto   : Number of parallel frames to process. 0 to disable.
-
-      Slices:
-              --slice-addresses <string>|u<int>:
-                                       Specifies a comma separated list of LCU
-                                       positions in tile scan order of tile separations.
-                                       Can also be u followed by and a single int n,
-                                       in which case it produces uniform slice length.
-
-      Deprecated parameters: (might be removed at some point)
-         Use --input-res:
-           -w, --width               : Width of input in pixels
-           -h, --height              : Height of input in pixels
-
-
-###For example:
+## Using Kvazaar
+
+[comment]: # (BEGIN KVAZAAR HELP MESSAGE)
+```
+Usage:
+kvazaar -i <input> --input-res <width>x<height> -o <output>
+
+Optional parameters:
+      --help                     : Print this help message and exit
+      --version                  : Print version information and exit
+      -n, --frames <integer>     : Number of frames to code [all]
+      --seek <integer>           : First frame to code [0]
+      --input-res <int>x<int>    : Input resolution (width x height) or
+                  auto           : try to detect from file name [auto]
+      --input-fps <num>/<denom>  : Framerate of the input video [25.0]
+      -q, --qp <integer>         : Quantization Parameter [32]
+      -p, --period <integer>     : Period of intra pictures [0]
+                                     0: only first picture is intra
+                                     1: all pictures are intra
+                                     2-N: every Nth picture is intra
+          --vps-period <integer> : Specify how often the video parameter set is
+                                   re-sent. [0]
+                                     0: only send VPS with the first frame
+                                     1: send VPS with every intra frame
+                                     N: send VPS with every Nth intra frame
+      -r, --ref <integer>        : Reference frames, range 1..15 [3]
+          --no-deblock           : Disable deblocking filter
+          --deblock <beta:tc>    : Deblocking filter parameters
+                                   beta and tc range is -6..6 [0:0]
+          --no-sao               : Disable sample adaptive offset
+          --no-rdoq              : Disable RDO quantization
+          --no-signhide          : Disable sign hiding in quantization
+          --smp                  : Enable Symmetric Motion Partition
+          --amp                  : Enable Asymmetric Motion Partition
+          --rd <integer>         : Rate-Distortion Optimization level [1]
+                                     0: no RDO
+                                     1: estimated RDO
+                                     2: full RDO
+          --mv-rdo               : Enable Rate-Distortion Optimized motion vector costs
+          --full-intra-search    : Try all intra modes.
+          --no-transform-skip    : Disable transform skip
+          --aud                  : Use access unit delimiters
+          --cqmfile <string>     : Custom Quantization Matrices from a file
+          --debug <string>       : Output encoders reconstruction.
+          --cpuid <integer>      : Disable runtime cpu optimizations with value 0.
+          --me <string>          : Set integer motion estimation algorithm ["hexbs"]
+                                     "hexbs": Hexagon Based Search (faster)
+                                     "tz":    Test Zone Search (better quality)
+                                     "full":  Full Search (super slow)
+          --subme <integer>      : Set fractional pixel motion estimation level [4].
+                                     0: only integer motion estimation
+                                     1: + 1/2-pixel horizontal and vertical
+                                     2: + 1/2-pixel diagonal
+                                     3: + 1/4-pixel horizontal and vertical
+                                     4: + 1/4-pixel diagonal
+          --source-scan-type <string> : Set source scan type ["progressive"].
+                                     "progressive": progressive scan
+                                     "tff": top field first
+                                     "bff": bottom field first
+          --pu-depth-inter <int>-<int> : Range for sizes of inter prediction units to try.
+                                     0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8
+          --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.
+                                     0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4

kvazaar-0.8.3.tar.gz/build/C_Properties.props -> kvazaar-1.0.0.tar.gz/build/C_Properties.props Changed

kvazaar-0.8.3.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-1.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj Changed

@@ -125,6 +125,8 @@
     </ClCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\src\extras\crypto.cpp" />
+    <ClCompile Include="..\..\src\extras\libmd5.c" />
     <ClCompile Include="..\..\src\input_frame_buffer.c" />
     <ClCompile Include="..\..\src\kvazaar.c" />
     <ClCompile Include="..\..\src\bitstream.c" />
@@ -137,6 +139,7 @@
     <ClCompile Include="..\..\src\encoder_state-bitstream.c" />
     <ClCompile Include="..\..\src\encoder_state-ctors_dtors.c" />
     <ClCompile Include="..\..\src\encoder_state-geometry.c" />
+    <ClCompile Include="..\..\src\encode_coding_tree.c" />
     <ClCompile Include="..\..\src\extras\getopt.c" />
     <ClCompile Include="..\..\src\filter.c" />
     <ClCompile Include="..\..\src\image.c" />
@@ -163,12 +166,21 @@
       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
     </ClCompile>
+    <ClCompile Include="..\..\src\strategies\avx2\sao-avx2.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
     <ClCompile Include="..\..\src\strategies\generic\intra-generic.c" />
     <ClCompile Include="..\..\src\strategies\generic\quant-generic.c" />
+    <ClCompile Include="..\..\src\strategies\generic\sao-generic.c" />
     <ClCompile Include="..\..\src\strategies\strategies-intra.c" />
     <ClCompile Include="..\..\src\strategies\strategies-quant.c" />
     <ClInclude Include="..\..\src\checkpoint.h" />
     <ClInclude Include="..\..\src\cu.h" />
+    <ClInclude Include="..\..\src\extras\crypto.h" />
+    <ClInclude Include="..\..\src\extras\libmd5.h" />
     <ClInclude Include="..\..\src\image.h" />
     <ClInclude Include="..\..\src\imagelist.h" />
     <ClCompile Include="..\..\src\strategies\altivec\picture-altivec.c" />
@@ -200,21 +212,26 @@
     <ClCompile Include="..\..\src\strategies\strategies-ipol.c" />
     <ClCompile Include="..\..\src\strategies\strategies-nal.c" />
     <ClCompile Include="..\..\src\strategies\strategies-picture.c" />
+    <ClCompile Include="..\..\src\strategies\strategies-sao.c" />
     <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" />
     <ClCompile Include="..\..\src\videoframe.c" />
     <ClInclude Include="..\..\src\encoder_state-bitstream.h" />
     <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h" />
     <ClInclude Include="..\..\src\encoder_state-geometry.h" />
+    <ClInclude Include="..\..\src\encode_coding_tree.h" />
     <ClCompile Include="..\..\src\strategyselector.c" />
     <ClCompile Include="..\..\src\tables.c" />
     <ClCompile Include="..\..\src\threadqueue.c" />
     <ClCompile Include="..\..\src\transform.c" />
     <ClInclude Include="..\..\src\input_frame_buffer.h" />
     <ClInclude Include="..\..\src\kvazaar_internal.h" />
+    <ClInclude Include="..\..\src\kvz_math.h" />
     <ClInclude Include="..\..\src\search_inter.h" />
     <ClInclude Include="..\..\src\search_intra.h" />
     <ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" />
+    <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" />
     <ClInclude Include="..\..\src\strategies\generic\intra-generic.h" />
+    <ClInclude Include="..\..\src\strategies\generic\sao-generic.h" />
     <ClInclude Include="..\..\src\strategies\strategies-common.h" />
     <ClInclude Include="..\..\src\strategies\avx2\quant-avx2.h" />
     <ClInclude Include="..\..\src\strategies\generic\quant-generic.h" />
@@ -254,6 +271,7 @@
     <ClInclude Include="..\..\src\strategies\strategies-ipol.h" />
     <ClInclude Include="..\..\src\strategies\strategies-nal.h" />
     <ClInclude Include="..\..\src\strategies\strategies-picture.h" />
+    <ClInclude Include="..\..\src\strategies\strategies-sao.h" />
     <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.h" />
     <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.h" />
     <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm.h" />

kvazaar-0.8.3.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-1.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters Changed

@@ -207,6 +207,20 @@
     <ClCompile Include="..\..\src\encoder_state-bitstream.c">
       <Filter>Bitstream</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\encode_coding_tree.c">
+      <Filter>Bitstream</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\strategies\strategies-sao.c">
+      <Filter>Optimization\strategies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\strategies\generic\sao-generic.c">
+      <Filter>Optimization\strategies\generic</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\strategies\avx2\sao-avx2.c">
+      <Filter>Optimization\strategies\avx2</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\extras\libmd5.c" />
+    <ClCompile Include="..\..\src\extras\crypto.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\bitstream.h">
@@ -382,6 +396,21 @@
     <ClInclude Include="..\..\src\encoder_state-bitstream.h">
       <Filter>Bitstream</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\encode_coding_tree.h">
+      <Filter>Bitstream</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\kvz_math.h" />
+    <ClInclude Include="..\..\src\strategies\strategies-sao.h">
+      <Filter>Optimization\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\generic\sao-generic.h">
+      <Filter>Optimization\strategies\generic</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h">
+      <Filter>Optimization\strategies\avx2</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\extras\libmd5.h" />
+    <ClInclude Include="..\..\src\extras\crypto.h" />
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\extras\x86inc.asm">

kvazaar-0.8.3.tar.gz/configure.ac -> kvazaar-1.0.0.tar.gz/configure.ac Changed

@@ -23,10 +23,10 @@
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=3
-ver_minor=2
+ver_minor=13
 ver_release=0
 
-# not used, but it prevents configure from adding a lot of defines to the CFLAGS
+# Prevents configure from adding a lot of defines to the CFLAGS
 AC_CONFIG_HEADERS([config.h])
 
 AC_CONFIG_MACRO_DIR([m4])
@@ -38,40 +38,59 @@
 AC_PROG_CC
 AC_PROG_CC_C99
 AM_PROG_AR
+AC_PROG_CXX
 
 # Get fread that can read more than 2GB on 32 bit systems.
 AC_SYS_LARGEFILE
 
 LT_INIT([win32-dll])
 
-AX_CHECK_COMPILE_FLAG(-mavx2,   [flag_avx2="true"])
-AX_CHECK_COMPILE_FLAG(-msse4.1, [flag_sse4_1="true"])
-AX_CHECK_COMPILE_FLAG(-msse2,   [flag_sse2="true"])
+AX_CHECK_COMPILE_FLAG([-mavx2],   [flag_avx2="true"])
+AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"])
+AX_CHECK_COMPILE_FLAG([-msse2],   [flag_sse2="true"])
 
 AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"])
 AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"])
 AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"])
 
-AX_PTHREAD
-CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -ftree-vectorize -fvisibility=hidden $PTHREAD_CFLAGS $CFLAGS"
+KVZ_CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden"
+CFLAGS="$KVZ_CFLAGS $CFLAGS"
+
+AC_ARG_WITH([cryptopp],
+    AS_HELP_STRING([--with-cryptopp],
+        [Build with cryptopp. Enables selective encryption.]))
+AS_IF([test "x$with_cryptopp" = "xyes"], [
+    PKG_CHECK_MODULES([cryptopp], [cryptopp],
+        [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])],
+        [AC_MSG_ERROR([cryptopp not found with pkg-config])]
+    )
+])
+
+AM_CONDITIONAL([USE_CRYPTOPP], [test "x$with_cryptopp" = "xyes"])
+CPPFLAGS="$CPPFLAGS $cryptopp_CFLAGS"
+LIBS="$LIBS $cryptopp_LIBS"
+
+
 CPPFLAGS="-DKVZ_DLL_EXPORTS $CPPFLAGS"
 
 AC_SEARCH_LIBS([log], [m c], [], [exit 1])
 AC_SEARCH_LIBS([pow], [m c], [], [exit 1])
 AC_SEARCH_LIBS([sqrt], [m c], [], [exit 1])
-LIBS="$PTHREAD_LIBS $LIBS"
 
 
+
+# This does workarounds for pthreads on various compilers.
+AX_PTHREAD
+CFLAGS="$PTHREAD_CFLAGS $CFLAGS"
+LIBS="$PTHREAD_LIBS $LIBS"
+CC="$PTHREAD_CC"
+
 # --disable-werror
 AC_ARG_ENABLE([werror], [AS_HELP_STRING([--disable-werror], [don't treat warnings as errors [no]])],
               [], [CFLAGS="-Werror $CFLAGS"]
 )
 
 
-# check for getopt
-AC_CHECK_HEADER([getopt.h], [], [CFLAGS="$CFLAGS -I$srcdir/src/extras"])
-
-
 # host and cpu specific settings
 AS_CASE([$host_cpu],
         [i?86], [BITS="32" ASFLAGS="$ASFLAGS -DARCH_X86_64=0" X86="true"],
@@ -129,10 +148,8 @@
 KVZ_API_VERSION="$ver_major:$ver_minor:$ver_release"
 AC_SUBST([KVZ_API_VERSION])
 
-
 AC_CONFIG_FILES([Makefile
                  src/Makefile
                  src/kvazaar.pc
                  tests/Makefile])
 AC_OUTPUT
-

kvazaar-1.0.0.tar.gz/doc/kvazaar.1 Added

@@ -0,0 +1,262 @@
+.TH KVAZAAR "1" "October 2016" "kvazaar v0.8.3" "User Commands"
+.SH NAME
+kvazaar \- open source HEVC encoder
+.SH SYNOPSIS
+\fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output>
+.SH DESCRIPTION
+.TP
+\fB\-\-help                    
+Print this help message and exit
+.TP
+\fB\-\-version                 
+Print version information and exit
+.TP
+\fB\-n\fR, \fB\-\-frames <integer>    
+Number of frames to code [all]
+.TP
+\fB\-\-seek <integer>          
+First frame to code [0]
+.TP
+\fB\-\-input\-res <int>x<int>   
+Input resolution (width x height) or
+"auto": try to detect the resolution
+from the file name [auto]
+.TP
+\fB\-\-input\-fps <num>/<denom> 
+Framerate of the input video [25.0]
+.TP
+\fB\-q\fR, \fB\-\-qp <integer>        
+Quantization Parameter [32]
+.TP
+\fB\-p\fR, \fB\-\-period <integer>    
+Period of intra pictures [0]
+  0: only first picture is intra
+  1: all pictures are intra
+  2\-N: every Nth picture is intra
+.TP
+\fB\-\-vps\-period <integer>
+Specify how often the video parameter set is
+re\-sent. [0]
+  0: only send VPS with the first frame
+  1: send VPS with every intra frame
+  N: send VPS with every Nth intra frame
+.TP
+\fB\-r\fR, \fB\-\-ref <integer>       
+Reference frames, range 1..15 [3]
+.TP
+\fB\-\-no\-deblock          
+Disable deblocking filter
+.TP
+\fB\-\-deblock <beta:tc>   
+Deblocking filter parameters
+beta and tc range is \-6..6 [0:0]
+.TP
+\fB\-\-no\-sao              
+Disable sample adaptive offset
+.TP
+\fB\-\-no\-rdoq             
+Disable RDO quantization
+.TP
+\fB\-\-no\-signhide         
+Disable sign hiding in quantization
+.TP
+\fB\-\-smp                 
+Enable Symmetric Motion Partition
+.TP
+\fB\-\-amp                 
+Enable Asymmetric Motion Partition
+.TP
+\fB\-\-rd <integer>        
+Rate\-Distortion Optimization level [1]
+  0: no RDO
+  1: estimated RDO
+  2: full RDO
+.TP
+\fB\-\-mv\-rdo              
+Enable Rate\-Distortion Optimized motion vector costs
+.TP
+\fB\-\-full\-intra\-search   
+Try all intra modes.
+.TP
+\fB\-\-no\-transform\-skip   
+Disable transform skip
+.TP
+\fB\-\-aud                 
+Use access unit delimiters
+.TP
+\fB\-\-cqmfile <string>    
+Custom Quantization Matrices from a file
+.TP
+\fB\-\-debug <string>      
+Output encoder's reconstruction.
+.TP
+\fB\-\-cpuid <integer>     
+Disable runtime cpu optimizations with value 0.
+.TP
+\fB\-\-me <string>         
+Set integer motion estimation algorithm ["hexbs"]
+  "hexbs": Hexagon Based Search (faster)
+  "tz":    Test Zone Search (better quality)
+  "full":  Full Search (super slow)
+.TP
+\fB\-\-subme <integer>     
+Set fractional pixel motion estimation level [4].
+  0: only integer motion estimation
+  1: + 1/2\-pixel horizontal and vertical
+  2: + 1/2\-pixel diagonal
+  3: + 1/4\-pixel horizontal and vertical
+  4: + 1/4\-pixel diagonal
+.TP
+\fB\-\-source\-scan\-type <string>
+Set source scan type ["progressive"].
+  "progressive": progressive scan
+  "tff": top field first
+  "bff": bottom field first
+.TP
+\fB\-\-pu\-depth\-inter <int>\-<int>
+Range for sizes of inter prediction units to try.
+  0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8
+.TP
+\fB\-\-pu\-depth\-intra <int>\-<int>
+Range for sizes of intra prediction units to try.
+  0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4
+.TP
+\fB\-\-no\-info             
+Don't add information about the encoder to settings.
+.TP
+\fB\-\-gop <string>        
+Definition of GOP structure [0]
+  "0":           disabled
+  "8":           B\-frame pyramid of length 8
+  "lp\-<string>": lp\-gop definition (e.g. lp\-g8d4r3t2)
+.TP
+\fB\-\-bipred              
+Enable bi\-prediction search
+.TP
+\fB\-\-bitrate <integer>   
+Target bitrate. [0]
+  0: disable rate\-control
+  N: target N bits per second
+.TP
+\fB\-\-preset <string>     
+Use preset. This will override previous options.
+  ultrafast, superfast, veryfast, faster,
+  fast, medium, slow, slower, veryslow, placebo
+.TP
+\fB\-\-no\-psnr             
+Don't calculate PSNR for frames
+.TP
+\fB\-\-loop\-input          
+Re\-read input file forever
+.TP
+\fB\-\-mv\-constraint       
+Constrain motion vectors
+  "none": no constraint
+  "frametile": constrain within the tile
+  "frametilemargin": constrain even more
+.TP
+\fB\-\-hash                
+Specify which decoded picture hash to use [checksum]
+  "none": 0 bytes
+  "checksum": 18 bytes
+  "md5": 56 bytes
+.TP
+\fB\-\-cu\-split\-termination
+Specify the cu split termination behaviour
+  "zero": Terminate when splitting gives little
+            improvement.
+  "off": Don't terminate splitting early
+.TP
+\fB\-\-me\-early\-termination
+Specify the me early termination behaviour
+  "off": Early termination is off
+  "on": Early termination is on
+  "sensitive": Sensitive early termination is on
+.TP
+\fB\-\-lossless            
+Use lossless coding
+.TP
+\fB\-\-implicit\-rdpcm      
+Enable implicit residual DPCM. Currently only supported
+with lossless coding.
+.TP
+\fB\-\-no\-tmvp             
+Disable Temporal Motion Vector Prediction
+.TP
+\fB\-\-rdoq\-skip           
+Skips RDOQ for 4x4 blocks
+.TP
+\fB\-\-input\-format        
+P420 or P400
+.TP
+\fB\-\-input\-bitdepth      
+8\-16
+
+.SS "Video Usability Information:"
+.TP
+\fB\-\-sar <width:height>  
+Specify Sample Aspect Ratio
+.TP

kvazaar-0.8.3.tar.gz/m4/ax_pthread.m4 -> kvazaar-1.0.0.tar.gz/m4/ax_pthread.m4 Changed

@@ -19,10 +19,10 @@
 #   is necessary on AIX to use the special cc_r compiler alias.)
 #
 #   NOTE: You are assumed to not only compile your program with these flags,
-#   but also link it with them as well. e.g. you should link with
+#   but also to link with them as well. For example, you might link with
 #   $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
 #
-#   If you are only building threads programs, you may wish to use these
+#   If you are only building threaded programs, you may wish to use these
 #   variables in your default LIBS, CFLAGS, and CC:
 #
 #     LIBS="$PTHREAD_LIBS $LIBS"
@@ -30,8 +30,8 @@
 #     CC="$PTHREAD_CC"
 #
 #   In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
-#   has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name
-#   (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+#   has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to
+#   that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
 #
 #   Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
 #   PTHREAD_PRIO_INHERIT symbol is defined when compiling with
@@ -82,35 +82,40 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
 
-#serial 21
+#serial 23
 
 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
 AC_DEFUN([AX_PTHREAD], [
 AC_REQUIRE([AC_CANONICAL_HOST])
+AC_REQUIRE([AC_PROG_CC])
+AC_REQUIRE([AC_PROG_SED])
 AC_LANG_PUSH([C])
 ax_pthread_ok=no
 
 # We used to check for pthread.h first, but this fails if pthread.h
-# requires special compiler flags (e.g. on True64 or Sequent).
+# requires special compiler flags (e.g. on Tru64 or Sequent).
 # It gets checked for in the link test anyway.
 
 # First of all, check if the user has set any of the PTHREAD_LIBS,
 # etcetera environment variables, and if threads linking works using
 # them:
-if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
-        save_CFLAGS="$CFLAGS"
+if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
+        ax_pthread_save_CC="$CC"
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
+        AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
-        save_LIBS="$LIBS"
         LIBS="$PTHREAD_LIBS $LIBS"
-        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
-        AC_TRY_LINK_FUNC([pthread_join], [ax_pthread_ok=yes])
+        AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
+        AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes])
         AC_MSG_RESULT([$ax_pthread_ok])
-        if test x"$ax_pthread_ok" = xno; then
+        if test "x$ax_pthread_ok" = "xno"; then
                 PTHREAD_LIBS=""
                 PTHREAD_CFLAGS=""
         fi
-        LIBS="$save_LIBS"
-        CFLAGS="$save_CFLAGS"
+        CC="$ax_pthread_save_CC"
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
 fi
 
 # We must check for the threads library under a number of different
@@ -123,7 +128,7 @@
 # which indicates that we try without any flags at all, and "pthread-config"
 # which is a program returning the flags for the Pth emulation library.
 
-ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
 
 # The ordering *is* (sometimes) important.  Some notes on the
 # individual items follow:
@@ -132,82 +137,225 @@
 # none: in case threads are in libc; should be tried before -Kthread and
 #       other compiler flags to prevent continual compiler warnings
 # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
-# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
-# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
-# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
-# -pthreads: Solaris/gcc
-# -mthreads: Mingw32/gcc, Lynx/gcc
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64
+#           (Note: HP C rejects this with "bad form for `-t' option")
+# -pthreads: Solaris/gcc (Note: HP C also rejects)
 # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
-#      doesn't hurt to check since this sometimes defines pthreads too;
-#      also defines -D_REENTRANT)
-#      ... -mt is also the pthreads flag for HP/aCC
+#      doesn't hurt to check since this sometimes defines pthreads and
+#      -D_REENTRANT too), HP C (must be checked before -lpthread, which
+#      is present but should not be used directly; and before -mthreads,
+#      because the compiler interprets this as "-mt" + "-hreads")
+# -mthreads: Mingw32/gcc, Lynx/gcc
 # pthread: Linux, etcetera
 # --thread-safe: KAI C++
 # pthread-config: use pthread-config program (for GNU Pth library)
 
-case ${host_os} in
+case $host_os in
+
+        freebsd*)
+
+        # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+        # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+
+        ax_pthread_flags="-kthread lthread $ax_pthread_flags"
+        ;;
+
+        hpux*)
+
+        # From the cc(1) man page: "[-mt] Sets various -D flags to enable
+        # multi-threading and also sets -lpthread."
+
+        ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags"
+        ;;
+
+        openedition*)
+
+        # IBM z/OS requires a feature-test macro to be defined in order to
+        # enable POSIX threads at all, so give the user a hint if this is
+        # not set. (We don't define these ourselves, as they can affect
+        # other portions of the system API in unpredictable ways.)
+
+        AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING],
+            [
+#            if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS)
+             AX_PTHREAD_ZOS_MISSING
+#            endif
+            ],
+            [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])])
+        ;;
+
         solaris*)
 
         # On Solaris (at least, for some versions), libc contains stubbed
         # (non-functional) versions of the pthreads routines, so link-based
-        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
-        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
-        # a function called by this macro, so we could check for that, but
-        # who knows whether they'll stub that too in a future libc.)  So,
-        # we'll just look for -pthreads and -lpthread first:
+        # tests will erroneously succeed. (N.B.: The stubs are missing
+        # pthread_cleanup_push, or rather a function called by this macro,
+        # so we could check for that, but who knows whether they'll stub
+        # that too in a future libc.)  So we'll check first for the
+        # standard Solaris way of linking pthreads (-mt -lpthread).
+
+        ax_pthread_flags="-mt,pthread pthread $ax_pthread_flags"
+        ;;
+esac
+
+# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC)
 
-        ax_pthread_flags="-pthreads pthread -mt -pthread $ax_pthread_flags"
+AS_IF([test "x$GCC" = "xyes"],
+      [ax_pthread_flags="-pthread -pthreads $ax_pthread_flags"])
+
+# The presence of a feature test macro requesting re-entrant function
+# definitions is, on some systems, a strong hint that pthreads support is
+# correctly enabled
+
+case $host_os in
+        darwin* | hpux* | linux* | osf* | solaris*)
+        ax_pthread_check_macro="_REENTRANT"
         ;;
 
-        darwin*)
-        ax_pthread_flags="-pthread $ax_pthread_flags"
+        aix*)
+        ax_pthread_check_macro="_THREAD_SAFE"
+        ;;
+
+        *)
+        ax_pthread_check_macro="--"
         ;;
 esac
+AS_IF([test "x$ax_pthread_check_macro" = "x--"],
+      [ax_pthread_check_cond=0],
+      [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"])
+
+# Are we compiling with Clang?
+
+AC_CACHE_CHECK([whether $CC is Clang],
+    [ax_cv_PTHREAD_CLANG],
+    [ax_cv_PTHREAD_CLANG=no
+     # Note that Autoconf sets GCC=yes for Clang as well as GCC
+     if test "x$GCC" = "xyes"; then
+        AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG],

kvazaar-0.8.3.tar.gz/src/Makefile.am -> kvazaar-1.0.0.tar.gz/src/Makefile.am Changed

@@ -31,6 +31,7 @@
 	yuv_io.h
 kvazaar_LDADD = libkvazaar.la $(LIBS)
 
+kvazaar_CPPFLAGS = -DKVZ_VERSION="`$(srcdir)/../tools/version.sh`"
 
 libkvazaar_la_SOURCES = \
 	bitstream.c \
@@ -55,6 +56,8 @@
 	encoder_state-ctors_dtors.h \
 	encoder_state-geometry.c \
 	encoder_state-geometry.h \
+	encode_coding_tree.c \
+	encode_coding_tree.h \
 	filter.c \
 	filter.h \
 	global.h \
@@ -70,6 +73,7 @@
 	intra.h \
 	kvazaar.c \
 	kvazaar_internal.h \
+	kvz_math.h \
 	nal.c \
 	nal.h \
 	rate_control.c \
@@ -107,6 +111,8 @@
 	strategies/generic/picture-generic.h \
 	strategies/generic/quant-generic.c \
 	strategies/generic/quant-generic.h \
+	strategies/generic/sao-generic.c \
+	strategies/generic/sao-generic.h \
 	strategies/strategies-common.h \
 	strategies/strategies-dct.c \
 	strategies/strategies-dct.h \
@@ -120,10 +126,17 @@
 	strategies/strategies-picture.h \
 	strategies/strategies-quant.c \
 	strategies/strategies-quant.h \
+	strategies/strategies-sao.c \
+	strategies/strategies-sao.h \
 	strategies/x86_asm/picture-x86-asm.c \
 	strategies/x86_asm/picture-x86-asm.h \
 	strategyselector.c \
-	strategyselector.h
+	strategyselector.h \
+	extras/libmd5.c \
+	extras/libmd5.h \
+	extras/crypto.h
+
+libkvazaar_la_CFLAGS =
 
 libkvazaar_la_LIBADD = \
 	libaltivec.la \
@@ -131,6 +144,12 @@
 	libsse2.la \
 	libsse41.la
 
+if USE_CRYPTOPP
+libkvazaar_la_SOURCES += \
+	extras/crypto.h \
+	extras/crypto.cpp
+endif
+
 libkvazaar_la_LDFLAGS = $(AM_LDFLAGS) -no-undefined -version-number $(KVZ_API_VERSION)
 
 
@@ -148,7 +167,10 @@
 	strategies/avx2/picture-avx2.c \
 	strategies/avx2/picture-avx2.h \
 	strategies/avx2/quant-avx2.c \
-	strategies/avx2/quant-avx2.h
+	strategies/avx2/quant-avx2.h \
+	strategies/avx2/sao-avx2.c \
+	strategies/avx2/sao-avx2.h
+	
 
 libsse2_la_SOURCES = \
 	strategies/sse2/picture-sse2.c \
@@ -182,7 +204,7 @@
 	strategies/x86_asm/picture-x86-asm-sad.h \
 	strategies/x86_asm/picture-x86-asm-satd.asm \
 	strategies/x86_asm/picture-x86-asm-satd.h
-libasm_la_CFLAGS = -DKVZ_COMPILE_ASM
+libkvazaar_la_CFLAGS += -DKVZ_COMPILE_ASM
 
 strategies/x86_asm/picture-x86-asm-sad.lo: strategies/x86_asm/picture-x86-asm-sad.asm
 strategies/x86_asm/picture-x86-asm-satd.lo: strategies/x86_asm/picture-x86-asm-satd.asm
@@ -195,5 +217,5 @@
 yasm_verbose_0 = @echo "  YASM    " $@;
 
 .asm.lo:
-	$(yasm_verbose)$(LIBTOOL) --mode=compile $(YASM) -I$(srcdir)/extras $(ASFLAGS) $< -o $@ -prefer-non-pic 1>/dev/null
+	$(yasm_verbose)$(LIBTOOL) --mode=compile --tag=CC $(YASM) -I$(srcdir)/extras $(ASFLAGS) $< -o $@ -prefer-non-pic 1>/dev/null

kvazaar-0.8.3.tar.gz/src/bitstream.c -> kvazaar-1.0.0.tar.gz/src/bitstream.c Changed

@@ -20,13 +20,12 @@
 
 #include "bitstream.h"
 
-#include <stdio.h>
-#include <stdlib.h>
 #include <math.h>
-#include <string.h>
-#include <stdarg.h>
 #include <stdlib.h>
-#include <assert.h>
+#include <string.h>
+
+#include "kvz_math.h"
+
 
 const uint32_t kvz_bit_set_mask[] =
 {
@@ -57,19 +56,6 @@
 }
 #endif
 
-static int floor_log2(unsigned int n)
-{
-  assert(n != 0);
-
-  int pos = 0;
-  if (n >= 1<<16) { n >>= 16; pos += 16; }
-  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
-  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
-  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
-  if (n >= 1<< 1) {           pos +=  1; }
-  return pos;
-}
-
 /**
  * \brief Initialize the Exp Golomb code table.
  *
@@ -84,7 +70,7 @@
   uint8_t M;
   uint32_t info;
   for (code_num = 0; code_num < EXP_GOLOMB_TABLE_SIZE; code_num++) {
-    M = (uint8_t)floor_log2(code_num + 1);
+    M = kvz_math_floor_log2(code_num + 1);
     info = code_num + 1 - (uint32_t)pow(2, M);
     kvz_g_exp_table[code_num].len = M * 2 + 1;
     kvz_g_exp_table[code_num].value = (1<<M) | info;

kvazaar-0.8.3.tar.gz/src/bitstream.h -> kvazaar-1.0.0.tar.gz/src/bitstream.h Changed

kvazaar-0.8.3.tar.gz/src/cabac.c -> kvazaar-1.0.0.tar.gz/src/cabac.c Changed

@@ -20,10 +20,10 @@
 
 #include "cabac.h"
 
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-
+#include "encoder.h"
+#include "encoderstate.h"
+#include "extras/crypto.h"
+#include "kvazaar.h"
 
 const uint8_t kvz_g_auc_next_state_mps[128] =
 {
@@ -275,6 +275,7 @@
 {
   int32_t code_number = symbol;
   uint32_t length;
+
   if (code_number < (3 << r_param)) {
     length = code_number >> r_param;
     CABAC_BINS_EP(cabac, (1 << (length + 1)) - 2 , length + 1, "coeff_abs_level_remaining");
@@ -291,6 +292,198 @@
   }
 }
 
+void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac,const uint32_t symbol, const uint32_t r_param, int32_t base_level)
+{
+ int32_t codeNumber  = (int32_t)symbol;
+ uint32_t length;
+
+ if (codeNumber < (3 << r_param)) {
+   length = codeNumber>>r_param;
+   CABAC_BINS_EP(cabac, (1 << (length + 1)) - 2 , length + 1, "coeff_abs_level_remaining");
+   //m_pcBinIf->encodeBinsEP( (1<<(length+1))-2 , length+1);
+   uint32_t Suffix = (codeNumber%(1<<r_param));
+
+   if(!r_param)
+    CABAC_BINS_EP(cabac, Suffix, r_param, "coeff_abs_level_remaining");
+    //m_pcBinIf->encodeBinsEP(Suffix, r_param);
+   if(r_param==1) {
+     if(!(( base_level ==2 )&& (codeNumber==4 || codeNumber==5) ) ) {
+       uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+       state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 1;
+       CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 1, "coeff_abs_level_remaining");
+       //m_pcBinIf->encodeBinsEP(m_prev_pos, 1);
+     } else {
+       CABAC_BINS_EP(cabac, Suffix, 1, "coeff_abs_level_remaining");
+       //m_pcBinIf->encodeBinsEP(Suffix, 1);
+     }
+   }
+   else
+    if(r_param==2) {
+       if( base_level ==1) {
+    	 uint32_t key    =ff_get_key(&state->tile->dbs_g, 2);
+         state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 3;
+         CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+         //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
+       } else
+         if( base_level ==2) {
+           if(codeNumber<=7 || codeNumber>=12) {
+        	 uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
+             state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 3;
+             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+             //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
+           }
+           else
+             if(codeNumber<10) {
+                uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+                state->tile->m_prev_pos  = (( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1);
+                CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+                //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
+             } else
+               CABAC_BINS_EP(cabac, Suffix, 2, "coeff_abs_level_remaining");
+               //m_pcBinIf->encodeBinsEP(Suffix, 2);
+         } else { //base_level=3
+           if(codeNumber<=7 || codeNumber>11) {
+             uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
+             state->tile->m_prev_pos  = (Suffix + ( state->tile->m_prev_pos^key ) ) & 3;
+             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+             //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
+           } else {
+             uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+             state->tile->m_prev_pos  = ((Suffix&2))+(( (Suffix&1) + ( state->tile->m_prev_pos^key)) & 1);
+             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+             //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
+           }
+         }
+     } else
+       if(r_param==3) {
+         if( base_level ==1) {
+           uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
+           state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 7;
+           CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+           //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+         }
+         else if( base_level ==2) {
+           if(codeNumber<=15 || codeNumber>23) {
+             uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
+             state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 7;
+             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+             //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+           } else
+             if(codeNumber<=19){
+               uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
+               state->tile->m_prev_pos  = ((Suffix&4))+(( (Suffix&3) + (state->tile->m_prev_pos^key )) & 3);
+               CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+               //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+             } else
+               if(codeNumber<=21){
+            	 uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+                 state->tile->m_prev_pos  = 4+(( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1);
+                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+                 //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+               } else
+                 CABAC_BINS_EP(cabac, Suffix, 3, "coeff_abs_level_remaining");
+           // m_pcBinIf->encodeBinsEP(Suffix, 3);
+         } else {//base_level=3
+           CABAC_BINS_EP(cabac, Suffix, 3, "coeff_abs_level_remaining");
+           //m_pcBinIf->encodeBinsEP(Suffix, 3);
+           if(codeNumber<=15 || codeNumber>23) {
+             uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
+             state->tile->m_prev_pos  = (Suffix + ( state->tile->m_prev_pos^key ) ) & 7;
+             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+             //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+           } else
+             if(codeNumber<=19) {
+               uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
+               state->tile->m_prev_pos  = (( (Suffix&3) + ( state->tile->m_prev_pos^key )) &3);
+               CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+               //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+             } else
+               if(codeNumber<=23) {
+                 uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+                 state->tile->m_prev_pos  = (Suffix&6)+(( (Suffix&1) + (state->tile->m_prev_pos^key )) & 1);
+                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+                 //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
+               }
+         }
+       } else
+         if(r_param==4) {
+           if( base_level ==1) {
+             uint32_t key    = ff_get_key(&state->tile->dbs_g, 4);
+             state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 15;
+             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+             //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+           } else
+             if( base_level ==2) {
+               if(codeNumber<=31 || codeNumber>47) {
+                 uint32_t key    = ff_get_key(&state->tile->dbs_g, 4);
+                 state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 15;
+                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, r_param, "coeff_abs_level_remaining");
+                 //m_pcBinIf->encodeBinsEP(m_prev_pos, r_param);
+               } else
+                 if(codeNumber<=39) {
+                   uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
+                   state->tile->m_prev_pos  = (( (Suffix&7) + ( state->tile->m_prev_pos^key )) & 7);
+                   CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                   //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+                 } else
+                   if(codeNumber<=43) {
+                     uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
+                     state->tile->m_prev_pos  = 8+(( (Suffix&3) + ( state->tile->m_prev_pos^key )) & 3);
+                     CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                     //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+                   } else
+                     if(codeNumber<=45){
+                       uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+                       state->tile->m_prev_pos  = 12+(( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1);
+                       CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                       //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+                     } else
+                       CABAC_BINS_EP(cabac, Suffix, 4, "coeff_abs_level_remaining");
+                       //m_pcBinIf->encodeBinsEP(Suffix, 4);
+             } else {//base_level=3
+               if(codeNumber<=31 || codeNumber>47) {
+                 uint32_t key    = ff_get_key(&state->tile->dbs_g, 4);
+                 state->tile->m_prev_pos  = (Suffix + ( state->tile->m_prev_pos^key ) ) & 15;
+                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, r_param, "coeff_abs_level_remaining");
+                 //m_pcBinIf->encodeBinsEP(m_prev_pos, r_param);
+               } else
+                 if(codeNumber<=39) {
+                   uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
+                   state->tile->m_prev_pos  = (( (Suffix&7) + ( state->tile->m_prev_pos^key )) & 7);
+                   CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                   //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+                 } else
+                   if(codeNumber<=43) {
+                     uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
+                     state->tile->m_prev_pos  = 8+(( (Suffix&3) + ( state->tile->m_prev_pos^key )) & 3);
+                     CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                     //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+                   } else
+                     if(codeNumber<=47) {
+                       uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
+                       state->tile->m_prev_pos  = (Suffix&14)+(( (Suffix&1) + (state->tile->m_prev_pos^key )) & 1);
+                       CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                       //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
+                     }

kvazaar-0.8.3.tar.gz/src/cabac.h -> kvazaar-1.0.0.tar.gz/src/cabac.h Changed

@@ -26,10 +26,11 @@
  * Coding bins using CABAC.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
 
 #include "bitstream.h"
 
+struct encoder_state_t;
 
 // Types
 typedef struct
@@ -75,6 +76,7 @@
     cabac_ctx_t cu_skip_flag_model[3];
     cabac_ctx_t cu_merge_idx_ext_model;
     cabac_ctx_t cu_merge_flag_ext_model;
+    cabac_ctx_t cu_transquant_bypass;
     cabac_ctx_t cu_mvd_model[2];
     cabac_ctx_t cu_ref_pic_model[2];
     cabac_ctx_t mvp_idx_model[2];
@@ -103,8 +105,10 @@
 void kvz_cabac_flush(cabac_data_t *data);
 void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
                               uint32_t r_param);
-void kvz_cabac_write_ep_ex_golomb(cabac_data_t *data, uint32_t symbol,
-                              uint32_t count);
+void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol,
+		const uint32_t r_param, int32_t base_level);
+void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
+								uint32_t symbol, uint32_t count);
 void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
                                   uint32_t symbol, int32_t offset,
                                   uint32_t max_symbol);
@@ -112,36 +116,36 @@
 
 
 // Macros
-#define CTX_STATE(ctx) (ctx->uc_state >> 1)
-#define CTX_MPS(ctx) (ctx->uc_state & 1)
+#define CTX_STATE(ctx) ((ctx)->uc_state >> 1)
+#define CTX_MPS(ctx) ((ctx)->uc_state & 1)
 #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; }
 #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; }
 
 #ifdef VERBOSE
   #define CABAC_BIN(data, value, name) { \
     uint32_t prev_state = (data)->ctx->uc_state; \
-    kvz_cabac_encode_bin(data, value); \
+    kvz_cabac_encode_bin((data), (value)); \
     printf("%s = %u, state = %u -> %u\n", \
-           name, (uint32_t)value, (uint32_t)prev_state, (data)->ctx->uc_state); }
+           (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); }
 
   #define CABAC_BINS_EP(data, value, bins, name) { \
     uint32_t prev_state = (data)->ctx->uc_state; \
-    kvz_cabac_encode_bins_ep(data, value, bins); \
+    kvz_cabac_encode_bins_ep((data), (value), (bins)); \
     printf("%s = %u(%u bins), state = %u -> %u\n", \
-           name, (uint32_t)value, (uint32_t)bins, prev_state, (data)->ctx->uc_state); }
+           (name), (uint32_t)(value), (bins), prev_state, (data)->ctx->uc_state); }
 
   #define CABAC_BIN_EP(data, value, name) { \
     uint32_t prev_state = (data)->ctx->uc_state; \
-    kvz_cabac_encode_bin_ep(data, value); \
+    kvz_cabac_encode_bin_ep((data), (value)); \
     printf("%s = %u, state = %u -> %u\n", \
-           name, (uint32_t)value, (uint32_t)prev_state, (data)->ctx->uc_state); }
+           (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); }
 #else
   #define CABAC_BIN(data, value, name) \
-    kvz_cabac_encode_bin(data, value);
+    kvz_cabac_encode_bin((data), (value));
   #define CABAC_BINS_EP(data, value, bins, name) \
-    kvz_cabac_encode_bins_ep(data, value, bins);
+    kvz_cabac_encode_bins_ep((data), (value), (bins));
   #define CABAC_BIN_EP(data, value, name) \
-    kvz_cabac_encode_bin_ep(data, value);
+    kvz_cabac_encode_bin_ep((data), (value));
 #endif
 
 #endif

kvazaar-0.8.3.tar.gz/src/cfg.c -> kvazaar-1.0.0.tar.gz/src/cfg.c Changed

@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+
 kvz_config *kvz_config_alloc(void)
 {
   kvz_config *cfg = (kvz_config *)malloc(sizeof(kvz_config));
@@ -44,24 +45,25 @@
   cfg->framerate       = 25; // deprecated and will be removed.
   cfg->framerate_num   = 0;
   cfg->framerate_denom = 1;
-  cfg->qp              = 32;
-  cfg->intra_period    = 0;
+  cfg->qp              = 22;
+  cfg->intra_period    = 64;
   cfg->vps_period      = 0;
   cfg->deblock_enable  = 1;
   cfg->deblock_beta    = 0;
   cfg->deblock_tc      = 0;
   cfg->sao_enable      = 1;
   cfg->rdoq_enable     = 1;
+  cfg->rdoq_skip       = 1;
   cfg->signhide_enable = true;
   cfg->smp_enable      = false;
   cfg->amp_enable      = false;
   cfg->rdo             = 1;
   cfg->mv_rdo          = 0;
   cfg->full_intra_search = 0;
-  cfg->trskip_enable   = 1;
+  cfg->trskip_enable   = 0;
   cfg->tr_depth_intra  = 0;
   cfg->ime_algorithm   = 0; /* hexbs */
-  cfg->fme_level       = 1;
+  cfg->fme_level       = 4;
   cfg->source_scan_type = 0; /* progressive */
   cfg->vui.sar_width   = 0;
   cfg->vui.sar_height  = 0;
@@ -75,33 +77,51 @@
   cfg->aud_enable      = 0;
   cfg->cqmfile         = NULL;
   cfg->ref_frames      = DEFAULT_REF_PIC_COUNT;
-  cfg->gop_len         = 0;
+  cfg->gop_len         = 4;
+  cfg->gop_lowdelay    = true;
   cfg->bipred          = 0;
   cfg->target_bitrate  = 0;
+  cfg->hash            = KVZ_HASH_CHECKSUM;
+  cfg->lossless        = false;
+  cfg->tmvp_enable     = true;
+  cfg->implicit_rdpcm  = false;
+
+  cfg->cu_split_termination = KVZ_CU_SPLIT_TERMINATION_ZERO;
 
-  cfg->tiles_width_count         = 0;
-  cfg->tiles_height_count         = 0;
-  cfg->tiles_width_split          = NULL;
-  cfg->tiles_height_split          = NULL;
+  cfg->tiles_width_count  = 1;
+  cfg->tiles_height_count = 1;
+  cfg->tiles_width_split  = NULL;
+  cfg->tiles_height_split = NULL;
   
-  cfg->wpp = 0;
+  cfg->wpp = 1;
   cfg->owf = -1;
   cfg->slice_count = 1;
   cfg->slice_addresses_in_ts = MALLOC(int32_t, 1);
   cfg->slice_addresses_in_ts[0] = 0;
   
-  cfg->threads = 0;
+  cfg->threads = -1;
   cfg->cpuid = 1;
 
   // Defaults for what sizes of PUs are tried.
-  cfg->pu_depth_inter.min = 0; // 0-3
+  cfg->pu_depth_inter.min = 2; // 0-3
   cfg->pu_depth_inter.max = 3; // 0-3
-  cfg->pu_depth_intra.min = 1; // 0-4
-  cfg->pu_depth_intra.max = 4; // 0-4
+  cfg->pu_depth_intra.min = 2; // 0-4
+  cfg->pu_depth_intra.max = 3; // 0-4
 
   cfg->add_encoder_info = true;
   cfg->calc_psnr = true;
 
+  cfg->mv_constraint = KVZ_MV_CONSTRAIN_NONE;
+  cfg->crypto_features = KVZ_CRYPTO_OFF;
+
+  cfg->me_early_termination = 1;
+
+  cfg->input_format = KVZ_FORMAT_P420;
+  cfg->input_bitdepth = 8;
+
+  cfg->gop_lp_definition.d = 3;
+  cfg->gop_lp_definition.t = 1;
+
   return 1;
 }
 
@@ -131,11 +151,11 @@
   return 0;
 }
 
-static int parse_enum(const char *arg, const char * const *names, int8_t *dst)
+static int parse_enum_n(const char *arg, unsigned num_chars, const char * const *names, int8_t *dst)
 {
   int8_t i;
   for (i = 0; names[i]; i++) {
-    if (!strcmp(arg, names[i])) {
+    if (!strncmp(arg, names[i], num_chars)) {
       *dst = i;
       return 1;
     }
@@ -144,6 +164,11 @@
   return 0;
 }
 
+static int parse_enum(const char *arg, const char * const *names, int8_t *dst)
+{
+  return parse_enum_n(arg, 255, names, dst);
+}
+
 static int parse_tiles_specification(const char* const arg, int32_t * const ntiles, int32_t** const array) {
   const char* current_arg = NULL;
   int32_t current_value;
@@ -158,9 +183,9 @@
   
   //If the arg starts with u, we want an uniform split
   if (arg[0]=='u') {
-    *ntiles = atoi(arg+1)-1;
-    if (MAX_TILES_PER_DIM <= *ntiles || 0 > *ntiles) {
-      fprintf(stderr, "Invalid number of tiles (0 < %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles + 1, MAX_TILES_PER_DIM);
+    *ntiles = atoi(arg + 1);
+    if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) {
+      fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM);
       return 0;
     }
     //Done with parsing
@@ -169,7 +194,7 @@
   
   //We have a comma-separated list of int for the split...
   current_arg = arg;
-  *ntiles = 0;
+  *ntiles = 1;
   do {
     int ret = sscanf(current_arg, "%d", &current_value);
     if (ret != 1) {
@@ -179,24 +204,24 @@
     current_arg = strchr(current_arg, ',');
     //Skip the , if we found one
     if (current_arg) ++current_arg;
-    values[*ntiles] = current_value;
+    values[*ntiles - 1] = current_value;
     ++(*ntiles);
     if (MAX_TILES_PER_DIM <= *ntiles) break;
   } while (current_arg);
   
-  if (MAX_TILES_PER_DIM <= *ntiles || 0 >= *ntiles) {
-    fprintf(stderr, "Invalid number of tiles (0 < %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles + 1, MAX_TILES_PER_DIM);
+  if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) {
+    fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM);
     return 0;
   }
   
-  *array = MALLOC(int32_t, *ntiles);
+  *array = MALLOC(int32_t, *ntiles - 1);
   if (!*array) {
     fprintf(stderr, "Could not allocate array for tiles\n");
     return 0;
   }
   
   //TODO: memcpy?
-  for (i = 0; i < *ntiles; ++i) {
+  for (i = 0; i < *ntiles - 1; ++i) {
     (*array)[i] = values[i];
   }
   
@@ -266,7 +291,7 @@
 
 int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
 {
-  static const char * const me_names[]          = { "hexbs", "tz", "full", NULL };
+  static const char * const me_names[]          = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", NULL };
   static const char * const source_scan_type_names[] = { "progressive", "tff", "bff", NULL };
 
   static const char * const overscan_names[]    = { "undef", "show", "crop", NULL };
@@ -279,176 +304,244 @@
                                                     "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", NULL };
   static const char * const colormatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m",
                                                     "smpte240m", "YCgCo", "bt2020nc", "bt2020c", NULL };
+  static const char * const mv_constraint_names[] = { "none", "frame", "tile", "frametile", "frametilemargin", NULL };
+  static const char * const hash_names[] = { "none", "checksum", "md5", NULL };
+
+  static const char * const cu_split_termination_names[] = { "zero", "off", NULL };
+  static const char * const crypto_toggle_names[] = { "off", "on", NULL };
+  static const char * const crypto_feature_names[] = { "mvs", "mv_signs", "trans_coeffs", "trans_coeff_signs", NULL };
+
+  static const char * const me_early_termination_names[] = { "off", "on", "sensitive", NULL };

kvazaar-0.8.3.tar.gz/src/cfg.h -> kvazaar-1.0.0.tar.gz/src/cfg.h Changed

kvazaar-0.8.3.tar.gz/src/checkpoint.h -> kvazaar-1.0.0.tar.gz/src/checkpoint.h Changed

kvazaar-0.8.3.tar.gz/src/cli.c -> kvazaar-1.0.0.tar.gz/src/cli.c Changed

@@ -78,6 +78,7 @@
   { "no-aud",                   no_argument, NULL, 0 },
   { "cqmfile",            required_argument, NULL, 0 },
   { "seek",               required_argument, NULL, 0 },
+  { "tiles",              required_argument, NULL, 0 },
   { "tiles-width-split",  required_argument, NULL, 0 },
   { "tiles-height-split", required_argument, NULL, 0 },
   { "wpp",                      no_argument, NULL, 0 },
@@ -99,6 +100,24 @@
   { "no-mv-rdo",                no_argument, NULL, 0 },
   { "psnr",                     no_argument, NULL, 0 },
   { "no-psnr",                  no_argument, NULL, 0 },
+  { "version",                  no_argument, NULL, 0 },
+  { "help",                     no_argument, NULL, 0 },
+  { "loop-input",               no_argument, NULL, 0 },
+  { "mv-constraint",      required_argument, NULL, 0 },
+  { "hash",               required_argument, NULL, 0 },
+  {"cu-split-termination",required_argument, NULL, 0 },
+  { "crypto",             required_argument, NULL, 0 },
+  { "me-early-termination",required_argument, NULL, 0 },
+  { "lossless",                 no_argument, NULL, 0 },
+  { "no-lossless",              no_argument, NULL, 0 },
+  { "tmvp",                     no_argument, NULL, 0 },
+  { "no-tmvp",                  no_argument, NULL, 0 },
+  { "rdoq-skip",                no_argument, NULL, 0 },
+  { "no-rdoq-skip",             no_argument, NULL, 0 },
+  { "input-bitdepth",     required_argument, NULL, 0 },
+  { "input-format",       required_argument, NULL, 0 },
+  { "implicit-rdpcm",           no_argument, NULL, 0 },
+  { "no-implicit-rdpcm",        no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -203,6 +222,14 @@
       opts->seek = atoi(optarg);
     } else if (!strcmp(name, "frames")) {
       opts->frames = atoi(optarg);
+    } else if (!strcmp(name, "version")) {
+      opts->version = true;
+      goto done;
+    } else if (!strcmp(name, "help")) {
+      opts->help = true;
+      goto done;
+    } else if (!strcmp(name, "loop-input")) {
+      opts->loop_input = true;
     } else if (!api->config_parse(opts->config, name, optarg)) {
       fprintf(stderr, "invalid argument: %s=%s\n", name, optarg);
       ok = 0;
@@ -232,7 +259,7 @@
   }
 
   // Set resolution automatically if necessary
-  if (opts->config->width == 0 && opts->config->width == 0){
+  if (opts->config->width == 0 && opts->config->height == 0) {
     ok = select_input_res_auto(opts->input, &opts->config->width, &opts->config->height);
     goto done;
   }
@@ -263,28 +290,36 @@
 }
 
 
+void print_usage(void)
+{
+  fprintf(stdout,
+    "Kvazaar usage: -i and --input-res to set input, -o to set output\n"
+    "               --help for more information\n");
+}
+
+
 void print_version(void)
 {
-  fprintf(stderr,
-    "/***********************************************/\n"
-    " *   Kvazaar HEVC Encoder v. " VERSION_STRING "             *\n"
-    " *     Tampere University of Technology 2015   *\n"
-    "/***********************************************/\n\n");
+  fprintf(stdout,
+    "Kvazaar " VERSION_STRING "\n"
+    "Kvazaar license: LGPL version 2\n");
 }
 
 
 void print_help(void)
 {
-  fprintf(stderr,
+  fprintf(stdout,
     "Usage:\n"
     "kvazaar -i <input> --input-res <width>x<height> -o <output>\n"
     "\n"
     "Optional parameters:\n"
+    "      --help                     : Print this help message and exit\n"
+    "      --version                  : Print version information and exit\n"
     "      -n, --frames <integer>     : Number of frames to code [all]\n"
     "      --seek <integer>           : First frame to code [0]\n"
     "      --input-res <int>x<int>    : Input resolution (width x height) or\n"
     "                  auto           : try to detect from file name [auto]\n"
-    "      --input-fps <number>       : Framerate of the input video [25.0]\n"
+    "      --input-fps <num>/<denom>  : Framerate of the input video [25.0]\n"
     "      -q, --qp <integer>         : Quantization Parameter [32]\n"
     "      -p, --period <integer>     : Period of intra pictures [0]\n"
     "                                     0: only first picture is intra\n"
@@ -310,18 +345,21 @@
     "                                     2: full RDO\n"
     "          --mv-rdo               : Enable Rate-Distortion Optimized motion vector costs\n"
     "          --full-intra-search    : Try all intra modes.\n"
-    "          --me <string>          : Set integer motion estimation algorithm [\"hexbs\"]\n"
-    "                                     \"hexbs\": Hexagon Based Search (faster)\n"
-    "                                     \"tz\":    Test Zone Search (better quality)\n"
-    "                                     \"full\":  Full Search (super slow)\n"
     "          --no-transform-skip    : Disable transform skip\n"
     "          --aud                  : Use access unit delimiters\n"
     "          --cqmfile <string>     : Custom Quantization Matrices from a file\n"
     "          --debug <string>       : Output encoders reconstruction.\n"
     "          --cpuid <integer>      : Disable runtime cpu optimizations with value 0.\n"
-    "          --subme <integer>      : Set fractional pixel motion estimation level [1].\n"
+    "          --me <string>          : Set integer motion estimation algorithm [\"hexbs\"]\n"
+    "                                     \"hexbs\": Hexagon Based Search (faster)\n"
+    "                                     \"tz\":    Test Zone Search (better quality)\n"
+    "                                     \"full\":  Full Search (super slow)\n"
+    "          --subme <integer>      : Set fractional pixel motion estimation level [4].\n"
     "                                     0: only integer motion estimation\n"
-    "                                     1: fractional pixel motion estimation enabled\n"
+    "                                     1: + 1/2-pixel horizontal and vertical\n"
+    "                                     2: + 1/2-pixel diagonal\n"
+    "                                     3: + 1/4-pixel horizontal and vertical\n"
+    "                                     4: + 1/4-pixel diagonal\n"
     "          --source-scan-type <string> : Set source scan type [\"progressive\"].\n"
     "                                     \"progressive\": progressive scan\n"
     "                                     \"tff\": top field first\n"
@@ -331,15 +369,42 @@
     "          --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.\n"
     "                                     0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4\n"
     "          --no-info              : Don't add information about the encoder to settings.\n"
-    "          --gop <int>           : Length of Group of Pictures, must be 8 or 0 [0]\n"
+    "          --gop <string>         : Definition of GOP structure [0]\n"
+    "                                     \"0\":           disabled\n"
+    "                                     \"8\":           B-frame pyramid of length 8\n"
+    "                                     \"lp-<string>\": lp-gop definition (e.g. lp-g8d4r3t2)\n"
     "          --bipred               : Enable bi-prediction search\n"
     "          --bitrate <integer>    : Target bitrate. [0]\n"
     "                                     0: disable rate-control\n"
     "                                     N: target N bits per second\n"
-    "          --preset <string>      : Use preset\n"
-    "                                     ultrafast, superfast,veryfast, faster,\n"
+    "          --preset <string>      : Use preset. This will override previous options.\n"
+    "                                     ultrafast, superfast, veryfast, faster,\n"
     "                                     fast, medium, slow, slower, veryslow, placebo\n"
     "          --no-psnr              : Don't calculate PSNR for frames\n"
+    "          --loop-input           : Re-read input file forever\n"
+    "          --mv-constraint        : Constrain movement vectors\n"
+    "                                     \"none\": no constraint\n"
+    "                                     \"frametile\": constrain within the tile\n"
+    "                                     \"frametilemargin\": constrain even more\n"
+    "          --hash                 : Specify which decoded picture hash to use [checksum]\n"
+    "                                     \"none\": 0 bytes\n"
+    "                                     \"checksum\": 18 bytes\n"
+    "                                     \"md5\": 56 bytes\n"
+    "          --cu-split-termination : Specify the cu split termination behaviour\n"
+    "                                     \"zero\": Terminate when splitting gives little\n"
+    "                                               improvement.\n"
+    "                                     \"off\": Don't terminate splitting early\n"
+    "          --me-early-termination : Specify the me early termination behaviour\n"
+    "                                     \"off\": Early termination is off\n"
+    "                                     \"on\": Early termination is on\n"
+    "                                     \"sensitive\": Sensitive early termination is on\n"
+    "          --lossless             : Use lossless coding\n"
+    "          --implicit-rdpcm       : Enable implicit residual DPCM. Currently only supported\n"
+    "                                   with lossless coding.\n"
+    "          --no-tmvp              : Disable Temporal Motion Vector Prediction\n"
+    "          --rdoq-skip            : Skips RDOQ for 4x4 blocks\n"
+    "          --input-format         : P420 or P400\n"
+    "          --input-bitdepth       : 8-16\n"
     "\n"
     "  Video Usability Information:\n"
     "          --sar <width:height>   : Specify Sample Aspect Ratio\n"
@@ -367,12 +432,13 @@
     "                                   Disable threads if set to 0.\n"
     "\n"
     "  Tiles:\n"
-    "          --tiles-width-split <string>|u<int> : \n"
+    "          --tiles <int>x<int>    : Split picture into width x height uniform tiles.\n"
+    "          --tiles-width-split <string>|u<int> :\n"
     "                                   Specifies a comma separated list of pixel\n"
     "                                   positions of tiles columns separation coordinates.\n"
     "                                   Can also be u followed by and a single int n,\n"
     "                                   in which case it produces columns of uniform width.\n"
-    "          --tiles-height-split <string>|u<int> : \n"
+    "          --tiles-height-split <string>|u<int> :\n"
     "                                   Specifies a comma separated list of pixel\n"
     "                                   positions of tiles rows separation coordinates.\n"
     "                                   Can also be u followed by and a single int n,\n"
@@ -382,13 +448,13 @@
     "          --wpp                  : Enable wavefront parallel processing\n"
     "          --owf <integer>|auto   : Number of parallel frames to process. 0 to disable.\n"
     "\n"
-    "  Slices:\n"
-    "          --slice-addresses <string>|u<int>: \n"
+    /*"  Slices:\n"
+    "          --slice-addresses <string>|u<int> :\n"
     "                                   Specifies a comma separated list of LCU\n"

kvazaar-0.8.3.tar.gz/src/cli.h -> kvazaar-1.0.0.tar.gz/src/cli.h Changed

kvazaar-0.8.3.tar.gz/src/context.c -> kvazaar-1.0.0.tar.gz/src/context.c Changed

@@ -20,114 +20,114 @@
 
 #include "context.h"
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "tables.h"
 
-#include "encoder.h"
 
+static const uint8_t INIT_SAO_MERGE_FLAG[3] = { 153, 153, 153 };
+static const uint8_t INIT_SAO_TYPE_IDX[3] = { 160, 185, 200 };
 
-// stuff
-
-const uint8_t kvz_INIT_SAO_MERGE_FLAG[3] = { 153, 153, 153 };
-const uint8_t kvz_INIT_SAO_TYPE_IDX[3] = { 160, 185, 200 };
-
-const uint8_t kvz_INIT_QT_ROOT_CBF[3][1] = {
+static const uint8_t INIT_QT_ROOT_CBF[3][1] = {
   {  79, },
   {  79, },
   { CNU, },
 };
 
-const uint8_t kvz_INIT_MVP_IDX[3][2] = {
+static const uint8_t INIT_MVP_IDX[3][2] = {
   { 168,  CNU, },
   { 168,  CNU, },
   { CNU,  CNU, },
 };
 
-const uint8_t kvz_INIT_REF_PIC[3][2] = {
+static const uint8_t INIT_REF_PIC[3][2] = {
   { 153,  153 },
   { 153,  153 },
   { CNU,  CNU },
 };
 
-const uint8_t kvz_INIT_MVD[3][2] = {
+static const uint8_t INIT_MVD[3][2] = {
   { 169,  198, },
   { 140,  198, },
   { CNU,  CNU, },
 };
 
-const uint8_t kvz_INIT_MERGE_FLAG_EXT[3][1] = {
+static const uint8_t INIT_MERGE_FLAG_EXT[3][1] = {
   { 154, },
   { 110, },
   { CNU, },
 };
 
-const uint8_t kvz_INIT_MERGE_IDX_EXT[3][1] = {
+static const uint8_t INIT_MERGE_IDX_EXT[3][1] = {
   { 137, },
   { 122, },
   { CNU, },
 };
 
-const uint8_t kvz_INIT_SKIP_FLAG[3][3] =  {
+static const uint8_t INIT_CU_TRANSQUANT_BYPASS[3][1] = {
+  { 154, },
+  { 154, },
+  { 154, },
+};
+
+static const uint8_t INIT_SKIP_FLAG[3][3] =  {
   { 197,  185,  201, },
   { 197,  185,  201, },
   { CNU,  CNU,  CNU, },
 };
 
-const uint8_t kvz_INIT_PRED_MODE[3][1] = {
+static const uint8_t INIT_PRED_MODE[3][1] = {
   { 134, },
   { 149, },
   { CNU, },
 };
 
 
-const uint8_t kvz_INIT_PART_SIZE[3][4] = {
+static const uint8_t INIT_PART_SIZE[3][4] = {
   { 154,  139,  CNU,  CNU, },
   { 154,  139,  CNU,  CNU, },
   { 184,  CNU,  CNU,  CNU, },
 };
 
-const uint8_t  kvz_INIT_SPLIT_FLAG[3][3] = {
+static const uint8_t  INIT_SPLIT_FLAG[3][3] = {
   { 107,  139,  126 },
   { 107,  139,  126 },
   { 139,  141,  157 },
 };
 
-const uint8_t kvz_INIT_INTRA_PRED_MODE[3] = {
+static const uint8_t INIT_INTRA_PRED_MODE[3] = {
   183, 154, 184
 };
 
-const uint8_t kvz_INIT_CHROMA_PRED_MODE[3][2] = {
+static const uint8_t INIT_CHROMA_PRED_MODE[3][2] = {
   { 152,  139 },
   { 152,  139 },
   {  63,  139 },
 };
 
-const uint8_t kvz_INIT_INTER_DIR[3][5] = {
+static const uint8_t INIT_INTER_DIR[3][5] = {
   {  95,  79,  63,  31,  31, },
   {  95,  79,  63,  31,  31, },
   { CNU, CNU, CNU, CNU, CNU, },
 };
 
-const uint8_t kvz_INIT_TRANS_SUBDIV_FLAG[3][3] = {
+static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][3] = {
   { 224,  167,  122 },
   { 124,  138,   94 },
   { 153,  138,  138 },
 };
 
-const uint8_t kvz_INIT_QT_CBF[3][8] = {
+static const uint8_t INIT_QT_CBF[3][8] = {
   { 153,  111,  CNU,  CNU,   149,   92,  167,  154 },
   { 153,  111,  CNU,  CNU,   149,  107,  167,  154 },
   { 111,  141,  CNU,  CNU,    94,  138,  182,  154 },
 };
 
-const uint8_t kvz_INIT_SIG_CG_FLAG[3][4] = {
+static const uint8_t INIT_SIG_CG_FLAG[3][4] = {
   { 121,  140,  61,  154  },
   { 121,  140,  61,  154 },
   {  91,  171,  134,  141  },
 };
 
-const uint8_t kvz_INIT_SIG_FLAG[3][42] = {
+static const uint8_t INIT_SIG_FLAG[3][42] = {
    {170,154,139,153,139,123,123, 63,124,166,
     183,140,136,153,154,166,183,140,136,153,
     154,166,183,140,136,153,154,170,153,138,
@@ -145,7 +145,7 @@
    139,111},
 };
 
-const uint8_t kvz_INIT_LAST[3][30] = {
+static const uint8_t INIT_LAST[3][30] = {
   { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
     108,  123,   93,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU  },
   { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
@@ -154,14 +154,14 @@
     108,  123,   63,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU,  CNU  },
 };
 
-const uint8_t kvz_INIT_ONE_FLAG[3][24] =
+static const uint8_t INIT_ONE_FLAG[3][24] =
 {
   {154,196,167,167,154,152,167,182,182,134,149,136,153,121,136,122,169,208,166,167,154,152,167,182},
   {154,196,196,167,154,152,167,182,182,134,149,136,153,121,136,137,169,194,166,167,154,167,137,182},
   {140, 92,137,138,140,152,138,139,153, 74,149, 92,139,107,122,152,140,179,166,182,140,227,122,197},
 };
 
-const uint8_t kvz_INIT_ABS_FLAG[3][6] =
+static const uint8_t INIT_ABS_FLAG[3][6] =
 {
   { 107,167, 91,107,107,167},
   { 107,167, 91,122,107,167},
@@ -209,74 +209,75 @@
   kvz_ctx_init(&cabac->ctx.transform_skip_model_luma, QP, INIT_TRANSFORMSKIP_FLAG[slice][0]);
   kvz_ctx_init(&cabac->ctx.transform_skip_model_chroma, QP, INIT_TRANSFORMSKIP_FLAG[slice][1]);
 
-  kvz_ctx_init(&cabac->ctx.sao_merge_flag_model, QP, kvz_INIT_SAO_MERGE_FLAG[slice]);
-  kvz_ctx_init(&cabac->ctx.sao_type_idx_model, QP, kvz_INIT_SAO_TYPE_IDX[slice]);
+  kvz_ctx_init(&cabac->ctx.sao_merge_flag_model, QP, INIT_SAO_MERGE_FLAG[slice]);
+  kvz_ctx_init(&cabac->ctx.sao_type_idx_model, QP, INIT_SAO_TYPE_IDX[slice]);
 
-  kvz_ctx_init(&cabac->ctx.cu_merge_flag_ext_model, QP, kvz_INIT_MERGE_FLAG_EXT[slice][0]);
-  kvz_ctx_init(&cabac->ctx.cu_merge_idx_ext_model, QP, kvz_INIT_MERGE_IDX_EXT[slice][0]);
-  kvz_ctx_init(&cabac->ctx.cu_pred_mode_model, QP, kvz_INIT_PRED_MODE[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_merge_flag_ext_model, QP, INIT_MERGE_FLAG_EXT[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_merge_idx_ext_model, QP, INIT_MERGE_IDX_EXT[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_pred_mode_model, QP, INIT_PRED_MODE[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_transquant_bypass, QP, INIT_CU_TRANSQUANT_BYPASS[slice][0]);
 
-  kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[0], QP, kvz_INIT_SKIP_FLAG[slice][0]);
-  kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[1], QP, kvz_INIT_SKIP_FLAG[slice][1]);
-  kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[2], QP, kvz_INIT_SKIP_FLAG[slice][2]);
+  kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[0], QP, INIT_SKIP_FLAG[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[1], QP, INIT_SKIP_FLAG[slice][1]);
+  kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[2], QP, INIT_SKIP_FLAG[slice][2]);
 
-  kvz_ctx_init(&cabac->ctx.split_flag_model[0], QP, kvz_INIT_SPLIT_FLAG[slice][0]);
-  kvz_ctx_init(&cabac->ctx.split_flag_model[1], QP, kvz_INIT_SPLIT_FLAG[slice][1]);
-  kvz_ctx_init(&cabac->ctx.split_flag_model[2], QP, kvz_INIT_SPLIT_FLAG[slice][2]);
+  kvz_ctx_init(&cabac->ctx.split_flag_model[0], QP, INIT_SPLIT_FLAG[slice][0]);
+  kvz_ctx_init(&cabac->ctx.split_flag_model[1], QP, INIT_SPLIT_FLAG[slice][1]);
+  kvz_ctx_init(&cabac->ctx.split_flag_model[2], QP, INIT_SPLIT_FLAG[slice][2]);
 
-  kvz_ctx_init(&cabac->ctx.intra_mode_model, QP, kvz_INIT_INTRA_PRED_MODE[slice]);
+  kvz_ctx_init(&cabac->ctx.intra_mode_model, QP, INIT_INTRA_PRED_MODE[slice]);

kvazaar-0.8.3.tar.gz/src/context.h -> kvazaar-1.0.0.tar.gz/src/context.h Changed

kvazaar-0.8.3.tar.gz/src/cu.c -> kvazaar-1.0.0.tar.gz/src/cu.c Changed

@@ -24,6 +24,7 @@
 #include "cu.h"
 #include "threads.h"
 
+
 /**
  * \brief Number of PUs in a CU.
  *
@@ -76,6 +77,13 @@
   { {3, 4}, {1, 4}                 }, // nRx2N
 };
 
+
+#define BLIT_COEFF_CASE(n) case n:\
+  for (y = 0; y < n; ++y) {\
+    memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(coeff_t));\
+  }\
+  break;
+
 void kvz_coefficients_blit(const coeff_t * const orig, coeff_t * const dst,
                          const unsigned width, const unsigned height,
                          const unsigned orig_stride, const unsigned dst_stride)
@@ -84,52 +92,11 @@
   
   int nxn_width = (width == height) ? width : 0;
   switch (nxn_width) {
-    case 4:
-      *(int64_t*)&dst[dst_stride*0] = *(int64_t*)&orig[orig_stride*0];
-      *(int64_t*)&dst[dst_stride*1] = *(int64_t*)&orig[orig_stride*1];
-      *(int64_t*)&dst[dst_stride*2] = *(int64_t*)&orig[orig_stride*2];
-      *(int64_t*)&dst[dst_stride*3] = *(int64_t*)&orig[orig_stride*3];
-      break;
-    case 8:
-#define KVZ_COPY_ROW_8(row_num) \
-*(int64_t*)&dst[dst_stride*(row_num)] = *(int64_t*)&orig[orig_stride*(row_num)]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 4] = *(int64_t*)&orig[orig_stride*(row_num) + 4];
-      
-      KVZ_COPY_ROW_8(0);
-      KVZ_COPY_ROW_8(1);
-      KVZ_COPY_ROW_8(2);
-      KVZ_COPY_ROW_8(3);
-      KVZ_COPY_ROW_8(4);
-      KVZ_COPY_ROW_8(5);
-      KVZ_COPY_ROW_8(6);
-      KVZ_COPY_ROW_8(7);
-      break;
-#undef KVZ_COPY_ROW_8
-          case 16:
-#define KVZ_COPY_ROW_16(row_num) \
-*(int64_t*)&dst[dst_stride*(row_num)] = *(int64_t*)&orig[orig_stride*(row_num)]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 4] = *(int64_t*)&orig[orig_stride*(row_num) + 4]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 8] = *(int64_t*)&orig[orig_stride*(row_num) + 8]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 12] = *(int64_t*)&orig[orig_stride*(row_num) + 12];
-      
-      KVZ_COPY_ROW_16(0);
-      KVZ_COPY_ROW_16(1);
-      KVZ_COPY_ROW_16(2);
-      KVZ_COPY_ROW_16(3);
-      KVZ_COPY_ROW_16(4);
-      KVZ_COPY_ROW_16(5);
-      KVZ_COPY_ROW_16(6);
-      KVZ_COPY_ROW_16(7);
-      KVZ_COPY_ROW_16(8);
-      KVZ_COPY_ROW_16(9);
-      KVZ_COPY_ROW_16(10);
-      KVZ_COPY_ROW_16(11);
-      KVZ_COPY_ROW_16(12);
-      KVZ_COPY_ROW_16(13);
-      KVZ_COPY_ROW_16(14);
-      KVZ_COPY_ROW_16(15);
-      break;
-#undef KVZ_COPY_ROW_16
+    BLIT_COEFF_CASE(4)
+    BLIT_COEFF_CASE(8)
+    BLIT_COEFF_CASE(16)
+    BLIT_COEFF_CASE(32)
+    BLIT_COEFF_CASE(64)
   default:
     for (y = 0; y < height; ++y) {
       memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(coeff_t));
@@ -138,43 +105,122 @@
   }
 }
 
-unsigned kvz_coefficients_calc_abs(const coeff_t *const buf, const int buf_stride,
-                        const int width)
+cu_info_t* kvz_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px)
 {
-  int sum = 0;
-  int y, x;
+  return (cu_info_t*) kvz_cu_array_at_const(cua, x_px, y_px);
+}
 
-  for (y = 0; y < width; ++y) {
-    for (x = 0; x < width; ++x) {
-      sum += abs(buf[x + y * buf_stride]);
-    }
-  }
 
-  return sum;
+const cu_info_t* kvz_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px)
+{
+  assert(x_px < cua->width);
+  assert(y_px < cua->height);
+  return &(cua)->data[(x_px >> 2) + (y_px >> 2) * ((cua)->width >> 2)];
 }
 
-cu_array_t * kvz_cu_array_alloc(const int width_in_scu, const int height_in_scu) {
-  unsigned cu_array_size = height_in_scu * width_in_scu;
-  cu_array_t *cua;
-  cua = MALLOC(cu_array_t, 1);
-  cua->data = (cu_info_t*)malloc(sizeof(cu_info_t) * cu_array_size);
+
+/**
+ * \brief Allocate a CU array.
+ *
+ * \param width   width of the array in luma pixels
+ * \param height  height of the array in luma pixels
+ */
+cu_array_t * kvz_cu_array_alloc(const int width, const int height) {
+  cu_array_t *cua = MALLOC(cu_array_t, 1);
+
+  // Convert pixel dimensions to 4-pixel cell units. Adding 15 before the
+  // shift rounds up and leaves three extra cells of slack per dimension.
+  const int width_scu  = (width  + 15) >> 2;
+  const int height_scu = (height + 15) >> 2;
+  assert(width_scu  * 16 >= width);
+  assert(height_scu * 16 >= height);
+  const unsigned cu_array_size = width_scu * height_scu;
+  cua->data = calloc(cu_array_size, sizeof(cu_info_t));
+  cua->width  = width_scu  << 2;
+  cua->height = height_scu << 2;
   cua->refcount = 1;
-  FILL_ARRAY(cua->data, 0, cu_array_size);
+
   return cua;
 }
 
+
 int kvz_cu_array_free(cu_array_t * const cua)
 {
   int32_t new_refcount;
   if (!cua) return 1;
-  
+
   new_refcount = KVZ_ATOMIC_DEC(&(cua->refcount));
   //Still we have some references, do nothing
   if (new_refcount > 0) return 1;
-  
+
   FREE_POINTER(cua->data);
   free(cua);
 
   return 1;
 }
 
+
+/**
+ * \brief Copy part of a cu array to another cu array.
+ *
+ * All values are in luma pixels.
+ *
+ * \param dst     destination array
+ * \param dst_x   x-coordinate of the left edge of the copied area in dst
+ * \param dst_y   y-coordinate of the top edge of the copied area in dst
+ * \param src     source array
+ * \param src_x   x-coordinate of the left edge of the copied area in src
+ * \param src_y   y-coordinate of the top edge of the copied area in src
+ * \param width   width of the area to copy
+ * \param height  height of the area to copy
+ */
+void kvz_cu_array_copy(cu_array_t* dst,       int dst_x, int dst_y,
+                       const cu_array_t* src, int src_x, int src_y,
+                       int width, int height)
+{
+  // Convert values from pixel coordinates to array indices.
+  int src_stride = src->width >> 2;
+  int dst_stride = dst->width >> 2;
+  const cu_info_t* src_ptr = &src->data[(src_x >> 2) + (src_y >> 2) * src_stride];
+  cu_info_t* dst_ptr       = &dst->data[(dst_x >> 2) + (dst_y >> 2) * dst_stride];
+
+  // Number of bytes to copy per row.
+  const size_t row_size = sizeof(cu_info_t) * (width >> 2);
+
+  width = MIN(width,   MIN(src->width  - src_x, dst->width  - dst_x));
+  height = MIN(height, MIN(src->height - src_y, dst->height - dst_y));
+
+  assert(src_x + width  <= src->width);
+  assert(src_y + height <= src->height);
+  assert(dst_x + width  <= dst->width);
+  assert(dst_y + height <= dst->height);
+
+  for (int i = 0; i < (height >> 2); ++i) {
+    memcpy(dst_ptr, src_ptr, row_size);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/**
+ * \brief Copy an lcu to a cu array.

kvazaar-0.8.3.tar.gz/src/cu.h -> kvazaar-1.0.0.tar.gz/src/cu.h Changed

@@ -26,16 +26,21 @@
  * Coding Unit data structure and related functions.
  */
 
-#include "global.h"
-
+#include "global.h" // IWYU pragma: keep
 #include "image.h"
+#include "kvazaar.h"
 
 
 //Cu stuff
 //////////////////////////////////////////////////////////////////////////
 // CONSTANTS
 
-typedef enum { CU_NOTSET = 0, CU_PCM, CU_SKIP, CU_SPLIT, CU_INTRA, CU_INTER } cu_type_t;
+typedef enum {
+  CU_NOTSET = 0,
+  CU_INTRA  = 1,
+  CU_INTER  = 2,
+  CU_PCM    = 3,
+} cu_type_t;
 
 typedef enum {
   SIZE_2Nx2N = 0,
@@ -106,46 +111,49 @@
   int y;
 } vector2d_t;
 
-typedef struct
-{
-  uint8_t y;
-  uint8_t u;
-  uint8_t v;
-} cu_cbf_t;
-
 /**
  * \brief Struct for CU info
  */
 typedef struct
 {
-  int8_t type;       //!< \brief block type, CU_INTER / CU_INTRA
-  int8_t depth;      //!< \brief depth / size of this block
-  int8_t part_size;  //!< \brief Currently only 2Nx2N, TODO: AMP/SMP/NxN parts
-  int8_t tr_depth;   //!< \brief transform depth
-  int8_t coded;      //!< \brief flag to indicate this block is coded and reconstructed
-  int8_t skipped;    //!< \brief flag to indicate this block is skipped
-  int8_t merged;     //!< \brief flag to indicate this block is merged
-  int8_t merge_idx;  //!< \brief merge index
-
-  cu_cbf_t cbf;
-  struct {
-    int8_t mode;
-    int8_t mode_chroma;
-    int8_t tr_skip;    //!< \brief transform skip flag
-  } intra[4];
-  struct {
-    double cost;
-    uint32_t bitcost;
-    int16_t mv[2][2];  // \brief Motion vectors for L0 and L1
-    int16_t mvd[2][2]; // \brief Motion vector differences for L0 and L1
-    uint8_t mv_cand[2]; // \brief selected MV candidate
-    uint8_t mv_ref[2]; // \brief Index of the encoder_control.ref array.
-    uint8_t mv_ref_coded[2]; // \brief Coded and corrected index of ref picture
-    uint8_t mv_dir; // \brief Probably describes if mv_ref is L0, L1 or both (bi-pred)
-    int8_t mode;
-  } inter;
+  uint8_t type      : 2; //!< \brief block type, one of cu_type_t values
+  uint8_t depth     : 3; //!< \brief depth / size of this block
+  uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values
+  uint8_t tr_depth  : 3; //!< \brief transform depth
+  uint8_t skipped   : 1; //!< \brief flag to indicate this block is skipped
+  uint8_t merged    : 1; //!< \brief flag to indicate this block is merged
+  uint8_t merge_idx : 3; //!< \brief merge index
+
+  uint16_t cbf;
+
+  union {
+    struct {
+      int8_t mode;
+      int8_t mode_chroma;
+      int8_t tr_skip;    //!< \brief transform skip flag
+    } intra;
+    struct {
+      int16_t mv[2][2];  // \brief Motion vectors for L0 and L1
+      uint8_t mv_ref[2]; // \brief Index of the encoder_control.ref array.
+      uint8_t mv_cand0 : 3; // \brief selected MV candidate
+      uint8_t mv_cand1 : 3; // \brief selected MV candidate
+      uint8_t mv_dir   : 2; // \brief Probably describes if mv_ref is L0, L1 or both (bi-pred)
+    } inter;
+  };
 } cu_info_t;
 
+#define CU_GET_MV_CAND(cu_info_ptr, reflist) \
+  (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1)
+
+#define CU_SET_MV_CAND(cu_info_ptr, reflist, value) \
+  do { \
+    if ((reflist) == 0) { \
+      (cu_info_ptr)->inter.mv_cand0 = (value); \
+    } else { \
+      (cu_info_ptr)->inter.mv_cand1 = (value); \
+    } \
+  } while (0)
+
 #define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \
   "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \
   "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \
@@ -164,12 +172,20 @@
   (cu).inter.mv_cand, (cu).inter.mv_ref, (cu).inter.mv_dir, (cu).inter.mode)
 
 typedef struct {
-  cu_info_t *data;           //!< \brief cu_info data
-  int32_t refcount;        //!< \brief number of references in reflists to this cu_array
+  cu_info_t *data; //!< \brief cu array
+  int32_t width;    //!< \brief width of the array in pixels
+  int32_t height;   //!< \brief height of the array in pixels
+  int32_t refcount; //!< \brief number of references to this cu_array
 } cu_array_t;
 
-cu_array_t * kvz_cu_array_alloc(int width_in_scu, int height_in_scu);
+cu_array_t * kvz_cu_array_alloc(int width, int height);
 int kvz_cu_array_free(cu_array_t *cua);
+cu_info_t* kvz_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px);
+const cu_info_t* kvz_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px);
+void kvz_cu_array_copy(cu_array_t* dst,       int dst_x, int dst_y,
+                       const cu_array_t* src, int src_x, int src_y,
+                       int width, int height);
+
 
 /**
  * \brief Return the 7 lowest-order bits of the pixel coordinate.
@@ -179,9 +195,10 @@
  */
 #define SUB_SCU(xy) ((xy) & (LCU_WIDTH - 1))
 
-#define LCU_CU_WIDTH 8
-#define LCU_T_CU_WIDTH 9
-#define LCU_CU_OFFSET 10
+#define LCU_CU_WIDTH 16
+#define LCU_T_CU_WIDTH (LCU_CU_WIDTH + 1)
+#define LCU_CU_OFFSET (LCU_T_CU_WIDTH + 1)
+#define SCU_WIDTH (LCU_WIDTH / LCU_CU_WIDTH)
 
 // Width from top left of the LCU, so +1 for ref buffer size.
 #define LCU_REF_PX_WIDTH (LCU_WIDTH + LCU_WIDTH / 2)
@@ -217,43 +234,34 @@
   lcu_coeff_t coeff; //!< LCU coefficients
 
   /**
-   * A 9x9 CU array for the LCU, +1 CU.
-   * - Top reference CUs on row 0.
-   * - Left reference CUs on column 0.
-   * - All of LCUs CUs on 1:9, 1:9.
-   * - Top right reference CU on the last slot.
+   * A 17x17 CU array, plus the top right reference CU.
+   * - Top reference CUs at indices [0,16] (row 0).
+   * - Left reference CUs at indices 17*n where n is in [0,16] (column 0).
+   * - All CUs of this LCU at indices 17*y + x where x,y are in [1,16].
+   * - Top right reference CU at the last index.
+   *
+   * The figure below shows how the indices map to CU locations.
    *
    \verbatim
 
-      .-- left reference CUs
-      v
-       0 |  1  2  3  4  5  6  7  8 | 81 <-- top reference CUs
-     ----+-------------------------+----
-       9 | 10 11 12 13 14 15 16 17 |
-      18 | 19 20 21 22 23 24 25 26 <-- this LCU
-      27 | 28 29 30 31 32 33 34 35 |
-      36 | 37 38 39 40 41 42 43 44 |
-      45 | 46 47 48 49 50 51 52 53 |
-      54 | 55 56 57 58 59 60 61 62 |
-      63 | 64 65 66 67 68 69 70 71 |
-      72 | 73 74 75 76 77 78 79 80 |
-     ----+-------------------------+----
+       .-- left reference CUs
+       v
+        0 |   1   2  . . .  16 | 289 <-- top reference CUs
+     -----+--------------------+----
+       17 |  18  19  . . .  33 |
+       34 |  35  36  . . .  50 <-- this LCU
+        . |   .   .  .       . |
+        . |   .   .    .     . |
+        . |   .   .      .   . |
+      272 | 273 274  . . . 288 |
+     -----+--------------------+----
 
    \endverbatim
    */
-  cu_info_t cu[9*9+1];
+  cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1];
 } lcu_t;
 
-/**
- * \brief Return pointer to a given CU.
- *
- * \param lcu   pointer to the containing LCU

kvazaar-0.8.3.tar.gz/src/encmain.c -> kvazaar-1.0.0.tar.gz/src/encmain.c Changed

@@ -27,24 +27,24 @@
 /* The following two defines must be located before the inclusion of any system header files. */
 #define WINVER       0x0500
 #define _WIN32_WINNT 0x0500
-#include <io.h>       /* _setmode() */
 #include <fcntl.h>    /* _O_BINARY */
+#include <io.h>       /* _setmode() */
 #endif
 
-#include "global.h"
-
-#include "kvazaar_internal.h"
-
 #include <math.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <time.h>
+#include <time.h> // IWYU pragma: keep for CLOCKS_PER_SEC
 
 #include "checkpoint.h"
-#include "global.h"
-#include "encoder.h"
 #include "cli.h"
+#include "encoder.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "kvazaar_internal.h"
+#include "threads.h"
 #include "yuv_io.h"
 
 /**
@@ -98,14 +98,15 @@
  */
 static void compute_psnr(const kvz_picture *const src,
                          const kvz_picture *const rec,
-                         double psnr[NUM_COLORS])
+                         double psnr[3])
 {
   assert(src->width  == rec->width);
   assert(src->height == rec->height);
 
   int32_t pixels = src->width * src->height;
+  int colors = rec->chroma_format == KVZ_CSP_400 ? 1 : 3;
 
-  for (int32_t c = 0; c < NUM_COLORS; ++c) {
+  for (int32_t c = 0; c < colors; ++c) {
     int32_t num_pixels = pixels;
     if (c != COLOR_Y) {
       num_pixels >>= 2;
@@ -123,16 +124,20 @@
 }
 
 typedef struct {
-  FILE* input;
+  // Mutexes for synchronization.
   pthread_mutex_t* input_mutex;
   pthread_mutex_t* main_thread_mutex;
 
-  kvz_picture **img_in;
-  cmdline_opts_t *opts;
-  encoder_control_t *encoder;
-  uint8_t padding_x;
-  uint8_t padding_y;
-  const kvz_api * api;
+  // Parameters passed from main thread to input thread.
+  FILE* input;
+  const kvz_api *api;
+  const cmdline_opts_t *opts;
+  const encoder_control_t *encoder;
+  const uint8_t padding_x;
+  const uint8_t padding_y;
+
+  // Picture and thread status passed from input thread to main thread.
+  kvz_picture *img_in;
   int retval;
 } input_handler_args;
 
@@ -160,6 +165,7 @@
 
   input_handler_args* args = (input_handler_args*)in_args;
   kvz_picture *frame_in = NULL;
+  int retval = RETVAL_RUNNING;
   int frames_read = 0;
 
   for (;;) {
@@ -169,49 +175,92 @@
     bool input_empty = !(args->opts->frames == 0 // number of frames to read is unknown
                          || frames_read < args->opts->frames); // not all frames have been read
     if (feof(args->input) || input_empty) {
-      goto exit_eof;
+      retval = RETVAL_EOF;
+      goto done;
     }
 
-    frame_in = args->api->picture_alloc(args->opts->config->width + args->padding_x, args->opts->config->height + args->padding_y);
-        
+    enum kvz_chroma_format csp = KVZ_FORMAT2CSP(args->opts->config->input_format);
+    frame_in = args->api->picture_alloc_csp(csp,
+                                            args->opts->config->width  + args->padding_x,
+                                            args->opts->config->height + args->padding_y);
+
     if (!frame_in) {
       fprintf(stderr, "Failed to allocate image.\n");
-      goto exit_failure;
+      retval = RETVAL_FAILURE;
+      goto done;
     }
 
-    if (!yuv_io_read(args->input, args->opts->config->width, args->opts->config->height, frame_in)) {
+    bool read_success = yuv_io_read(args->input, 
+                                    args->opts->config->width,
+                                    args->opts->config->height,
+                                    args->encoder->cfg->input_bitdepth,
+                                    args->encoder->bitdepth,
+                                    frame_in);
+    if (!read_success) {
       // reading failed
       if (feof(args->input)) {
-        goto exit_eof;
+        // When looping input, re-open the file and re-read data.
+        if (args->opts->loop_input && args->input != stdin) {
+          fclose(args->input);
+          args->input = fopen(args->opts->input, "rb");
+          if (args->input == NULL)
+          {
+            fprintf(stderr, "Could not re-open input file, shutting down!\n");
+            retval = RETVAL_FAILURE;
+            goto done;
+          }
+          bool read_success = yuv_io_read(args->input,
+                                          args->opts->config->width,
+                                          args->opts->config->height,
+                                          args->encoder->cfg->input_bitdepth,
+                                          args->encoder->bitdepth,
+                                          frame_in);
+          if (!read_success) {
+            fprintf(stderr, "Could not re-open input file, shutting down!\n");
+            retval = RETVAL_FAILURE;
+            goto done;
+          }
+        } else {
+          retval = RETVAL_EOF;
+          goto done;
+        }
       } else {
         fprintf(stderr, "Failed to read a frame %d\n", frames_read);
-        goto exit_failure;
+        retval = RETVAL_FAILURE;
+        goto done;
       }
     }
 
+    frames_read++;
+
     if (args->encoder->cfg->source_scan_type != 0) {
       // Set source scan type for frame, so that it will be turned into fields.
       frame_in->interlacing = args->encoder->cfg->source_scan_type;
     }
-    args->img_in[frames_read & 1] = frame_in;
-    frame_in = NULL;
-
-    frames_read++;
 
-    // Wait until main thread is ready to receive input and then release main thread
+    // Wait until main thread is ready to receive the next frame.
     PTHREAD_LOCK(args->input_mutex);
+    args->img_in = frame_in;
+    args->retval = retval;
+    // Unlock main_thread_mutex to notify main thread that the new img_in
+    // and retval have been placed to args.
     PTHREAD_UNLOCK(args->main_thread_mutex);
+
+    frame_in = NULL;
   }
 
-exit_eof:
-  args->retval = RETVAL_EOF;  
-  args->img_in[frames_read & 1] = NULL;
-exit_failure:
-  // Do some cleaning up  
+done:
+  // Wait until main thread is ready to receive the next frame.
+  PTHREAD_LOCK(args->input_mutex);
+  args->img_in = NULL;
+  args->retval = retval;
+  // Unlock main_thread_mutex to notify main thread that the new img_in
+  // and retval have been placed to args.
+  PTHREAD_UNLOCK(args->main_thread_mutex);
+
+  // Do some cleaning up.
   args->api->picture_free(frame_in);
-  if (!args->retval) {
-    args->retval = RETVAL_FAILURE;
-  }
+
   pthread_exit(NULL);
   return 0;
 }

kvazaar-1.0.0.tar.gz/src/encode_coding_tree.c Added

@@ -0,0 +1,1083 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "encode_coding_tree.h"
+
+#include "cabac.h"
+#include "context.h"
+#include "cu.h"
+#include "encoder.h"
+#include "extras/crypto.h"
+#include "imagelist.h"
+#include "inter.h"
+#include "intra.h"
+#include "kvazaar.h"
+#include "kvz_math.h"
+#include "tables.h"
+#include "videoframe.h"
+
+/**
+ * \brief Encode (X,Y) position of the last significant coefficient
+ *
+ * \param lastpos_x   X component of last coefficient
+ * \param lastpos_y   Y component of last coefficient
+ * \param width       Block width
+ * \param height      Block height
+ * \param type        plane type / luminance or chrominance
+ * \param scan        scan type (diag, hor, ver)
+ *
+ * This method encodes the X and Y component within a block of the last
+ * significant coefficient.
+ */
+static void encode_last_significant_xy(encoder_state_t * const state,
+                                       uint8_t lastpos_x, uint8_t lastpos_y,
+                                       uint8_t width, uint8_t height,
+                                       uint8_t type, uint8_t scan)
+{
+  cabac_data_t * const cabac = &state->cabac;
+
+  const int index = kvz_math_floor_log2(width) - 2;
+  uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4);
+  uint8_t shift = type ? index : (index + 3) / 4;
+
+  cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma);
+  cabac_ctx_t *base_ctx_y = (type ? cabac->ctx.cu_ctx_last_y_chroma : cabac->ctx.cu_ctx_last_y_luma);
+
+  if (scan == SCAN_VER) {
+    SWAP(lastpos_x, lastpos_y, uint8_t);
+  }
+
+  const int group_idx_x = g_group_idx[lastpos_x];
+  const int group_idx_y = g_group_idx[lastpos_y];
+
+  // x prefix
+  for (int last_x = 0; last_x < group_idx_x; last_x++) {
+    cabac->cur_ctx = &base_ctx_x[ctx_offset + (last_x >> shift)];
+    CABAC_BIN(cabac, 1, "last_sig_coeff_x_prefix");
+  }
+  if (group_idx_x < g_group_idx[width - 1]) {
+    cabac->cur_ctx = &base_ctx_x[ctx_offset + (group_idx_x >> shift)];
+    CABAC_BIN(cabac, 0, "last_sig_coeff_x_prefix");
+  }
+
+  // y prefix
+  for (int last_y = 0; last_y < group_idx_y; last_y++) {
+    cabac->cur_ctx = &base_ctx_y[ctx_offset + (last_y >> shift)];
+    CABAC_BIN(cabac, 1, "last_sig_coeff_y_prefix");
+  }
+  if (group_idx_y < g_group_idx[height - 1]) {
+    cabac->cur_ctx = &base_ctx_y[ctx_offset + (group_idx_y >> shift)];
+    CABAC_BIN(cabac, 0, "last_sig_coeff_y_prefix");
+  }
+
+  // last_sig_coeff_x_suffix
+  if (group_idx_x > 3) {
+    const int suffix = lastpos_x - g_min_in_group[group_idx_x];
+    const int bits = (group_idx_x - 2) / 2;
+    CABAC_BINS_EP(cabac, suffix, bits, "last_sig_coeff_x_suffix");
+  }
+
+  // last_sig_coeff_y_suffix
+  if (group_idx_y > 3) {
+    const int suffix = lastpos_y - g_min_in_group[group_idx_y];
+    const int bits = (group_idx_y - 2) / 2;
+    CABAC_BINS_EP(cabac, suffix, bits, "last_sig_coeff_y_suffix");
+  }
+}
+
+void kvz_encode_coeff_nxn(encoder_state_t * const state,
+                          coeff_t *coeff,
+                          uint8_t width,
+                          uint8_t type,
+                          int8_t scan_mode,
+                          int8_t tr_skip)
+{
+  const encoder_control_t * const encoder = state->encoder_control;
+  cabac_data_t * const cabac = &state->cabac;
+  int c1 = 1;
+  uint8_t last_coeff_x = 0;
+  uint8_t last_coeff_y = 0;
+  int32_t i;
+  uint32_t sig_coeffgroup_flag[8 * 8] = { 0 };
+
+  int8_t be_valid = encoder->sign_hiding;
+  int32_t scan_pos_sig;
+  uint32_t go_rice_param = 0;
+  uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
+
+  // CONSTANTS
+  const uint32_t num_blk_side    = width >> TR_MIN_LOG2_SIZE;
+  const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
+  const uint32_t *scan           =
+    kvz_g_sig_last_scan[scan_mode][log2_block_size - 1];
+  const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
+
+  // Init base contexts according to block type
+  cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]);
+  cabac_ctx_t *baseCtx           = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) :
+                                 &(cabac->ctx.cu_sig_model_chroma[0]);
+
+  // Scan all coeff groups to find out which of them have coeffs.
+  // Populate sig_coeffgroup_flag with that info.
+
+  unsigned sig_cg_cnt = 0;
+  for (int cg_y = 0; cg_y < width / 4; ++cg_y) {
+    for (int cg_x = 0; cg_x < width / 4; ++cg_x) {
+      unsigned cg_pos = cg_y * width * 4 + cg_x * 4;
+      for (int coeff_row = 0; coeff_row < 4; ++coeff_row) {
+        // Load four 16-bit coeffs and see if any of them are non-zero.
+        unsigned coeff_pos = cg_pos + coeff_row * width;
+        uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]);
+        if (four_coeffs) {
+          ++sig_cg_cnt;
+          unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE;
+          unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE;
+          sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1;
+          break;
+        }
+      }
+    }
+  }
+
+  // Rest of the code assumes at least one non-zero coeff.
+  assert(sig_cg_cnt > 0);
+
+  // Find the last coeff group by going backwards in scan order.
+  unsigned scan_cg_last = num_blk_side * num_blk_side - 1;
+  while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) {
+    --scan_cg_last;
+  }
+
+  // Find the last coeff by going backwards in scan order.
+  unsigned scan_pos_last = scan_cg_last * 16 + 15;
+  while (!coeff[scan[scan_pos_last]]) {
+    --scan_pos_last;
+  }
+
+  int pos_last = scan[scan_pos_last];
+
+  // transform skip flag
+  if(width == 4 && encoder->trskip_enable) {
+    cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma);
+    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
+  }
+
+  last_coeff_x = pos_last & (width - 1);
+  last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
+
+  // Code last_coeff_x and last_coeff_y
+  encode_last_significant_xy(state, last_coeff_x, last_coeff_y, width, width,
+                             type, scan_mode);
+
+  scan_pos_sig  = scan_pos_last;
+
+  // significant_coeff_flag
+  for (i = scan_cg_last; i >= 0; i--) {
+    int32_t sub_pos        = i << 4; // LOG2_SCAN_SET_SIZE;
+    int32_t abs_coeff[16];
+    int32_t cg_blk_pos     = scan_cg[i];
+    int32_t cg_pos_y       = cg_blk_pos / num_blk_side;
+    int32_t cg_pos_x       = cg_blk_pos - (cg_pos_y * num_blk_side);
+
+    uint32_t coeff_signs   = 0;

kvazaar-1.0.0.tar.gz/src/encode_coding_tree.h Added

@@ -0,0 +1,44 @@
+#ifndef ENCODE_CODING_TREE_H_
+#define ENCODE_CODING_TREE_H_
+
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \file
+ * Functions for writing the coding quadtree and related syntax.
+ */
+
+#include "encoderstate.h"
+#include "global.h"
+
+void kvz_encode_coding_tree(encoder_state_t *state,
+                            uint16_t x_ctb,
+                            uint16_t y_ctb,
+                            uint8_t depth);
+
+void kvz_encode_coeff_nxn(encoder_state_t *state,
+                          coeff_t *coeff,
+                          uint8_t width,
+                          uint8_t type,
+                          int8_t scan_mode,
+                          int8_t tr_skip);
+
+#endif // ENCODE_CODING_TREE_H_

kvazaar-0.8.3.tar.gz/src/encoder.c -> kvazaar-1.0.0.tar.gz/src/encoder.c Changed

@@ -22,22 +22,10 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
-#include <assert.h>
 
-#include "tables.h"
 #include "cfg.h"
-#include "cabac.h"
-#include "image.h"
-#include "nal.h"
-#include "context.h"
-#include "transform.h"
-#include "intra.h"
-#include "inter.h"
-#include "filter.h"
-#include "search.h"
-#include "sao.h"
-#include "rdo.h"
+#include "strategyselector.h"
+
 
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
 
@@ -49,57 +37,90 @@
 
 static int select_owf_auto(const kvz_config *const cfg)
 {
-  if (cfg->wpp) {
-    // If wpp is on, select owf such that less than 15% of the
-    // frame is covered by the are threads can not work at the same time.
-    const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
-    const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
+  if (cfg->intra_period == 1) {
+    if (cfg->wpp) {
+      // If wpp is on, select owf such that less than 15% of the frame is
+      // covered by the area where threads cannot work at the same time.
+      const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
+      const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
+
+      // Find the largest number of threads per frame that satisfies the
+      // condition: wpp start/stop inefficiency takes up less than 15%
+      // of the frame area.
+      int threads_per_frame = 1;
+      const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
+      while ((threads_per_frame + 1) * 2 < lcu_width &&
+        threads_per_frame + 1 < lcu_height &&
+        size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) {
+        ++threads_per_frame;
+      }
 
-    // Find the largest number of threads per frame that satifies the
-    // the condition: wpp start/stop inefficiency takes up  less than 15%
-    // of frame area.
-    int threads_per_frame = 1;
-    const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
-    while ((threads_per_frame + 1) * 2 < lcu_width &&
-           threads_per_frame + 1 < lcu_height &&
-           size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold)
-    {
-      ++threads_per_frame;
-    }
+      const int threads = MAX(cfg->threads, 1);
+      const int frames = CEILDIV(threads, threads_per_frame);
 
-    const int threads = MAX(cfg->threads, 1);
-    const int frames = CEILDIV(threads, threads_per_frame);
+      // Convert from number of parallel frames to number of additional frames.
+      return CLIP(0, threads - 1, frames - 1);
+    } else {
+      // If wpp is not on, select owf such that there is enough
+      // tiles for twice the number of threads.
 
-    // Convert from number of parallel frames to number of additional frames.
-    return CLIP(0, threads - 1, frames - 1);
-  } else {
-    // If wpp is not on, select owf such that there is enough
-    // tiles for twice the number of threads.
+      int tiles_per_frame = cfg->tiles_width_count * cfg->tiles_height_count;
+      int threads = (cfg->threads > 1 ? cfg->threads : 1);
+      int frames = CEILDIV(threads * 4, tiles_per_frame);
 
-    int tiles_per_frame = 1;
-    if (cfg->tiles_width_count > 0) {
-      tiles_per_frame *= cfg->tiles_width_count + 1;
+      // Limit number of frames to 4/3 (about 1.33x) the number of threads
+      // for the case where there is only 1 tile per frame.
+      frames = CLIP(1, threads * 4 / 3, frames);
+      return frames - 1;
     }
-    if (cfg->tiles_height_count > 0) {
-      tiles_per_frame *= cfg->tiles_height_count + 1;
+  } else {
+    // Try and estimate a good number of parallel frames for inter.
+    const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
+    const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
+    int threads_per_frame = MIN(lcu_width / 2, lcu_height);
+    int threads = cfg->threads;
+
+    // If all threads fit into one frame, at least two parallel frames should
+    // be used to reduce the effect of WPP spin-up and wind-down.
+    int frames = 1;
+
+    while (threads > 0 && threads_per_frame > 0) {
+      frames += 1;
+      threads -= threads_per_frame;
+      threads_per_frame -= 2;
     }
-    int threads = (cfg->threads > 1 ? cfg->threads : 1);
-    int frames = CEILDIV(threads * 4, tiles_per_frame);
 
-    // Limit number of frames to 1.25x the number of threads for the case
-    // where there is only 1 tile per frame.
-    frames = CLIP(1, threads * 4 / 3, frames);
-    return frames - 1;
+    if (cfg->gop_lowdelay && cfg->gop_lp_definition.t > 1) {
+      // Temporal skipping makes every other frame very fast to encode so
+      // more parallel frames should be used.
+      frames *= 2;
+    }
+    return CLIP(0, cfg->threads * 2 - 1, frames - 1);
   }
 }
 
+
+static unsigned cfg_num_threads(void)
+{
+  unsigned cpus = kvz_g_hardware_flags.physical_cpu_count;
+  unsigned fake_cpus = kvz_g_hardware_flags.logical_cpu_count - cpus;
+
+  // Default to 4 if we don't know the number of CPUs.
+  if (cpus == 0) return 4;
+
+  // 1.5 times the number of physical cores seems to be a good compromise
+  // when hyperthreading is available on Haswell.
+  return cpus + fake_cpus / 2;
+}
+
+
 /**
  * \brief Allocate and initialize an encoder control structure.
  *
  * \param cfg   encoder configuration
  * \return      initialized encoder control or NULL on failure
  */
-encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) {
+encoder_control_t* kvz_encoder_control_init(kvz_config *const cfg) {
   encoder_control_t *encoder = NULL;
 
   if (!cfg) {
@@ -107,6 +128,20 @@
     goto init_failed;
   }
 
+  if (cfg->threads == -1) {
+    cfg->threads = cfg_num_threads();
+  }
+
+  if (cfg->gop_len > 0) {
+    if (cfg->tmvp_enable) {
+      cfg->tmvp_enable = false;
+      fprintf(stderr, "Disabling TMVP because GOP is used.\n");
+    }
+    if (cfg->gop_lowdelay) {
+      kvz_config_process_lp_gop(cfg);
+    }
+  }
+
   // Make sure that the parameters make sense.
   if (!kvz_config_validate(cfg)) {
     goto init_failed;
@@ -147,6 +182,8 @@
 
   encoder->bitdepth = KVZ_BIT_DEPTH;
 
+  encoder->chroma_format = KVZ_FORMAT2CSP(cfg->input_format);
+
   // deblocking filter
   encoder->deblock_enable    = 1;
   encoder->beta_offset_div2  = 0;
@@ -191,8 +228,8 @@
   }
 
   //Tiles
-  encoder->tiles_enable = encoder->cfg->tiles_width_count > 0 ||
-                          encoder->cfg->tiles_height_count > 0;
+  encoder->tiles_enable = encoder->cfg->tiles_width_count > 1 ||
+                          encoder->cfg->tiles_height_count > 1;
 
   {
     int i, j; //iteration variables
@@ -202,11 +239,11 @@
     //Temporary pointers to allow encoder fields to be const
     int32_t *tiles_col_width, *tiles_row_height, *tiles_ctb_addr_rs_to_ts, *tiles_ctb_addr_ts_to_rs, *tiles_tile_id, *tiles_col_bd, *tiles_row_bd;
 
-    if (encoder->cfg->tiles_width_count >= encoder->in.width_in_lcu) {
+    if (encoder->cfg->tiles_width_count > encoder->in.width_in_lcu) {
       fprintf(stderr, "Too many tiles (width)!\n");
       goto init_failed;

kvazaar-0.8.3.tar.gz/src/encoder.h -> kvazaar-1.0.0.tar.gz/src/encoder.h Changed

@@ -26,18 +26,12 @@
  * Initialization of encoder_control_t.
  */
 
-#include "global.h"
-
-#include "image.h"
-#include "bitstream.h"
-#include "cabac.h"
-#include "tables.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
 #include "scalinglist.h"
 #include "threadqueue.h"
 
 
-enum { FORMAT_400 = 0, FORMAT_420, FORMAT_422, FORMAT_444 };
-
 /* Encoder control options, the main struct */
 typedef struct encoder_control_t
 {
@@ -52,8 +46,6 @@
     int32_t height_in_lcu;
     int32_t real_width;  /*!< \brief real input picture width */
     int32_t real_height; /*!< \brief real input picture height */
-    int8_t video_format;
-    int8_t bitdepth;  /*!< \brief input bit depth (8,10) */
     int64_t pixels_per_pic;
     int8_t source_scan_type;
   } in;
@@ -66,6 +58,8 @@
   } me;
   
   int8_t bitdepth;
+  enum kvz_chroma_format chroma_format;
+
   int8_t tr_depth_intra;
 
   int8_t fme_level;
@@ -150,6 +144,8 @@
 
   bool sign_hiding;
 
+  bool implicit_rdpcm;
+
   //! Target average bits per picture.
   double target_avg_bppic;
 
@@ -161,7 +157,7 @@
 
 } encoder_control_t;
 
-encoder_control_t* kvz_encoder_control_init(const kvz_config *cfg);
+encoder_control_t* kvz_encoder_control_init(kvz_config *cfg);
 void kvz_encoder_control_free(encoder_control_t *encoder);
 
 void kvz_encoder_control_input_init(encoder_control_t *encoder, int32_t width, int32_t height);

kvazaar-0.8.3.tar.gz/src/encoder_state-bitstream.c -> kvazaar-1.0.0.tar.gz/src/encoder_state-bitstream.c Changed

@@ -20,12 +20,25 @@
 
 #include "encoder_state-bitstream.h"
 
-#include <string.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
+#include "bitstream.h"
+#include "cabac.h"
 #include "checkpoint.h"
+#include "cu.h"
+#include "encoder.h"
+#include "encoder_state-geometry.h"
 #include "encoderstate.h"
+#include "imagelist.h"
+#include "kvazaar.h"
+#include "kvz_math.h"
 #include "nal.h"
+#include "scalinglist.h"
+#include "tables.h"
+#include "threadqueue.h"
+#include "videoframe.h"
 
 
 static void encoder_state_write_bitstream_aud(encoder_state_t * const state)
@@ -33,8 +46,8 @@
   bitstream_t * const stream = &state->stream;
   kvz_nal_write(stream, KVZ_NAL_AUD_NUT, 0, 1);
 
-  uint8_t pic_type = state->global->slicetype == KVZ_SLICE_I ? 0
-                   : state->global->slicetype == KVZ_SLICE_P ? 1
+  uint8_t pic_type = state->frame->slicetype == KVZ_SLICE_I ? 0
+                   : state->frame->slicetype == KVZ_SLICE_P ? 1
                    :                                       2;
   WRITE_U(stream, pic_type, 3, "pic_type");
 
@@ -230,7 +243,7 @@
       encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
       encoder->vui.colormatrix != 2) {
     WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
-    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
+    WRITE_U(stream, encoder->vui.videoformat, 3, "chroma_format");
     WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
 
     if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
@@ -280,6 +293,33 @@
   //ENDIF
 }
 
+
+/**
+ * \brief Write the extension flags that end the sequence parameter set.
+ *
+ * When both implicit_rdpcm and lossless are set in the encoder config, the
+ * extension is signalled as present with only the range extension selected,
+ * and within it only implicit_rdpcm_enabled_flag is set. Otherwise a single
+ * zero sps_extension_present_flag is written.
+ *
+ * \param stream  bitstream to write to
+ * \param state   encoder state whose encoder_control->cfg is consulted
+ */
+static void encoder_state_write_bitstream_SPS_extension(bitstream_t *stream,
+                                                        encoder_state_t * const state)
+{
+  const int range_ext_needed = state->encoder_control->cfg->implicit_rdpcm &&
+                               state->encoder_control->cfg->lossless;
+
+  if (!range_ext_needed) {
+    // No extension data follows.
+    WRITE_U(stream, 0, 1, "sps_extension_present_flag");
+    return;
+  }
+
+  WRITE_U(stream, 1, 1, "sps_extension_present_flag");
+
+  // Select which extensions are present: only the range extension.
+  WRITE_U(stream, 1, 1, "sps_range_extension_flag");
+  WRITE_U(stream, 0, 1, "sps_multilayer_extension_flag");
+  WRITE_U(stream, 0, 1, "sps_3d_extension_flag");
+  WRITE_U(stream, 0, 5, "sps_extension_5bits");
+
+  // Range extension flags: everything off except implicit RDPCM.
+  WRITE_U(stream, 0, 1, "transform_skip_rotation_enabled_flag");
+  WRITE_U(stream, 0, 1, "transform_skip_context_enabled_flag");
+  WRITE_U(stream, 1, 1, "implicit_rdpcm_enabled_flag");
+  WRITE_U(stream, 0, 1, "explicit_rdpcm_enabled_flag");
+  WRITE_U(stream, 0, 1, "extended_precision_processing_flag");
+  WRITE_U(stream, 0, 1, "intra_smoothing_disabled_flag");
+  WRITE_U(stream, 0, 1, "high_precision_offsets_enabled_flag");
+  WRITE_U(stream, 0, 1, "persistent_rice_adaptation_enabled_flag");
+  WRITE_U(stream, 0, 1, "cabac_bypass_alignment_enabled_flag");
+}
+
 static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
                                                             encoder_state_t * const state)
 {
@@ -297,10 +337,9 @@
   encoder_state_write_bitstream_PTL(stream, state);
 
   WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
-  WRITE_UE(stream, encoder->in.video_format,
-           "chroma_format_idc");
+  WRITE_UE(stream, encoder->chroma_format, "chroma_format_idc");
 
-  if (encoder->in.video_format == 3) {
+  if (encoder->chroma_format == KVZ_CSP_444) {
     WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
   }
 
@@ -380,14 +419,14 @@
   //IF long_term_ref_pics_present
   //ENDIF
 
-  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
+  WRITE_U(stream, state->encoder_control->cfg->tmvp_enable, 1,
           "sps_temporal_mvp_enable_flag");
   WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
   WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
 
   encoder_state_write_bitstream_VUI(stream, state);
 
-  WRITE_U(stream, 0, 1, "sps_extension_flag");
+  encoder_state_write_bitstream_SPS_extension(stream, state);
 
   kvz_bitstream_add_rbsp_trailing_bits(stream);
 }
@@ -424,7 +463,7 @@
   WRITE_U(stream, 0, 1, "weighted_bipred_idc");
 
   //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
-  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
+  WRITE_U(stream, encoder->cfg->lossless, 1, "transquant_bypass_enable_flag");
   WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
   //wavefronts
   WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
@@ -566,7 +605,7 @@
 
   if (state->encoder_control->vui.frame_field_info_present_flag){
 
-    int8_t odd_picture = state->global->frame % 2;
+    int8_t odd_picture = state->frame->num % 2;
     int8_t pic_struct = 0; //0: progressive picture, 1: top field, 2: bottom field, 3...
     int8_t source_scan_type = 1; //0: interlaced, 1: progressive
 
@@ -630,16 +669,6 @@
   }
 }
 
-static int num_bitcount(unsigned int n) {
-  int pos = 0;
-  if (n >= 1<<16) { n >>= 16; pos += 16; }
-  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
-  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
-  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
-  if (n >= 1<< 1) {           pos +=  1; }
-  return ((n == 0) ? (-1) : pos);
-}
-
 void kvz_encoder_state_write_bitstream_slice_header(encoder_state_t * const state)
 {
   const encoder_control_t * const encoder = state->encoder_control;
@@ -648,22 +677,22 @@
   int ref_negative = 0;
   int ref_positive = 0;
   if (encoder->cfg->gop_len) {
-    for (j = 0; j < state->global->ref->used_size; j++) {
-      if (state->global->ref->pocs[j] < state->global->poc) {
+    for (j = 0; j < state->frame->ref->used_size; j++) {
+      if (state->frame->ref->pocs[j] < state->frame->poc) {
         ref_negative++;
       } else {
         ref_positive++;
       }
     }
-  } else ref_negative = state->global->ref->used_size;
+  } else ref_negative = state->frame->ref->used_size;
 
 #ifdef KVZ_DEBUG
   printf("=========== Slice ===========\n");
 #endif
   WRITE_U(stream, (state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
 
-  if (state->global->pictype >= KVZ_NAL_BLA_W_LP
-      && state->global->pictype <= KVZ_NAL_RSV_IRAP_VCL23) {
+  if (state->frame->pictype >= KVZ_NAL_BLA_W_LP
+      && state->frame->pictype <= KVZ_NAL_RSV_IRAP_VCL23) {
     WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
   }
 
@@ -674,7 +703,7 @@
     WRITE_UE(stream, state->slice->start_in_rs, "slice_segment_address");
   }
 
-  WRITE_UE(stream, state->global->slicetype, "slice_type");
+  WRITE_UE(stream, state->frame->slicetype, "slice_type");
 
   // if !entropy_slice_flag
 
@@ -682,12 +711,12 @@
       //WRITE_U(stream, 1, 1, "pic_output_flag");
     //end if
     //if( IdrPicFlag ) <- nal_unit_type == 5
-  if (state->global->pictype != KVZ_NAL_IDR_W_RADL
-      && state->global->pictype != KVZ_NAL_IDR_N_LP) {
+  if (state->frame->pictype != KVZ_NAL_IDR_W_RADL
+      && state->frame->pictype != KVZ_NAL_IDR_N_LP) {
     int last_poc = 0;
     int poc_shift = 0;
 
-      WRITE_U(stream, state->global->poc&0x1f, 5, "pic_order_cnt_lsb");
+      WRITE_U(stream, state->frame->poc&0x1f, 5, "pic_order_cnt_lsb");
       WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
       WRITE_UE(stream, ref_negative, "num_negative_pics");
       WRITE_UE(stream, ref_positive, "num_positive_pics");
@@ -697,9 +726,9 @@

kvazaar-0.8.3.tar.gz/src/encoder_state-bitstream.h -> kvazaar-1.0.0.tar.gz/src/encoder_state-bitstream.h Changed

kvazaar-0.8.3.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-1.0.0.tar.gz/src/encoder_state-ctors_dtors.c Changed

@@ -20,29 +20,41 @@
 
 #include "encoder_state-ctors_dtors.h"
 
+#include <stdio.h>
 #include <stdlib.h>
 
+#include "bitstream.h"
+#include "cabac.h"
+#include "cu.h"
+#include "encoder.h"
+#include "encoder_state-geometry.h"
 #include "encoderstate.h"
+#include "extras/crypto.h"
+#include "image.h"
+#include "imagelist.h"
+#include "kvazaar.h"
+#include "threadqueue.h"
+#include "videoframe.h"
 
 
-static int encoder_state_config_global_init(encoder_state_t * const state) {
-  state->global->ref = kvz_image_list_alloc(MAX_REF_PIC_COUNT);
-  if(!state->global->ref) {
+static int encoder_state_config_frame_init(encoder_state_t * const state) {
+  state->frame->ref = kvz_image_list_alloc(MAX_REF_PIC_COUNT);
+  if(!state->frame->ref) {
     fprintf(stderr, "Failed to allocate the picture list!\n");
     return 0;
   }
-  state->global->ref_list = REF_PIC_LIST_0;
-  state->global->frame = 0;
-  state->global->poc = 0;
-  state->global->total_bits_coded = 0;
-  state->global->cur_gop_bits_coded = 0;
-  state->global->rc_alpha = 3.2003;
-  state->global->rc_beta = -1.367;
+  state->frame->ref_list = REF_PIC_LIST_0;
+  state->frame->num = 0;
+  state->frame->poc = 0;
+  state->frame->total_bits_coded = 0;
+  state->frame->cur_gop_bits_coded = 0;
+  state->frame->rc_alpha = 3.2003;
+  state->frame->rc_beta = -1.367;
   return 1;
 }
 
-static void encoder_state_config_global_finalize(encoder_state_t * const state) {
-  kvz_image_list_destroy(state->global->ref);
+static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
+  kvz_image_list_destroy(state->frame->ref);
 }
 
 static int encoder_state_config_tile_init(encoder_state_t * const state, 
@@ -50,7 +62,7 @@
                                           const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
   
   const encoder_control_t * const encoder = state->encoder_control;
-  state->tile->frame = kvz_videoframe_alloc(width, height, 0);
+  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format);
   
   state->tile->frame->rec = NULL;
   
@@ -72,14 +84,20 @@
   
   state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
   
-  //Allocate buffers
-  //order by row of (LCU_WIDTH * frame->width_in_lcu) pixels
-  state->tile->hor_buf_search = kvz_yuv_t_alloc(LCU_WIDTH * state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu);
-  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
-  state->tile->ver_buf_search = kvz_yuv_t_alloc(LCU_WIDTH * state->tile->frame->height_in_lcu * state->tile->frame->width_in_lcu);
+  // hor_buf_search and ver_buf_search store a single row/col from each LCU row/col.
+  // Because these lines are independent, the chroma subsampling only matters in one
+  // of the directions: horizontally for hor_buf_search, vertically for ver_buf_search.
+  unsigned luma_size = LCU_WIDTH * state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu;
+  unsigned chroma_sizes_hor[] = { 0, luma_size / 2, luma_size / 2, luma_size };
+  unsigned chroma_sizes_ver[] = { 0, luma_size / 2, luma_size, luma_size };
+  unsigned chroma_size_hor = chroma_sizes_hor[state->encoder_control->chroma_format];
+  unsigned chroma_size_ver = chroma_sizes_ver[state->encoder_control->chroma_format];
+
+  state->tile->hor_buf_search = kvz_yuv_t_alloc(luma_size, chroma_size_hor);
+  state->tile->ver_buf_search = kvz_yuv_t_alloc(luma_size, chroma_size_ver);
   
   if (encoder->sao_enable) {
-    state->tile->hor_buf_before_sao = kvz_yuv_t_alloc(LCU_WIDTH * state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu);
+    state->tile->hor_buf_before_sao = kvz_yuv_t_alloc(luma_size, chroma_size_hor);
   } else {
     state->tile->hor_buf_before_sao = NULL;
   }
@@ -94,8 +112,14 @@
   } else {
     state->tile->wf_jobs = NULL;
   }
-  
   state->tile->id = encoder->tiles_tile_id[state->tile->lcu_offset_in_ts];
+  
+  state->tile->dbs_g = NULL;
+  if (state->encoder_control->cfg->crypto_features) {
+    state->tile->dbs_g = InitC();
+  }
+  state->tile->m_prev_pos = 0;
+
   return 1;
 }
 
@@ -107,7 +131,9 @@
   
   kvz_videoframe_free(state->tile->frame);
   state->tile->frame = NULL;
-  
+  if (state->encoder_control->cfg->crypto_features) {
+    DeleteCryptoC(state->tile->dbs_g);
+  }
   FREE_POINTER(state->tile->wf_jobs);
 }
 
@@ -227,7 +253,7 @@
   printf(" \"%p\" [\n", state);
   printf("  label = \"{encoder_state|");
   printf("+ type=%c\\l", state->type);
-  if (!state->parent || state->global != state->parent->global) {
+  if (!state->parent || state->frame != state->parent->global) {
     printf("|+ global\\l");
   }
   if (!state->parent || state->tile != state->parent->tile) {
@@ -274,7 +300,7 @@
   //
   //If parent_state is not NULL, the following variable should either be set to NULL,
   //in order to inherit from parent, or should point to a valid structure:
-  //child_state->global
+  //child_state->frame
   //child_state->tile
   //child_state->slice
   //child_state->wfrow
@@ -291,9 +317,9 @@
     const encoder_control_t * const encoder = child_state->encoder_control;
     child_state->type = ENCODER_STATE_TYPE_MAIN;
     assert(child_state->encoder_control);
-    child_state->global = MALLOC(encoder_state_config_global_t, 1);
-    if (!child_state->global || !encoder_state_config_global_init(child_state)) {
-      fprintf(stderr, "Could not initialize encoder_state->global!\n");
+    child_state->frame = MALLOC(encoder_state_config_frame_t, 1);
+    if (!child_state->frame || !encoder_state_config_frame_init(child_state)) {
+      fprintf(stderr, "Could not initialize encoder_state->frame!\n");
       return 0;
     }
     child_state->tile = MALLOC(encoder_state_config_tile_t, 1);
@@ -313,7 +339,7 @@
     }
   } else {
     child_state->encoder_control = parent_state->encoder_control;
-    if (!child_state->global) child_state->global = parent_state->global;
+    if (!child_state->frame) child_state->frame = parent_state->frame;
     if (!child_state->tile) child_state->tile = parent_state->tile;
     if (!child_state->slice) child_state->slice = parent_state->slice;
     if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow;
@@ -401,9 +427,9 @@
         //Create a slice
         new_child = &child_state->children[child_count];
         new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_SLICE;
-        new_child->global = child_state->global;
-        new_child->tile = child_state->tile;
+        new_child->type  = ENCODER_STATE_TYPE_SLICE;
+        new_child->frame = child_state->frame;
+        new_child->tile  = child_state->tile;
         new_child->wfrow = child_state->wfrow;
         new_child->slice = MALLOC(encoder_state_config_slice_t, 1);
         if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) {
@@ -427,9 +453,9 @@
         
         new_child = &child_state->children[child_count];
         new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_TILE;
-        new_child->global = child_state->global;
-        new_child->tile = MALLOC(encoder_state_config_tile_t, 1);
+        new_child->type  = ENCODER_STATE_TYPE_TILE;
+        new_child->frame = child_state->frame;
+        new_child->tile  = MALLOC(encoder_state_config_tile_t, 1);
         new_child->slice = child_state->slice;
         new_child->wfrow = child_state->wfrow;
         
@@ -511,9 +537,9 @@
         encoder_state_t *new_child = &child_state->children[i];
         
         new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
-        new_child->global = child_state->global;
-        new_child->tile = child_state->tile;
+        new_child->type  = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
+        new_child->frame = child_state->frame;
+        new_child->tile  = child_state->tile;
         new_child->slice = child_state->slice;
         new_child->wfrow = MALLOC(encoder_state_config_wfrow_t, 1);
         
@@ -668,9 +694,9 @@
     FREE_POINTER(state->tile);
   }

kvazaar-0.8.3.tar.gz/src/encoder_state-ctors_dtors.h -> kvazaar-1.0.0.tar.gz/src/encoder_state-ctors_dtors.h Changed

kvazaar-0.8.3.tar.gz/src/encoder_state-geometry.c -> kvazaar-1.0.0.tar.gz/src/encoder_state-geometry.c Changed

kvazaar-0.8.3.tar.gz/src/encoder_state-geometry.h -> kvazaar-1.0.0.tar.gz/src/encoder_state-geometry.h Changed

kvazaar-0.8.3.tar.gz/src/encoderstate.c -> kvazaar-1.0.0.tar.gz/src/encoderstate.c Changed

@@ -21,24 +21,21 @@
 #include "encoderstate.h"
 
 #include <math.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
 
-#include "tables.h"
 #include "cabac.h"
-#include "image.h"
-#include "nal.h"
 #include "context.h"
-#include "transform.h"
-#include "intra.h"
-#include "inter.h"
+#include "encode_coding_tree.h"
+#include "encoder_state-bitstream.h"
 #include "filter.h"
-#include "search.h"
-#include "sao.h"
-#include "rdo.h"
+#include "image.h"
 #include "rate_control.h"
-#include "strategies/strategies-picture.h"
+#include "sao.h"
+#include "search.h"
+#include "tables.h"
+
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
   int i;
@@ -55,38 +52,57 @@
   videoframe_t* const frame = state->tile->frame;
   
   if (hor_buf) {
-    const int rdpx = lcu->position_px.x;
-    const int rdpy = lcu->position_px.y + lcu->size.y - 1;
-    const int by = lcu->position.y;
-    
     //Copy the bottom row of this LCU to the horizontal buffer
-    kvz_pixels_blit(&frame->rec->y[rdpy * frame->rec->stride + rdpx],
-                        &hor_buf->y[lcu->position_px.x + by * frame->width],
-                        lcu->size.x, 1, frame->rec->stride, frame->width);
-    kvz_pixels_blit(&frame->rec->u[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
-                        &hor_buf->u[lcu->position_px.x / 2 + by * frame->width / 2],
-                        lcu->size.x / 2, 1, frame->rec->stride / 2, frame->width / 2);
-    kvz_pixels_blit(&frame->rec->v[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
-                        &hor_buf->v[lcu->position_px.x / 2 + by * frame->width / 2],
-                        lcu->size.x / 2, 1, frame->rec->stride / 2, frame->width / 2);
+    vector2d_t bottom = { lcu->position_px.x, lcu->position_px.y + lcu->size.y - 1 };
+    const int lcu_row = lcu->position.y;
+
+    unsigned from_index = bottom.y * frame->rec->stride + bottom.x;
+    unsigned to_index = lcu->position_px.x + lcu_row * frame->width;
+    
+    kvz_pixels_blit(&frame->rec->y[from_index],
+                    &hor_buf->y[to_index],
+                    lcu->size.x, 1,
+                    frame->rec->stride, frame->width);
+
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      unsigned from_index_c = (bottom.y / 2) * frame->rec->stride / 2 + (bottom.x / 2);
+      unsigned to_index_c = lcu->position_px.x / 2 + lcu_row * frame->width / 2;
+
+      kvz_pixels_blit(&frame->rec->u[from_index_c],
+                      &hor_buf->u[to_index_c],
+                      lcu->size.x / 2, 1, 
+                      frame->rec->stride / 2, frame->width / 2);
+      kvz_pixels_blit(&frame->rec->v[from_index_c],
+                      &hor_buf->v[to_index_c],
+                      lcu->size.x / 2, 1,
+                      frame->rec->stride / 2, frame->width / 2);
+    }
   }
   
   if (ver_buf) {
-    const int rdpx = lcu->position_px.x + lcu->size.x - 1;
-    const int rdpy = lcu->position_px.y;
-    const int bx = lcu->position.x;
+    //Copy the rightmost column of this LCU to the vertical buffer.
     
+    const int lcu_col = lcu->position.x;
+    vector2d_t left = { lcu->position_px.x + lcu->size.x - 1, lcu->position_px.y };
     
-    //Copy the right row of this LCU to the vertical buffer.
-    kvz_pixels_blit(&frame->rec->y[rdpy * frame->rec->stride + rdpx],
-                        &ver_buf->y[lcu->position_px.y + bx * frame->height],
-                        1, lcu->size.y, frame->rec->stride, 1);
-    kvz_pixels_blit(&frame->rec->u[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
-                        &ver_buf->u[lcu->position_px.y / 2 + bx * frame->height / 2],
-                        1, lcu->size.y / 2, frame->rec->stride / 2, 1);
-    kvz_pixels_blit(&frame->rec->v[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
-                        &ver_buf->v[lcu->position_px.y / 2 + bx * frame->height / 2],
-                        1, lcu->size.y / 2, frame->rec->stride / 2, 1);
+    kvz_pixels_blit(&frame->rec->y[left.y * frame->rec->stride + left.x],
+                    &ver_buf->y[lcu->position_px.y + lcu_col * frame->height],
+                    1, lcu->size.y,
+                    frame->rec->stride, 1);
+
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      unsigned from_index = (left.y / 2) * frame->rec->stride / 2 + (left.x / 2);
+      unsigned to_index = lcu->position_px.y / 2 + lcu_col * frame->height / 2;
+
+      kvz_pixels_blit(&frame->rec->u[from_index],
+                      &ver_buf->u[to_index],
+                      1, lcu->size.y / 2,
+                      frame->rec->stride / 2, 1);
+      kvz_pixels_blit(&frame->rec->v[from_index],
+                      &ver_buf->v[to_index],
+                      1, lcu->size.y / 2,
+                      frame->rec->stride / 2, 1);
+    }
   }
   
 }
@@ -172,8 +188,10 @@
   // If SAO is merged, nothing else needs to be coded.
   if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) {
     encode_sao_color(state, sao_luma, COLOR_Y);
-    encode_sao_color(state, sao_chroma, COLOR_U);
-    encode_sao_color(state, sao_chroma, COLOR_V);
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      encode_sao_color(state, sao_chroma, COLOR_U);
+      encode_sao_color(state, sao_chroma, COLOR_V);
+    }
   }
 }
 
@@ -195,51 +213,9 @@
   }
 
   if (encoder->sao_enable) {
-    const int stride = frame->width_in_lcu;
-    int32_t merge_cost_luma[3] = { INT32_MAX };
-    int32_t merge_cost_chroma[3] = { INT32_MAX };
-    sao_info_t *sao_luma = &frame->sao_luma[lcu->position.y * stride + lcu->position.x];
-    sao_info_t *sao_chroma = &frame->sao_chroma[lcu->position.y * stride + lcu->position.x];
-
-    // Merge candidates
-    sao_info_t *sao_top_luma = lcu->position.y != 0 ? &frame->sao_luma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL;
-    sao_info_t *sao_left_luma = lcu->position.x != 0 ? &frame->sao_luma[lcu->position.y * stride + lcu->position.x - 1] : NULL;
-    sao_info_t *sao_top_chroma = lcu->position.y != 0 ? &frame->sao_chroma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL;
-    sao_info_t *sao_left_chroma = lcu->position.x != 0 ? &frame->sao_chroma[lcu->position.y * stride + lcu->position.x - 1] : NULL;
-
-    kvz_sao_search_luma(state, frame, lcu->position.x, lcu->position.y, sao_luma, sao_top_luma, sao_left_luma, merge_cost_luma);
-    kvz_sao_search_chroma(state, frame, lcu->position.x, lcu->position.y, sao_chroma, sao_top_chroma, sao_left_chroma, merge_cost_chroma);
-
-    sao_luma->merge_up_flag = sao_luma->merge_left_flag = 0;
-    // Check merge costs
-    if (sao_top_luma) {
-      // Merge up if cost is equal or smaller to the searched mode cost
-      if (merge_cost_luma[2] + merge_cost_chroma[2] <= merge_cost_luma[0] + merge_cost_chroma[0]) {        
-        *sao_luma = *sao_top_luma;
-        *sao_chroma = *sao_top_chroma;
-        sao_luma->merge_up_flag = 1;
-        sao_luma->merge_left_flag = 0;
-      }
-    }
-    if (sao_left_luma) {
-      // Merge left if cost is equal or smaller to the searched mode cost 
-      // AND smaller than merge up cost, if merge up was already chosen
-      if (merge_cost_luma[1] + merge_cost_chroma[1] <= merge_cost_luma[0] + merge_cost_chroma[0]) {
-        if (!sao_luma->merge_up_flag || merge_cost_luma[1] + merge_cost_chroma[1] < merge_cost_luma[2] + merge_cost_chroma[2]) {      
-          *sao_luma = *sao_left_luma;
-          *sao_chroma = *sao_left_chroma;
-          sao_luma->merge_left_flag = 1;
-          sao_luma->merge_up_flag = 0;
-        }
-      }
-    }
-    assert(sao_luma->eo_class < SAO_NUM_EO);
-    assert(sao_chroma->eo_class < SAO_NUM_EO);
-    
-    CHECKPOINT_SAO_INFO("sao_luma", *sao_luma);
-    CHECKPOINT_SAO_INFO("sao_chroma", *sao_chroma);
+    kvz_sao_search_lcu(state, lcu->position.x, lcu->position.y);
   }
-  
+
   // Copy LCU cu_array to main states cu_array, because that is the only one
   // which is given to the next frame through image_list_t.
   {
@@ -249,21 +225,17 @@
     while (main_state->parent) main_state = main_state->parent;
     assert(main_state != state);
 
-    unsigned child_width_in_scu = state->tile->frame->width_in_lcu << MAX_DEPTH;
-    unsigned main_width_in_scu = main_state->tile->frame->width_in_lcu << MAX_DEPTH;
-    unsigned tile_x = state->tile->lcu_offset_x;
-    unsigned tile_y = state->tile->lcu_offset_y;
-
-    unsigned x = lcu->position.x << MAX_DEPTH;
-    unsigned y = lcu->position.y << MAX_DEPTH;
-
-    for (unsigned lcu_row = 0; lcu_row < 8; ++lcu_row) {
-      cu_info_t *main_row = &main_state->tile->frame->cu_array->data[x + tile_x + (y + tile_y + lcu_row) * main_width_in_scu];
-      cu_info_t *child_row = &state->tile->frame->cu_array->data[x + (y + lcu_row) * child_width_in_scu];
-      memcpy(main_row, child_row, sizeof(cu_info_t) * 8);

kvazaar-0.8.3.tar.gz/src/encoderstate.h -> kvazaar-1.0.0.tar.gz/src/encoderstate.h Changed

@@ -26,25 +26,18 @@
  * Top level of the encoder implementation.
  */
 
-#include "global.h"
-
-#include "videoframe.h"
-#include "encoder.h"
-#include "image.h"
 #include "bitstream.h"
 #include "cabac.h"
+#include "cu.h"
+#include "encoder.h"
+#include "global.h" // IWYU pragma: keep
+#include "image.h"
+#include "imagelist.h"
+#include "kvazaar.h"
 #include "tables.h"
-#include "scalinglist.h"
 #include "threadqueue.h"
-#include "imagelist.h"
-
-// Submodules
-// Functions to obtain geometry information from LCU
-#include "encoder_state-geometry.h"
-// Constructors/destructors
-#include "encoder_state-ctors_dtors.h"
-// Functions writing bitstream parts
-#include "encoder_state-bitstream.h"
+#include "videoframe.h"
+#include "extras/crypto.h"
 
 
 typedef enum {
@@ -57,13 +50,13 @@
 
 
 
-typedef struct {
+typedef struct encoder_state_config_frame_t {
   double cur_lambda_cost; //!< \brief Lambda for SSE
   double cur_lambda_cost_sqrt; //!< \brief Lambda for SAD and SATD
   
-  int32_t frame;
-  int32_t poc; /*!< \brief picture order count */
-  int8_t gop_offset; /*!< \brief offset in the gop structure */
+  int32_t num;       /*!< \brief Frame number */
+  int32_t poc;       /*!< \brief Picture order count */
+  int8_t gop_offset; /*!< \brief Offset in the gop structure */
   
   int8_t QP;   //!< \brief Quantization parameter
   double QP_factor; //!< \brief Quantization factor
@@ -95,9 +88,9 @@
   double rc_alpha;
   double rc_beta;
 
-} encoder_state_config_global_t;
+} encoder_state_config_frame_t;
 
-typedef struct {
+typedef struct encoder_state_config_tile_t {
   //Current sub-frame
   videoframe_t *frame;
   
@@ -110,20 +103,29 @@
   //Position of the first element in tile scan in global coordinates
   int32_t lcu_offset_in_ts;
   
-  //Buffer for search
-  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
+  // This is a buffer for the non-loopfiltered bottom pixels of every LCU-row
+  // in the tile. They are packed such that each LCU-row index maps to the
+  // y-coordinate.
   yuv_t *hor_buf_search;
-  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
+  // This is a buffer for the non-loopfiltered rightmost pixels of every
+  // LCU-column. They are packed such that each LCU-column index maps to the
+  // x-coordinate.
   yuv_t *ver_buf_search;
   
+  // This is a buffer for the deblocked bottom pixels of every LCU-row in the
+  // tile. They are packed such that each LCU-row index maps to the y-coordinate.
   yuv_t *hor_buf_before_sao;
-  yuv_t *ver_buf_before_sao;
   
   //Jobs for each individual LCU of a wavefront row.
   threadqueue_job_t **wf_jobs;
+
+  // Instance of encryption generator by tile
+  Crypto_Handle dbs_g;
+  uint32_t m_prev_pos;
+
 } encoder_state_config_tile_t;
 
-typedef struct {
+typedef struct encoder_state_config_slice_t {
   int32_t id;
   
   //Global coordinates
@@ -135,7 +137,7 @@
   int32_t end_in_rs;
 } encoder_state_config_slice_t;
 
-typedef struct {
+typedef struct encoder_state_config_wfrow_t {
   //Row in tile coordinates of the wavefront
   int32_t lcu_offset_y;
 } encoder_state_config_wfrow_t;
@@ -171,7 +173,7 @@
   //Pointer to the encoder_state of the previous frame
   struct encoder_state_t *previous_encoder_state;
   
-  encoder_state_config_global_t *global;
+  encoder_state_config_frame_t  *frame;
   encoder_state_config_tile_t   *tile;
   encoder_state_config_slice_t  *slice;
   encoder_state_config_wfrow_t  *wfrow;
@@ -185,7 +187,7 @@
 
   /**
    * \brief Indicates that this encoder state is ready for encoding the
-   * next frame i.e. kvz_encoder_next_frame has been called.
+   * next frame i.e. kvz_encoder_prepare has been called.
    */
   int prepared;
 
@@ -203,24 +205,10 @@
   threadqueue_job_t * tqj_bitstream_written; //Bitstream is written
 } encoder_state_t;
 
-void kvz_encode_one_frame(encoder_state_t *state);
-
-void kvz_encoder_next_frame(encoder_state_t *state);
-
+void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame);
 
-void kvz_encode_coding_tree(encoder_state_t *state, uint16_t x_ctb,
-                        uint16_t y_ctb, uint8_t depth);
+void kvz_encoder_prepare(encoder_state_t *state);
 
-void kvz_encode_last_significant_xy(encoder_state_t *state,
-                                uint8_t lastpos_x, uint8_t lastpos_y,
-                                uint8_t width, uint8_t height,
-                                uint8_t type, uint8_t scan);
-void kvz_encode_coeff_nxn(encoder_state_t *state, coeff_t *coeff, uint8_t width,
-                      uint8_t type, int8_t scan_mode, int8_t tr_skip);
-void kvz_encode_transform_coeff(encoder_state_t *state, int32_t x_cu, int32_t y_cu,
-                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
-void encode_block_residual(const encoder_control_t * const encoder,
-                           uint16_t x_ctb, uint16_t y_ctb, uint8_t depth);
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state);

kvazaar-1.0.0.tar.gz/src/extras/crypto.cpp Added

@@ -0,0 +1,126 @@
+#include <extras/crypto.h>
+
+#ifndef KVZ_SEL_ENCRYPTION
+extern int kvz_make_vs_ignore_crypto_not_having_symbols;
+int kvz_make_vs_ignore_crypto_not_having_symbols = 0;
+#else
+#include <cryptopp/aes.h>
+#include <cryptopp/modes.h>
+#include <cryptopp/osrng.h>
+typedef struct AESDecoder {
+#if AESEncryptionStreamMode
+        CryptoPP::CFB_Mode<CryptoPP::AES>::Encryption *CFBdec;
+#else
+    CryptoPP::CFB_Mode<CryptoPP::AES>::Decryption *CFBdec;
+#endif
+
+    byte key[CryptoPP::AES::DEFAULT_KEYLENGTH], iv[CryptoPP::AES::BLOCKSIZE], out_stream_counter[CryptoPP::AES::BLOCKSIZE], counter[CryptoPP::AES::BLOCKSIZE];
+    int couter_avail, counter_index, counter_index_pos;
+} AESDecoder;
+
+
+/*
+ * Allocate an AESDecoder and initialise it from the built-in init_val table:
+ * iv from entries 0..15, counter from entries 5..20 and key from entries
+ * 16..31. The CFB cipher object is created for the direction selected by
+ * AESEncryptionStreamMode and the keystream bookkeeping is reset to "empty".
+ *
+ * Returns the new decoder, or NULL if the struct could not be allocated.
+ */
+AESDecoder* Init() {
+    int init_val[32] = {201, 75, 219, 152, 6, 245, 237, 107, 179, 194, 81, 29, 66, 98, 198, 0, 16, 213, 27, 56, 255, 127, 242, 112, 97, 126, 197, 204, 25, 59, 38, 30};
+    AESDecoder * AESdecoder = (AESDecoder *)malloc(sizeof(AESDecoder));
+    if (!AESdecoder) {
+        // Out of memory: report failure instead of writing through NULL below.
+        return NULL;
+    }
+    for(int i=0;i<16; i++) {
+        AESdecoder->iv [i]     = init_val[i];
+        AESdecoder->counter[i] = init_val[5+i];
+        AESdecoder->key[i]     = init_val[i+16];
+    }
+#if AESEncryptionStreamMode
+    AESdecoder->CFBdec = new CryptoPP::CFB_Mode<CryptoPP::AES >::Encryption(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv);
+#else
+    AESdecoder->CFBdec = new CryptoPP::CFB_Mode<CryptoPP::AES >::Decryption(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv);
+#endif
+    // No keystream bits are buffered yet; get_key() refills on first use.
+    AESdecoder->couter_avail      = 0;
+    AESdecoder->counter_index     = 0;
+    AESdecoder->counter_index_pos = 0;
+    return AESdecoder;
+}
+
+/*
+ * Release a decoder created by Init(). Passing NULL is allowed and does
+ * nothing.
+ */
+void DeleteCrypto(AESDecoder * AESdecoder) {
+    if(AESdecoder) {
+        // The cipher object is allocated with new in Init(); freeing only the
+        // struct would leak it, so destroy it first.
+        delete AESdecoder->CFBdec;
+        free(AESdecoder);
+    }
+}
+
+void Decrypt(AESDecoder *AESdecoder, const unsigned char *in_stream, int size_bits, unsigned char  *out_stream) {
+    int nb_bytes = ceil((double)size_bits/8);
+    AESdecoder->CFBdec->ProcessData(out_stream, in_stream, nb_bytes);
+    if(size_bits&7)
+        AESdecoder->CFBdec->SetKeyWithIV(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv);
+    
+}
+void Incr_counter (unsigned char *counter) {
+    counter[0]++;
+}
+
+#if AESEncryptionStreamMode
+void Decrypt_counter(AESDecoder * AESdecoder) {
+    AESdecoder->CFBdec->ProcessData(AESdecoder->out_stream_counter, AESdecoder->counter, 16);
+    AESdecoder->couter_avail      = 128;
+    AESdecoder->counter_index     = 15;
+    AESdecoder->counter_index_pos = 8;
+    Incr_counter(AESdecoder->counter);
+}
+#endif
+
+#if AESEncryptionStreamMode
+unsigned int get_key (AESDecoder * AESdecoder, int nb_bits) {
+    unsigned int key_ = 0;
+    if(nb_bits > 32) {
+        printf("The Generator can not generate more than 32 bit %d \n", nb_bits);
+        return 0;
+    }
+    if( !nb_bits )
+        return 0;
+    if(!AESdecoder->couter_avail)
+        Decrypt_counter(AESdecoder);
+
+    if(AESdecoder->couter_avail >= nb_bits)
+        AESdecoder->couter_avail -= nb_bits;
+    else
+        AESdecoder->couter_avail = 0;
+    int nb = 0;
+    while( nb_bits ) {
+        if( nb_bits >= AESdecoder->counter_index_pos )
+            nb = AESdecoder->counter_index_pos;
+        else
+            nb = nb_bits;
+        key_ <<= nb;
+        key_ += (AESdecoder->out_stream_counter[AESdecoder->counter_index] & ((1<<nb)-1));
+        AESdecoder->out_stream_counter[AESdecoder->counter_index] >>= nb;
+        nb_bits -= nb;
+
+        if(AESdecoder->counter_index && nb == AESdecoder->counter_index_pos ) {
+            AESdecoder->counter_index--;
+            AESdecoder->counter_index_pos = 8;
+        } else {
+            AESdecoder->counter_index_pos -= nb;
+            if(nb_bits) {
+                Decrypt_counter(AESdecoder);
+                AESdecoder->couter_avail -=  nb_bits;
+            }
+        }
+    }
+    return key_;
+}
+#endif
+
+Crypto_Handle InitC(){
+    AESDecoder* AESdecoder = Init();
+    return AESdecoder;
+}
+#if AESEncryptionStreamMode
+unsigned int ff_get_key (Crypto_Handle *hdl, int nb_bits) {
+    return get_key ((AESDecoder*)*hdl, nb_bits);
+}
+#endif
+void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, int size_bits, unsigned char  *out_stream) {
+    Decrypt((AESDecoder*)hdl, in_stream, size_bits, out_stream);
+}
+
+void DeleteCryptoC(Crypto_Handle hdl) {
+	  DeleteCrypto((AESDecoder *)hdl);
+}
+
+#endif // KVZ_SEL_ENCRYPTION

kvazaar-1.0.0.tar.gz/src/extras/crypto.h Added

@@ -0,0 +1,72 @@
+#ifndef CRYPTO_H_
+#define CRYPTO_H_
+
+#include "global.h"
+
+#ifdef KVZ_SEL_ENCRYPTION
+#define STUBBED extern
+#else
+#define STUBBED static
+#endif
+
+#include <stdio.h>
+#include <math.h>
+#define AESEncryptionStreamMode      1
+#ifdef __cplusplus
+extern "C" {
+#endif
+    typedef void* Crypto_Handle;
+
+    STUBBED Crypto_Handle InitC();
+    STUBBED void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, int size_bits, unsigned char  *out_stream);
+#if AESEncryptionStreamMode
+    STUBBED unsigned int ff_get_key(Crypto_Handle *hdl, int nb_bits);
+#endif
+    STUBBED void DeleteCryptoC(Crypto_Handle hdl);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#ifndef KVZ_SEL_ENCRYPTION
+// Provide static stubs to allow linking without libcryptopp and allow us to
+// avoid sprinkling ifdefs everywhere and having a bunch of code that's not
+// compiled during normal development.
+// Provide them in the header so we can avoid compiling the cpp file, which
+// means we don't need a C++ compiler when crypto is not enabled.
+
+#include <assert.h>
+
+static INLINE Crypto_Handle InitC()
+{
+  // Stub.
+  assert(0);
+  return 0;
+}
+
+static INLINE void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream,
+              int size_bits, unsigned char  *out_stream)
+{
+  // Stub.
+  assert(0);
+}
+
+#if AESEncryptionStreamMode
+static INLINE unsigned int ff_get_key(Crypto_Handle *hdl, int nb_bits)
+{
+  // Stub.
+  assert(0);
+  return 0;
+}
+#endif
+
+static INLINE void DeleteCryptoC(Crypto_Handle hdl)
+{
+  // Stub.
+  assert(0);
+}
+
+#endif // KVZ_SEL_ENCRYPTION
+
+#endif // CRYPTO_H_

kvazaar-0.8.3.tar.gz/src/extras/getopt.c -> kvazaar-1.0.0.tar.gz/src/extras/getopt.c Changed

kvazaar-1.0.0.tar.gz/src/extras/libmd5.c Added

@@ -0,0 +1,258 @@
+/*
+ * This code implements the MD5 message-digest algorithm.  The algorithm was
+ * written by Ron Rivest.  This code was written by Colin Plumb in 1993, our
+ * understanding is that no copyright is claimed and that this code is in the
+ * public domain.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is functionally equivalent.
+ *
+ * To compute the message digest of a chunk of bytes, declare an MD5Context
+ * structure, pass it to kvz_md5_init, call kvz_md5_update as needed on buffers full of
+ * bytes, and then call kvz_md5_final, which will fill a supplied 16-byte array with
+ * the digest.
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "extras/libmd5.h"
+
+
+//! \ingroup libMD5
+//! \{
+
+static void MD5Transform(uint32_t buf[4], uint32_t const in[16]);
+
+#ifndef __BIG_ENDIAN__
+# define byteReverse(buf, len)    /* Nothing */
+#else
+void byteReverse(uint32_t *buf, unsigned len);
+/*
+ * Rewrite each of the len 32-bit words in buf so that its value is the
+ * little-endian interpretation of the four bytes it occupies in memory.
+ * On a little-endian machine this leaves every word unchanged, so the code
+ * is harmless there. len must be at least 1 (the loop body always runs once).
+ */
+void byteReverse(uint32_t *buf, unsigned len)
+{
+  uint32_t t;
+  do {
+    // Read the bytes as unsigned char: through a plain (possibly signed) char
+    // any byte >= 0x80 would be sign-extended and corrupt the higher bits of
+    // the assembled word.
+    const unsigned char *bytes = (const unsigned char *) buf;
+    t = ((uint32_t) bytes[3] << 8 | bytes[2]) << 16 |
+        ((uint32_t) bytes[1] << 8 | bytes[0]);
+    *buf = t;
+    buf++;
+  } while (--len);
+}
+#endif
+
+/*
+ * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void kvz_md5_init(context_md5_t *ctx)
+{
+  ctx->buf[0] = 0x67452301;
+  ctx->buf[1] = 0xefcdab89;
+  ctx->buf[2] = 0x98badcfe;
+  ctx->buf[3] = 0x10325476;
+
+  ctx->bits[0] = 0;
+  ctx->bits[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void kvz_md5_update(context_md5_t *ctx, const unsigned char *buf, unsigned len)
+{
+  uint32_t t;
+
+  /* Update bitcount */
+
+  t = ctx->bits[0];
+  if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
+    ctx->bits[1]++;        /* Carry from low to high */
+  ctx->bits[1] += len >> 29;
+
+  t = (t >> 3) & 0x3f;    /* Bytes already in shsInfo->data */
+
+  /* Handle any leading odd-sized chunks */
+
+  if (t) {
+    unsigned char *p = ctx->in.b8 + t;
+
+    t = 64 - t;
+    if (len < t) {
+      memcpy(p, buf, len);
+      return;
+    }
+    memcpy(p, buf, t);
+    byteReverse(ctx->in.b32, 16);
+    MD5Transform(ctx->buf, ctx->in.b32);
+    buf += t;
+    len -= t;
+  }
+  /* Process data in 64-byte chunks */
+
+  while (len >= 64) {
+    memcpy(ctx->in.b8, buf, 64);
+    byteReverse(ctx->in.b32, 16);
+    MD5Transform(ctx->buf, ctx->in.b32);
+    buf += 64;
+    len -= 64;
+  }
+
+    /* Handle any remaining bytes of data. */
+
+  memcpy(ctx->in.b8, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void kvz_md5_final(unsigned char digest[16], context_md5_t *ctx)
+{
+  unsigned count;
+  unsigned char *p;
+
+  /* Compute number of bytes mod 64 */
+  count = (ctx->bits[0] >> 3) & 0x3F;
+
+  /* Set the first char of padding to 0x80.  This is safe since there is
+     always at least one byte free */
+  p = ctx->in.b8 + count;
+  *p++ = 0x80;
+
+  /* Bytes of padding needed to make 64 bytes */
+  count = 64 - 1 - count;
+
+  /* Pad out to 56 mod 64 */
+  if (count < 8) {
+    /* Two lots of padding:  Pad the first block to 64 bytes */
+    memset(p, 0, count);
+    byteReverse(ctx->in.b32, 16);
+    MD5Transform(ctx->buf, ctx->in.b32);
+
+    /* Now fill the next block with 56 bytes */
+    memset(ctx->in.b8, 0, 56);
+  } else {
+    /* Pad block to 56 bytes */
+    memset(p, 0, count - 8);
+  }
+  byteReverse(ctx->in.b32, 14);
+
+  /* Append length in bits and transform */
+  ctx->in.b32[14] = ctx->bits[0];
+  ctx->in.b32[15] = ctx->bits[1];
+
+  MD5Transform(ctx->buf, ctx->in.b32);
+  byteReverse((uint32_t *) ctx->buf, 4);
+  memcpy(digest, ctx->buf, 16);
+
+  memset(ctx, 0, sizeof(* ctx));    /* In case it's sensitive */
+  /* The original version of this code omitted the asterisk. In
+     effect, only the first part of ctx was wiped with zeros, not
+     the whole thing. Bug found by Derek Jones. Original line: */
+  // memset(ctx, 0, sizeof(ctx));    /* In case it's sensitive */
+}
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+    ( w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data.  kvz_md5_update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+static void MD5Transform(uint32_t buf[4], uint32_t const in[16])
+{
+  register uint32_t a, b, c, d;
+
+  a = buf[0];
+  b = buf[1];
+  c = buf[2];
+  d = buf[3];
+
+  MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+  MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+  MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+  MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+  MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+  MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+  MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+  MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+  MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+  MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+  MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+  MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+  MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+  MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);

kvazaar-1.0.0.tar.gz/src/extras/libmd5.h Added

@@ -0,0 +1,58 @@
+/* The copyright in this software is being made available under the BSD
+ * License, included below. This software may be subject to other third party
+ * and contributor rights, including patent rights, and no such rights are
+ * granted under this license.
+ *
+ * Copyright (c) 2010-2015, ITU/ISO/IEC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+#include <stdint.h>
+
+//! \ingroup libMD5
+//! \{
+
+typedef struct _context_md5_t {
+  uint32_t buf[4];
+  uint32_t bits[2];
+  union {
+    unsigned char b8[64];
+    uint32_t b32[16];
+  } in;
+} context_md5_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void kvz_md5_init(context_md5_t *ctx);
+void kvz_md5_update(context_md5_t *ctx, const unsigned char *buf, unsigned len);
+void kvz_md5_final(unsigned char digest[16], context_md5_t *ctx);
+#ifdef __cplusplus
+}
+#endif
+
+//! \}

kvazaar-0.8.3.tar.gz/src/filter.c -> kvazaar-1.0.0.tar.gz/src/filter.c Changed

@@ -20,15 +20,14 @@
 
 #include "filter.h"
 
-#include <assert.h>
-#include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
 
-#include "bitstream.h"
-#include "videoframe.h"
-#include "cabac.h"
+#include "cu.h"
+#include "encoder.h"
+#include "kvazaar.h"
 #include "transform.h"
+#include "videoframe.h"
+
 
 //////////////////////////////////////////////////////////////////////////
 // INITIALIZATIONS
@@ -180,9 +179,8 @@
                            int32_t y,
                            edge_dir dir)
 {
-  const cu_info_t *const scu = kvz_videoframe_get_cu(state->tile->frame,
-                                                     x >> MIN_SIZE,
-                                                     y >> MIN_SIZE);
+  const cu_info_t *const scu =
+    kvz_cu_array_at_const(state->tile->frame->cu_array, x, y);
   const int tu_width = LCU_WIDTH >> scu->tr_depth;
 
   if (dir == EDGE_HOR) {
@@ -207,16 +205,14 @@
                            int32_t y,
                            edge_dir dir)
 {
-  const cu_info_t *const scu = kvz_videoframe_get_cu(state->tile->frame,
-                                                     x >> MIN_SIZE,
-                                                     y >> MIN_SIZE);
+  const cu_info_t *const scu =
+    kvz_cu_array_at_const(state->tile->frame->cu_array, x, y);
   // Get the containing CU.
   const int32_t cu_width = LCU_WIDTH >> scu->depth;
   const int32_t x_cu = x & ~(cu_width - 1);
   const int32_t y_cu = y & ~(cu_width - 1);
-  const cu_info_t *const cu = kvz_videoframe_get_cu(state->tile->frame,
-                                                    x_cu >> MIN_SIZE,
-                                                    y_cu >> MIN_SIZE);
+  const cu_info_t *const cu =
+    kvz_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu);
 
   const int num_pu = kvz_part_mode_num_parts[cu->part_size];
   for (int i = 0; i < num_pu; i++) {
@@ -285,8 +281,6 @@
 {
   videoframe_t * const frame = state->tile->frame;
   const encoder_control_t * const encoder = state->encoder_control;
-  
-  cu_info_t *cu_q = kvz_videoframe_get_cu(frame, x >> MIN_SIZE, y >> MIN_SIZE);
 
   {
     int32_t stride = frame->rec->stride;
@@ -295,12 +289,9 @@
     // TODO: support 10+bits
     kvz_pixel *orig_src = &frame->rec->y[x + y*stride];
     kvz_pixel *src = orig_src;
-    cu_info_t *cu_p = NULL;
-    int16_t x_cu = x >> MIN_SIZE;
-    int16_t y_cu = y >> MIN_SIZE;
 
     int8_t strength = 0;
-    int32_t qp              = state->global->QP;
+    int32_t qp              = state->frame->QP;
     int32_t bitdepth_scale  = 1 << (encoder->bitdepth - 8);
     int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
     int32_t beta            = kvz_g_beta_table_8x8[b_index] * bitdepth_scale;
@@ -321,11 +312,22 @@
       int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d;
 
       {
-        // CU in the side we are filtering, update every 8-pixels
-        cu_p = kvz_videoframe_get_cu(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? block_idx>>1 : 0));
+        // CUs on both sides of the edge
+        cu_info_t *cu_p;
+        cu_info_t *cu_q;
+        if (dir == EDGE_VER) {
+          int32_t y_coord = y + 4 * block_idx;
+          cu_p = kvz_cu_array_at(frame->cu_array, x - 1, y_coord);
+          cu_q = kvz_cu_array_at(frame->cu_array, x,     y_coord);
+
+        } else {
+          int32_t x_coord = x + 4 * block_idx;
+          cu_p = kvz_cu_array_at(frame->cu_array, x_coord, y - 1);
+          cu_q = kvz_cu_array_at(frame->cu_array, x_coord, y    );
+        }
 
-        bool nonzero_coeffs = cbf_is_set(cu_q->cbf.y, cu_q->tr_depth)
-                           || cbf_is_set(cu_p->cbf.y, cu_p->tr_depth);
+        bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y)
+                           || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y);
 
         // Filter strength
         strength = 0;
@@ -343,7 +345,7 @@
         }
         
         // B-slice related checks
-        if(!strength && state->global->slicetype == KVZ_SLICE_B) {
+        if(!strength && state->frame->slicetype == KVZ_SLICE_B) {
 
           // Zero all undefined motion vectors for easier usage
           if(!(cu_q->inter.mv_dir & 1)) {
@@ -424,10 +426,6 @@
       dq = dq0 + dq3;
       d  =  d0 + d3;
 
-      #if ENABLE_PCM
-      // TODO: add PCM deblocking
-      #endif
-
       if (d < beta) {
         int8_t filter_P = (dp < side_threshold) ? 1 : 0;
         int8_t filter_Q = (dq < side_threshold) ? 1 : 0;
@@ -480,7 +478,6 @@
 {
   const encoder_control_t * const encoder = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
-  const cu_info_t *cu_q = kvz_videoframe_get_cu_const(frame, x >> (MIN_SIZE - 1), y >> (MIN_SIZE - 1));
 
   // For each subpart
   {
@@ -491,12 +488,9 @@
       &frame->rec->u[x + y*stride],
       &frame->rec->v[x + y*stride],
     };
-    const cu_info_t *cu_p = NULL;
-    int16_t x_cu = x >> (MIN_SIZE-1);
-    int16_t y_cu = y >> (MIN_SIZE-1);
     int8_t strength = 2;
 
-    int32_t QP             = kvz_g_chroma_scale[state->global->QP];
+    int32_t QP             = kvz_g_chroma_scale[state->frame->QP];
     int32_t bitdepth_scale = 1 << (encoder->bitdepth-8);
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = kvz_g_tc_table_8x8[TC_index]*bitdepth_scale;
@@ -508,7 +502,19 @@
 
     for (uint32_t blk_idx = 0; blk_idx < num_4px_parts; ++blk_idx)
     {
-      cu_p = kvz_videoframe_get_cu_const(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? blk_idx : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? blk_idx : 0));
+      // CUs on both sides of the edge
+      cu_info_t *cu_p;
+      cu_info_t *cu_q;
+      if (dir == EDGE_VER) {
+        int32_t y_coord = (y + 4 * blk_idx) << 1;
+        cu_p = kvz_cu_array_at(frame->cu_array, (x - 1) << 1, y_coord);
+        cu_q = kvz_cu_array_at(frame->cu_array,  x      << 1, y_coord);
+
+      } else {
+        int32_t x_coord = (x + 4 * blk_idx) << 1;
+        cu_p = kvz_cu_array_at(frame->cu_array, x_coord, (y - 1) << 1);
+        cu_q = kvz_cu_array_at(frame->cu_array, x_coord, (y    ) << 1);
+      }
 
       // Only filter when strenght == 2 (one of the blocks is intra coded)
       if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
@@ -575,7 +581,7 @@
   // Chroma pixel coordinates.
   const int32_t x_c = x >> 1;
   const int32_t y_c = y >> 1;
-  if (is_on_8x8_grid(x_c, y_c, dir)) {
+  if (state->encoder_control->chroma_format != KVZ_CSP_400 && is_on_8x8_grid(x_c, y_c, dir)) {
     filter_deblock_edge_chroma(state, x_c, y_c, length_c, dir, tu_boundary);
   }
 }
@@ -636,16 +642,18 @@
   }
 
   // Chroma
-  const int x_px_c = x_px >> 1;
-  const int y_px_c = y_px >> 1;
-  const int x_c = x_px_c - 4;
-  const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
-  for (int y_c = y_px_c; y_c < end_c; y_c += 8) {
-    // The top edge of the whole frame is not filtered.
-    bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR);
-    bool pu_boundary = is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR);
-    if (y_c > 0 && (tu_boundary || pu_boundary)) {
-      filter_deblock_edge_chroma(state, x_c, y_c, 4, EDGE_HOR, tu_boundary);
+  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+    const int x_px_c = x_px >> 1;
+    const int y_px_c = y_px >> 1;
+    const int x_c = x_px_c - 4;
+    const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
+    for (int y_c = y_px_c; y_c < end_c; y_c += 8) {
+      // The top edge of the whole frame is not filtered.
+      bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR);
+      bool pu_boundary = is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR);
+      if (y_c > 0 && (tu_boundary || pu_boundary)) {

kvazaar-0.8.3.tar.gz/src/filter.h -> kvazaar-1.0.0.tar.gz/src/filter.h Changed

kvazaar-0.8.3.tar.gz/src/global.h -> kvazaar-1.0.0.tar.gz/src/global.h Changed

@@ -32,19 +32,22 @@
 #ifdef HAVE_CONFIG_H
 // Include config.h generated by automake. This needs to be before any other
 // includes in every file, which is why it's in global.
-#include "config.h"
+#include "config.h" // IWYU pragma: export
 #endif
 
-#include <assert.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
+// Include some basics in all files, like assert, primitives and NULL.
+// If you add anything to this list with export pragma, think long and
+// and hard if it's actually a good idea to incude it for every c-file.
+#include <assert.h> // IWYU pragma: export
+#include <stdbool.h> // IWYU pragma: export
+#include <stdint.h> // IWYU pragma: export
+#include <stddef.h> // IWYU pragma: export
+
+// The stdlib.h and string.h headers are needed because of MALLOC and FILL
+// macros defined here, as IWYU will remove them from files that use only
+// those macros.
 #include <stdlib.h>
 #include <string.h>
-#include <limits.h>
-
-#include "kvazaar.h"
-
 
 /**
  * \defgroup Bitstream
@@ -121,9 +124,6 @@
 //! spec: pcm_enabled_flag, Setting to 1 will enable using PCM blocks (current intra-search does not consider PCM)
 #define ENABLE_PCM 0
 
-//! Enable usage of temporal Motion Vector Prediction
-#define ENABLE_TEMPORAL_MVP 0
-
 //! skip residual coding when it's under _some_ threshold
 #define OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD 0
 
@@ -165,7 +165,6 @@
 #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
 #define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
-#define PU_INDEX(x_pu, y_pu) (((x_pu) % 2)  + 2 * ((y_pu) % 2))
 #define CEILDIV(x,y) (((x) + (y) - 1) / (y))
 
 #define LOG2_LCU_WIDTH 6
@@ -181,7 +180,9 @@
 
 // NOTE: When making a release, check to see if incrementing libversion in 
 // configure.ac is necessary.
+#ifndef KVZ_VERSION
 #define KVZ_VERSION 0.8.3
+#endif
 #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)
 
 //#define VERBOSE 1
@@ -248,6 +249,69 @@
 #define EXP_GOLOMB_TABLE_SIZE (4096*8)
 
 //Constants
-typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, NUM_COLORS } color_t;
+typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t;
+
+
+// Hardware data (abstraction of defines). Extend for other compilers
+#if defined(_M_IX86) || defined(__i586__) || defined(__i686__) || defined(_M_X64) || defined(_M_AMD64) || defined(__amd64__) || defined(__x86_64__)
+#  define COMPILE_INTEL 1
+#else
+#  define COMPILE_INTEL 0
+#endif
+
+// Visual Studio note:
+// Because these macros are only used to guard code that is guarded by CPUID
+// at runtime, use /arch parameter to disable them, but enable all intrinsics
+// supported by VisualStudio if SSE2 (highest) is enabled.
+// AVX and AVX2 are handled by /arch directly and sse intrinsics will use VEX
+// versions if they are defined.
+#define MSC_X86_SIMD(level) (_M_X64 || (_M_IX86_FP >= (level)))
+
+#if COMPILE_INTEL
+#  if defined(__MMX__) || MSC_X86_SIMD(1)
+#    define COMPILE_INTEL_MMX 1
+#  endif
+#  if defined(__SSE__) || MSC_X86_SIMD(1)
+#    define COMPILE_INTEL_SSE 1
+#  endif
+#  if defined(__SSE2__) || MSC_X86_SIMD(2)
+#    define COMPILE_INTEL_SSE2 1
+#  endif
+#  if defined(__SSE3__)
+#    define COMPILE_INTEL_SSE3 1
+#  endif
+#  if defined(__SSSE3__) || MSC_X86_SIMD(2)
+#    define COMPILE_INTEL_SSSE3 1
+#  endif
+#  if defined(__SSE4_1__) || MSC_X86_SIMD(2)
+#    define COMPILE_INTEL_SSE41 1
+#  endif
+#  if defined(__SSE4_2__) || MSC_X86_SIMD(2)
+#    define COMPILE_INTEL_SSE42 1
+#  endif
+#  if defined(__AVX__)
+#    define COMPILE_INTEL_AVX 1
+#   endif
+#  if defined(__AVX2__)
+#    define COMPILE_INTEL_AVX2 1
+#   endif
+#endif
+
+#if defined (_M_PPC) || defined(__powerpc64__) || defined(__powerpc__)
+#  define COMPILE_POWERPC 1
+#  ifdef __ALTIVEC__
+#    define COMPILE_POWERPC_ALTIVEC 1
+#  else
+#    define COMPILE_POWERPC_ALTIVEC 0
+#  endif
+#else
+#  define COMPILE_POWERPC 0
+#endif
+
+#if defined (_M_ARM) || defined(__arm__) || defined(__thumb__)
+#  define COMPILE_ARM 1
+#else
+#  define COMPILE_ARM 0
+#endif
 
 #endif

kvazaar-0.8.3.tar.gz/src/image.c -> kvazaar-1.0.0.tar.gz/src/image.c Changed

@@ -18,24 +18,29 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include "threads.h"
 #include "image.h"
-#include "strategyselector.h"
 
-#include <string.h>
-#include <stdio.h>
+#include <limits.h>
 #include <stdlib.h>
-#include <math.h>
-#include <assert.h>
 
-#include "checkpoint.h"
-#include "sao.h"
+#include "strategies/strategies-picture.h"
+#include "threads.h"
+
+/**
+* \brief Allocate a new image with 4:2:0 chroma format (KVZ_CSP_420).
+* This function signature is part of the libkvz API.
+* \return image pointer or NULL on failure
+*/
+kvz_picture * kvz_image_alloc_420(const int32_t width, const int32_t height)
+{
+  return kvz_image_alloc(KVZ_CSP_420, width, height);
+}
 
 /**
  * \brief Allocate a new image.
  * \return image pointer or NULL on failure
  */
-kvz_picture *kvz_image_alloc(const int32_t width, const int32_t height)
+kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_t width, const int32_t height)
 {
   //Assert that we have a well defined image
   assert((width % 2) == 0);
@@ -45,7 +50,10 @@
   if (!im) return NULL;
 
   unsigned int luma_size = width * height;
-  unsigned int chroma_size = luma_size / 4;
+  unsigned chroma_sizes[] = { 0, luma_size / 4, luma_size / 2, luma_size };
+  unsigned chroma_size = chroma_sizes[chroma_format];
+
+  im->chroma_format = chroma_format;
 
   //Allocate memory
   im->fulldata = MALLOC(kvz_pixel, (luma_size + 2 * chroma_size));
@@ -59,10 +67,17 @@
   im->width = width;
   im->height = height;
   im->stride = width;
+  im->chroma_format = chroma_format;
 
   im->y = im->data[COLOR_Y] = &im->fulldata[0];
-  im->u = im->data[COLOR_U] = &im->fulldata[luma_size];
-  im->v = im->data[COLOR_V] = &im->fulldata[luma_size + chroma_size];
+
+  if (chroma_format == KVZ_CSP_400) {
+    im->u = im->data[COLOR_U] = NULL;
+    im->v = im->data[COLOR_V] = NULL;
+  } else {
+    im->u = im->data[COLOR_U] = &im->fulldata[luma_size];
+    im->v = im->data[COLOR_V] = &im->fulldata[luma_size + chroma_size];
+  }
 
   im->pts = 0;
   im->dts = 0;
@@ -143,10 +158,13 @@
   im->width = width;
   im->height = height;
   im->stride = orig_image->stride;
+  im->chroma_format = orig_image->chroma_format;
 
   im->y = im->data[COLOR_Y] = &orig_image->y[x_offset + y_offset * orig_image->stride];
-  im->u = im->data[COLOR_U] = &orig_image->u[x_offset/2 + y_offset/2 * orig_image->stride/2];
-  im->v = im->data[COLOR_V] = &orig_image->v[x_offset/2 + y_offset/2 * orig_image->stride/2];
+  if (orig_image->chroma_format != KVZ_CSP_400) {
+    im->u = im->data[COLOR_U] = &orig_image->u[x_offset / 2 + y_offset / 2 * orig_image->stride / 2];
+    im->v = im->data[COLOR_V] = &orig_image->v[x_offset / 2 + y_offset / 2 * orig_image->stride / 2];
+  }
 
   im->pts = 0;
   im->dts = 0;
@@ -154,16 +172,22 @@
   return im;
 }
 
-yuv_t * kvz_yuv_t_alloc(int luma_size)
+yuv_t * kvz_yuv_t_alloc(int luma_size, int chroma_size)
 {
-  // Get buffers with separate mallocs in order to take advantage of
-  // automatic buffer overrun checks.
   yuv_t *yuv = (yuv_t *)malloc(sizeof(*yuv));
-  yuv->y = (kvz_pixel *)malloc(luma_size * sizeof(*yuv->y));
-  yuv->u = (kvz_pixel *)malloc(luma_size / 2 * sizeof(*yuv->u));
-  yuv->v = (kvz_pixel *)malloc(luma_size / 2 * sizeof(*yuv->v));
   yuv->size = luma_size;
 
+  // Get buffers with separate mallocs in order to take advantage of
+  // automatic buffer overrun checks.
+  yuv->y = (kvz_pixel *)malloc(luma_size * sizeof(*yuv->y));
+  if (chroma_size == 0) {
+    yuv->u = NULL;
+    yuv->v = NULL;
+  } else {
+    yuv->u = (kvz_pixel *)malloc(chroma_size * sizeof(*yuv->u));
+    yuv->v = (kvz_pixel *)malloc(chroma_size * sizeof(*yuv->v));
+  }
+  
   return yuv;
 }
 
@@ -469,4 +493,76 @@
   }
 
   return ssd;
-}
\ No newline at end of file
+}
+
+
+/**
+ * \brief BLock Image Transfer from one buffer to another.
+ *
+ * It's a stupidly simple loop that copies pixels.
+ *
+ * \param orig  Start of the originating buffer.
+ * \param dst  Start of the destination buffer.
+ * \param width  Width of the copied region.
+ * \param height  Height of the copied region.
+ * \param orig_stride  Width of a row in the originating buffer.
+ * \param dst_stride  Width of a row in the destination buffer.
+ *
+ * This should be inlined, but it's defined here for now to see if Visual
+ * Studio's LTCG will inline it.
+ */
+#define BLIT_PIXELS_CASE(n) case n:\
+  for (y = 0; y < n; ++y) {\
+    memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(kvz_pixel));\
+  }\
+  break;
+
+void kvz_pixels_blit(const kvz_pixel * const orig, kvz_pixel * const dst,
+                         const unsigned width, const unsigned height,
+                         const unsigned orig_stride, const unsigned dst_stride)
+{
+  unsigned y;
+  //There is absolutely no reason to have a width greater than the source or the destination stride.
+  assert(width <= orig_stride);
+  assert(width <= dst_stride);
+
+#ifdef CHECKPOINTS
+  char *buffer = malloc((3 * width + 1) * sizeof(char));
+  for (y = 0; y < height; ++y) {
+    int p;
+    for (p = 0; p < width; ++p) {
+      sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]);
+    }
+    buffer[3*width] = 0;
+    CHECKPOINT("kvz_pixels_blit_avx2: %04d: %s", y, buffer);
+  }
+  FREE_POINTER(buffer);
+#endif //CHECKPOINTS
+
+  if (width == orig_stride && width == dst_stride) {
+    memcpy(dst, orig, width * height * sizeof(kvz_pixel));
+    return;
+  }
+
+  int nxn_width = (width == height) ? width : 0;
+  switch (nxn_width) {
+    BLIT_PIXELS_CASE(4)
+    BLIT_PIXELS_CASE(8)
+    BLIT_PIXELS_CASE(16)
+    BLIT_PIXELS_CASE(32)
+    BLIT_PIXELS_CASE(64)
+  default:
+
+    if (orig == dst) {
+      //If we have the same array, then we should have the same stride
+      assert(orig_stride == dst_stride);
+      return;
+    }
+    assert(orig != dst || orig_stride == dst_stride);
+
+    for (y = 0; y < height; ++y) {
+      memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel));
+    }
+    break;
+  }
+}

kvazaar-0.8.3.tar.gz/src/image.h -> kvazaar-1.0.0.tar.gz/src/image.h Changed

@@ -26,7 +26,7 @@
  * A reference counted YUV pixel buffer.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
 
 #include "kvazaar.h"
 
@@ -35,6 +35,7 @@
   kvz_pixel y[LCU_LUMA_SIZE];
   kvz_pixel u[LCU_CHROMA_SIZE];
   kvz_pixel v[LCU_CHROMA_SIZE];
+  enum kvz_chroma_format chroma_format;
 } lcu_yuv_t;
 
 typedef struct {
@@ -52,7 +53,8 @@
 } yuv_t;
 
 
-kvz_picture *kvz_image_alloc(const int32_t width, const int32_t height);
+kvz_picture *kvz_image_alloc_420(const int32_t width, const int32_t height);
+kvz_picture *kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_t width, const int32_t height);
 
 void kvz_image_free(kvz_picture *im);
 
@@ -64,7 +66,7 @@
                              const unsigned width,
                              const unsigned height);
 
-yuv_t * kvz_yuv_t_alloc(int luma_size);
+yuv_t * kvz_yuv_t_alloc(int luma_size, int chroma_size);
 void kvz_yuv_t_free(yuv_t * yuv);
 
 hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size);
@@ -80,4 +82,10 @@
                   const int ref_stride, const int rec_stride,
                   const int width);
 
+
+void kvz_pixels_blit(const kvz_pixel* orig, kvz_pixel *dst,
+                         unsigned width, unsigned height,
+                         unsigned orig_stride, unsigned dst_stride);
+
+
 #endif

kvazaar-0.8.3.tar.gz/src/imagelist.c -> kvazaar-1.0.0.tar.gz/src/imagelist.c Changed

@@ -18,14 +18,13 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include "threads.h"
 #include "imagelist.h"
-#include "strategyselector.h"
 
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
+
+#include "image.h"
+#include "threads.h"
 
 
 /**
@@ -36,13 +35,10 @@
 image_list_t * kvz_image_list_alloc(int size)
 {
   image_list_t *list = (image_list_t *)malloc(sizeof(image_list_t));
-  list->size = size;
-  if (size > 0) {
-    list->images = (kvz_picture**)malloc(sizeof(kvz_picture*) * size);
-    list->cu_arrays = (cu_array_t**)malloc(sizeof(cu_array_t*) * size);
-    list->pocs = malloc(sizeof(int32_t) * size);
-  }
-
+  list->size      = size;
+  list->images    = malloc(sizeof(kvz_picture*) * size);
+  list->cu_arrays = malloc(sizeof(cu_array_t*)  * size);
+  list->pocs      = malloc(sizeof(int32_t)      * size);
   list->used_size = 0;
 
   return list;
@@ -115,7 +111,8 @@
   }
 
   if (list->size == list->used_size) {
-    if (!kvz_image_list_resize(list, list->size*2)) return 0;
+    unsigned new_size = MAX(list->size + 1, list->size * 2);
+    if (!kvz_image_list_resize(list, new_size)) return 0;
   }
 
   for (i = list->used_size; i > 0; i--) {

kvazaar-0.8.3.tar.gz/src/imagelist.h -> kvazaar-1.0.0.tar.gz/src/imagelist.h Changed

kvazaar-0.8.3.tar.gz/src/input_frame_buffer.c -> kvazaar-1.0.0.tar.gz/src/input_frame_buffer.c Changed

@@ -19,8 +19,11 @@
  ****************************************************************************/
 
 #include "input_frame_buffer.h"
+
+#include "encoder.h"
 #include "encoderstate.h"
-#include <assert.h>
+#include "image.h"
+
 
 void kvz_init_input_frame_buffer(input_frame_buffer_t *input_buffer)
 {
@@ -35,42 +38,43 @@
 /**
  * \brief Pass an input frame to the encoder state.
  *
- * Sets the source image of the encoder state if there is a suitable image
- * available.
+ * Returns the image that should be encoded next if there is a suitable
+ * image available.
  *
  * The caller must not modify img_in after calling this function.
  *
  * \param buf     an input frame buffer
  * \param state   a main encoder state
  * \param img_in  input frame or NULL
- * \return        1 if the source image was set, 0 if not
+ * \return        pointer to the next picture, or NULL if no picture is
+ *                available
  */
-int kvz_encoder_feed_frame(input_frame_buffer_t *buf,
-                           encoder_state_t *const state,
-                           kvz_picture *const img_in)
+kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf,
+                                    encoder_state_t *const state,
+                                    kvz_picture *const img_in)
 {
   const encoder_control_t* const encoder = state->encoder_control;
   const kvz_config* const cfg = encoder->cfg;
 
   const int gop_buf_size = 3 * cfg->gop_len;
 
-  assert(state->global->frame >= 0);
-
-  videoframe_t *frame = state->tile->frame;
-  assert(frame->source == NULL);
-  assert(frame->rec    != NULL);
+  assert(state->frame->num >= 0);
 
   if (cfg->gop_len == 0 || cfg->gop_lowdelay) {
-    // GOP disabled, just return the input frame.
+    // No reordering of output pictures necessary.
 
-    if (img_in == NULL) return 0;
+    if (img_in == NULL) return NULL;
 
     img_in->dts = img_in->pts;
-    frame->source   = kvz_image_copy_ref(img_in);
-    frame->rec->pts = img_in->pts;
-    frame->rec->dts = img_in->dts;
-    state->global->gop_offset = cfg->gop_lowdelay ? (state->global->frame-1) % cfg->gop_len : 0;
-    return 1;
+    state->frame->gop_offset = 0;
+    if (cfg->gop_lowdelay) {
+      state->frame->gop_offset = (state->frame->num - 1) % cfg->gop_len;
+      if (state->frame->gop_offset < 0) {
+        // Set gop_offset of IDR as the highest quality picture.
+        state->frame->gop_offset += cfg->gop_len;
+      }
+    }
+    return kvz_image_copy_ref(img_in);
   }
 
   if (img_in != NULL) {
@@ -101,7 +105,7 @@
 
   if (buf->num_out == buf->num_in) {
     // All frames returned.
-    return 0;
+    return NULL;
   }
 
   if (img_in == NULL && buf->num_in < cfg->gop_len) {
@@ -128,7 +132,7 @@
     // Output the first frame.
     idx_out = -1;
     dts_out = buf->pts_buffer[gop_buf_size - 1] + buf->delay;
-    gop_offset = 0;
+    gop_offset = 0; // highest quality picture
 
   } else {
     gop_offset = (buf->num_out - 1) % cfg->gop_len;
@@ -164,14 +168,12 @@
   // Index in buf->pic_buffer and buf->pts_buffer.
   int buf_idx = (idx_out + gop_buf_size) % gop_buf_size;
 
-  assert(buf->pic_buffer[buf_idx] != NULL);
-  frame->source      = buf->pic_buffer[buf_idx];
-  frame->rec->pts    = frame->source->pts;
-  frame->source->dts = dts_out;
-  frame->rec->dts    = dts_out;
+  kvz_picture* next_pic = buf->pic_buffer[buf_idx];
+  assert(next_pic != NULL);
+  next_pic->dts = dts_out;
   buf->pic_buffer[buf_idx] = NULL;
-  state->global->gop_offset = gop_offset;
+  state->frame->gop_offset = gop_offset;
 
   buf->num_out++;
-  return 1;
+  return next_pic;
 }

kvazaar-0.8.3.tar.gz/src/input_frame_buffer.h -> kvazaar-1.0.0.tar.gz/src/input_frame_buffer.h Changed

kvazaar-0.8.3.tar.gz/src/inter.c -> kvazaar-1.0.0.tar.gz/src/inter.c Changed

@@ -20,14 +20,16 @@
 
 #include "inter.h"
 
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 
-#include "filter.h"
-#include "strategies/strategies-ipol.h"
-#include "strategies/generic/ipol-generic.h"
+#include "encoder.h"
+#include "imagelist.h"
 #include "strategies/generic/picture-generic.h"
+#include "strategies/strategies-ipol.h"
+#include "videoframe.h"
+
 
 static void inter_recon_frac_luma(const encoder_state_t * const state,
                                   const kvz_picture * const ref,
@@ -44,7 +46,7 @@
  #define FILTER_SIZE_Y 8 //Luma filter size
 
   // Fractional luma 1/4-pel
-  kvz_extended_block src = {0, 0, 0};
+  kvz_extended_block src = {0, 0, 0, 0};
 
   // Fractional luma
   kvz_get_extended_block(xpos,
@@ -60,7 +62,7 @@
                          block_width,
                          block_height,
                          &src);
-  kvz_sample_quarterpel_luma_generic(state->encoder_control,
+  kvz_sample_quarterpel_luma(state->encoder_control,
                                      src.orig_topleft,
                                      src.stride,
                                      block_width,
@@ -89,7 +91,7 @@
 #define FILTER_SIZE_Y 8 //Luma filter size
 
   // Fractional luma 1/4-pel
-  kvz_extended_block src = { 0, 0, 0 };
+  kvz_extended_block src = { 0, 0, 0, 0 };
 
   // Fractional luma
   kvz_get_extended_block(xpos,
@@ -105,7 +107,7 @@
                          block_width,
                          block_height,
                          &src);
-  kvz_sample_14bit_quarterpel_luma_generic(state->encoder_control,
+  kvz_sample_14bit_quarterpel_luma(state->encoder_control,
                                            src.orig_topleft,
                                            src.stride,
                                            block_width,
@@ -140,19 +142,19 @@
 #define FILTER_SIZE_C 4 //Chroma filter size
 
   // Fractional chroma 1/8-pel
-  kvz_extended_block src_u = { 0, 0, 0 };
-  kvz_extended_block src_v = { 0, 0, 0 };
+  kvz_extended_block src_u = { 0, 0, 0, 0 };
+  kvz_extended_block src_v = { 0, 0, 0, 0 };
 
   //Fractional chroma U
   kvz_get_extended_block(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
     ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_height, &src_u);
-  kvz_sample_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
+  kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
     block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
 
   //Fractional chroma V
   kvz_get_extended_block(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
     ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_height, &src_v);
-  kvz_sample_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
+  kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
     block_height, lcu->rec.v + (ypos  % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
 
   if (src_u.malloc_used) free(src_u.buffer);
@@ -180,8 +182,8 @@
 #define FILTER_SIZE_C 4 //Chroma filter size
 
   // Fractional chroma 1/8-pel
-  kvz_extended_block src_u = { 0, 0, 0 };
-  kvz_extended_block src_v = { 0, 0, 0 };
+  kvz_extended_block src_u = { 0, 0, 0, 0 };
+  kvz_extended_block src_v = { 0, 0, 0, 0 };
 
   //Fractional chroma U
   kvz_get_extended_block(xpos,
@@ -197,7 +199,7 @@
                          block_width,
                          block_height,
                          &src_u);
-  kvz_sample_14bit_octpel_chroma_generic(state->encoder_control,
+  kvz_sample_14bit_octpel_chroma(state->encoder_control,
                                          src_u.orig_topleft,
                                          src_u.stride,
                                          block_width,
@@ -222,7 +224,7 @@
                          block_width,
                          block_height,
                          &src_v);
-  kvz_sample_14bit_octpel_chroma_generic(state->encoder_control,
+  kvz_sample_14bit_octpel_chroma(state->encoder_control,
                                          src_v.orig_topleft,
                                          src_v.stride,
                                          block_width,
@@ -237,17 +239,56 @@
   if (src_v.malloc_used) free(src_v.buffer);
 }
 
+
+/**
+* \brief Copy from frame with extended border.
+*
+* \param ref_buf      pointer to the start of ref buffer
+* \param ref_stride   stride of ref buffer
+* \param ref_width    width of frame
+* \param ref_height   height of frame
+* \param rec_buf      pointer to the start of pu in rec buffer
+* \param rec_stride   stride of rec buffer
+* \param width        width of copied block
+* \param height       height of copied block
+* \param mv_in_frame  coordinates of copied block in frame coordinates
+*/
+static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride,
+                                     int ref_width, int ref_height,
+                                     kvz_pixel *rec_buf, int rec_stride,
+                                     int width, int height,
+                                     const vector2d_t *mv_in_frame)
+{
+  for (int y = mv_in_frame->y; y < mv_in_frame->y + height; ++y) {
+    for (int x = mv_in_frame->x; x < mv_in_frame->x + width; ++x) {
+      vector2d_t in_frame = {
+        CLIP(0, ref_width - 1, x),
+        CLIP(0, ref_height - 1, y),
+      };
+      vector2d_t in_pu = {
+        x - mv_in_frame->x,
+        y - mv_in_frame->y,
+      };
+      int pu_index = in_pu.y * rec_stride + in_pu.x;
+      int frame_index = in_frame.y * ref_stride + in_frame.x;
+      rec_buf[pu_index] = ref_buf[frame_index];
+    }
+  }
+}
+
+
 /**
  * \brief Reconstruct inter block
- * \param ref picture to copy the data from
- * \param xpos block x position
- * \param ypos block y position
- * \param width block width
- * \param height block height
- * \param mv[2] motion vector
- * \param lcu destination lcu
- * \param hi_prec destination of high precision output (null if not needed)
- * \returns Void
+ *
+ * \param state         encoder state
+ * \param ref           picture to copy the data from
+ * \param xpos          block x position
+ * \param ypos          block y position
+ * \param width         block width
+ * \param height        block height
+ * \param mv_param      motion vector
+ * \param lcu           destination lcu
+ * \param hi_prec_out   destination of high precision output (null if not needed)
 */
 void kvz_inter_recon_lcu(const encoder_state_t * const state,
                          const kvz_picture * const ref,
@@ -259,161 +300,122 @@
                          lcu_t *lcu,
                          hi_prec_buf_t *hi_prec_out)
 {
-  int x,y,coord_x,coord_y;
-  int16_t mv[2] = { mv_param[0], mv_param[1] };
-
-  int32_t dst_width_c = LCU_WIDTH>>1; //!< Destination picture width in chroma pixels
-  int32_t ref_width_c = ref->width>>1; //!< Reference picture width in chroma pixels
-
-  // negative overflow flag
-  int8_t overflow_neg_x = (state->tile->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) < 0)?1:0;
-  int8_t overflow_neg_y = (state->tile->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) < 0)?1:0;
-
-  // positive overflow flag
-  int8_t overflow_pos_x = (state->tile->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) + width > ref->width )?1:0;
-  int8_t overflow_pos_y = (state->tile->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) + height > ref->height)?1:0;
-
-  int8_t chroma_halfpel = ((mv[0]>>2)&1) || ((mv[1]>>2)&1); //!< (luma integer mv) lsb is set -> chroma is half-pel
-  // Luma quarter-pel
-  int8_t fractional_mv = (mv[0]&1) || (mv[1]&1) || (mv[0]&2) || (mv[1]&2); // either of 2 lowest bits of mv set -> mv is fractional
-
-  if(fractional_mv) {
-    if (state->encoder_control->cfg->bipred && hi_prec_out){

kvazaar-0.8.3.tar.gz/src/inter.h -> kvazaar-1.0.0.tar.gz/src/inter.h Changed

@@ -26,11 +26,12 @@
  * Inter prediction.
  */
 
-#include "global.h"
-
-#include "image.h"
-#include "encoder.h"
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "image.h"
+#include "kvazaar.h"
+
 
 typedef struct {
   uint8_t dir;
@@ -60,17 +61,6 @@
                                 int16_t mv_param[2][2],
                                 lcu_t* lcu);
 
-void kvz_inter_get_spatial_merge_candidates(int32_t x,
-                                            int32_t y,
-                                            int32_t width,
-                                            int32_t height,
-                                            cu_info_t **b0,
-                                            cu_info_t **b1,
-                                            cu_info_t **b2,
-                                            cu_info_t **a0,
-                                            cu_info_t **a1,
-                                            lcu_t *lcu);
-
 void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t x,
                            int32_t y,
@@ -81,6 +71,15 @@
                            lcu_t *lcu,
                            int8_t reflist);
 
+void kvz_inter_get_mv_cand_cua(const encoder_state_t * const state,
+                               int32_t x,
+                               int32_t y,
+                               int32_t width,
+                               int32_t height,
+                               int16_t mv_cand[2][2],
+                               const cu_info_t* cur_cu,
+                               int8_t reflist);
+
 uint8_t kvz_inter_get_merge_cand(const encoder_state_t * const state,
                                  int32_t x, int32_t y,
                                  int32_t width, int32_t height,

kvazaar-0.8.3.tar.gz/src/intra.c -> kvazaar-1.0.0.tar.gz/src/intra.c Changed

@@ -20,48 +20,33 @@
 
 #include "intra.h"
 
-#include <assert.h>
-#include <stdio.h>
 #include <stdlib.h>
 
-#include "encoder.h"
-#include "transform.h"
+#include "image.h"
+#include "kvz_math.h"
 #include "strategies/strategies-intra.h"
-#include "strategies/strategies-picture.h"
+#include "tables.h"
+#include "transform.h"
+#include "videoframe.h"
 
 
 int8_t kvz_intra_get_dir_luma_predictor(
   const uint32_t x,
   const uint32_t y,
   int8_t *preds,
-  const cu_info_t *const cur_cu,
-  const cu_info_t *const left_cu,
-  const cu_info_t *const above_cu)
+  const cu_info_t *const cur_pu,
+  const cu_info_t *const left_pu,
+  const cu_info_t *const above_pu)
 {
-  int y_cu = y>>3;
-
   // The default mode if block is not coded yet is INTRA_DC.
   int8_t left_intra_dir  = 1;
-  int8_t above_intra_dir = 1;
-
-  if (x & 4) {
-    // If current CU is NxN and PU is on the right half, take mode from the
-    // left half of the same CU.
-    left_intra_dir = cur_cu->intra[PU_INDEX(0, y >> 2)].mode;
-  } else if (left_cu && left_cu->type == CU_INTRA) {
-    // Otherwise take the mode from the right side of the CU on the left.
-    left_intra_dir = left_cu->intra[PU_INDEX(1, y >> 2)].mode;
+  if (left_pu && left_pu->type == CU_INTRA) {
+    left_intra_dir = left_pu->intra.mode;
   }
 
-  if (y & 4) {
-    // If current CU is NxN and PU is on the bottom half, take mode from the
-    // top half of the same CU.
-    above_intra_dir = cur_cu->intra[PU_INDEX(x >> 2, 0)].mode;
-  } else if (above_cu && above_cu->type == CU_INTRA &&
-             (y_cu * (LCU_WIDTH>>MAX_DEPTH)) % LCU_WIDTH != 0)
-  {
-    // Otherwise take the mode from the bottom half of the CU above.
-    above_intra_dir = above_cu->intra[PU_INDEX(x >> 2, 1)].mode;
+  int8_t above_intra_dir = 1;
+  if (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) {
+    above_intra_dir = above_pu->intra.mode;
   }
 
   // If the predictions are the same, add new predictions
@@ -213,7 +198,8 @@
   int_fast8_t log2_width,
   int_fast8_t mode,
   color_t color,
-  kvz_pixel *dst)
+  kvz_pixel *dst,
+  bool filter_boundary)
 {
   const int_fast8_t width = 1 << log2_width;
 
@@ -227,7 +213,7 @@
     // Angular modes use smoothed reference pixels, unless the mode is close
     // to being either vertical or horizontal.
     static const int kvz_intra_hor_ver_dist_thres[5] = { 0, 7, 1, 0, 0 };
-    int filter_threshold = kvz_intra_hor_ver_dist_thres[g_to_bits[width]];
+    int filter_threshold = kvz_intra_hor_ver_dist_thres[kvz_math_floor_log2(width) - 2];
     int dist_from_vert_or_hor = MIN(abs(mode - 26), abs(mode - 10));
     if (dist_from_vert_or_hor > filter_threshold) {
       used_ref = &refs->filtered_ref;
@@ -249,7 +235,7 @@
     }
   } else {
     kvz_angular_pred(log2_width, mode, used_ref->top, used_ref->left, dst);
-    if (color == COLOR_Y && width < 32) {
+    if (color == COLOR_Y && width < 32 && filter_boundary) {
       if (mode == 10) {
         intra_post_process_angular(width, 1, used_ref->top, dst);
       } else if (mode == 26) {
@@ -449,12 +435,12 @@
     kvz_intra_recon_lcu_luma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
 
     if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
-      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
-      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
-      if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) {
-        cbf_set(&cur_cu->cbf.y, depth);
-      }
+      uint16_t child_cbfs[3] = {
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
+      };
+      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y);
     }
 
     return;
@@ -468,7 +454,9 @@
   kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs);
 
   kvz_pixel pred[32 * 32];
-  kvz_intra_predict(&refs, log2_width, intra_mode, COLOR_Y, pred);
+  const kvz_config *cfg = state->encoder_control->cfg;
+  bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm);
+  kvz_intra_predict(&refs, log2_width, intra_mode, COLOR_Y, pred, filter_boundary);
   
   kvz_pixel *block_in_lcu = &lcu->rec.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
   kvz_pixels_blit(pred, block_in_lcu, width, width, width, LCU_WIDTH);
@@ -502,18 +490,15 @@
     kvz_intra_recon_lcu_chroma(state, x,          y + offset, depth+1, intra_mode, NULL, lcu);
     kvz_intra_recon_lcu_chroma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
 
-    if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
-      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
-      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
-      if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) {
-        cbf_set(&cur_cu->cbf.u, depth);
-      }
-      if (cbf_is_set(cu_a->cbf.v, depth+1) || cbf_is_set(cu_b->cbf.v, depth+1) || cbf_is_set(cu_c->cbf.v, depth+1)) {
-        cbf_set(&cur_cu->cbf.v, depth);
-      }
+    if (depth <= MAX_DEPTH) {
+      uint16_t child_cbfs[3] = {
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
+      };
+      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U);
+      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V);
     }
-
     return;
   }
 
@@ -528,7 +513,7 @@
       kvz_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs);
 
       kvz_pixel pred[32 * 32];
-      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_U, pred);
+      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_U, pred, false);
 
       kvz_pixel *pu_in_lcu = &lcu->rec.u[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4];
       kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);
@@ -540,7 +525,7 @@
       kvz_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs);
       
       kvz_pixel pred[32 * 32];
-      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_V, pred);
+      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_V, pred, false);
 
       kvz_pixel *pu_in_lcu = &lcu->rec.v[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4];
       kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);

kvazaar-0.8.3.tar.gz/src/intra.h -> kvazaar-1.0.0.tar.gz/src/intra.h Changed

@@ -26,9 +26,11 @@
 * Intra prediction.
 */
 
-#include "global.h"
-
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+
 
 typedef struct {
   kvz_pixel left[2 * 32 + 1];
@@ -44,19 +46,21 @@
 
 /**
 * \brief Function for deriving intra luma predictions
-* \param pic picture to use
-* \param x_cu x CU position (smallest CU)
-* \param y_cu y CU position (smallest CU)
-* \param preds output buffer for 3 predictions
-* \returns (predictions are found)?1:0
+* \param x          x-coordinate of the PU in pixels
+* \param y          y-coordinate of the PU in pixels
+* \param preds      output buffer for 3 predictions
+* \param cur_pu     PU to check
+* \param left_pu    PU to the left of cur_pu
+* \param above_pu   PU above cur_pu
+* \returns          1 if predictions are found, otherwise 0
 */
 int8_t kvz_intra_get_dir_luma_predictor(
   const uint32_t x,
   const uint32_t y,
   int8_t *preds,
-  const cu_info_t *const cur_cu,
-  const cu_info_t *const left_cu,
-  const cu_info_t *const above_cu);
+  const cu_info_t *const cur_pu,
+  const cu_info_t *const left_pu,
+  const cu_info_t *const above_pu);
 
 /**
 * \brief Generate angular predictions.
@@ -78,18 +82,20 @@
 
 /**
  * \brief Generate intra predictions.
- * \param refs   Reference pixels used for the prediction.     
- * \param log2_width  Width of the predicted block.
- * \param mode   Intra mode used for the prediction.
- * \param color  Color of the prediction.
- * \param dst    Buffer for the predicted pixels.
+ * \param refs            Reference pixels used for the prediction.
+ * \param log2_width      Width of the predicted block.
+ * \param mode            Intra mode used for the prediction.
+ * \param color           Color of the prediction.
+ * \param dst             Buffer for the predicted pixels.
+ * \param filter_boundary Whether to filter the boundary on modes 10 and 26.
  */
 void kvz_intra_predict(
   kvz_intra_references *refs,
   int_fast8_t log2_width,
   int_fast8_t mode,
   color_t color,
-  kvz_pixel *dst);
+  kvz_pixel *dst,
+  bool filter_boundary);
 
 /**
  * \brief Do a full intra prediction cycle on a CU in lcu for luma.

kvazaar-0.8.3.tar.gz/src/kvazaar.c -> kvazaar-1.0.0.tar.gz/src/kvazaar.c Changed

@@ -18,17 +18,26 @@
 * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/
 
-#include "kvazaar_internal.h"
+#include "kvazaar.h"
 
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
+#include "bitstream.h"
 #include "cfg.h"
+#include "checkpoint.h"
 #include "encoder.h"
-#include "strategyselector.h"
+#include "encoder_state-bitstream.h"
+#include "encoder_state-ctors_dtors.h"
 #include "encoderstate.h"
-#include "checkpoint.h"
-#include "bitstream.h"
+#include "global.h"
+#include "image.h"
 #include "input_frame_buffer.h"
+#include "kvazaar_internal.h"
+#include "strategyselector.h"
+#include "threadqueue.h"
+#include "videoframe.h"
 
 
 static void kvazaar_close(kvz_encoder *encoder)
@@ -66,7 +75,9 @@
     goto kvazaar_open_failure;
   }
 
-  encoder->control = kvz_encoder_control_init(cfg);
+  // FIXME: const qualifier discarded. I don't want to change kvazaar_open
+  // but I really need to change cfg.
+  encoder->control = kvz_encoder_control_init((kvz_config*)cfg);
   if (!encoder->control) {
     goto kvazaar_open_failure;
   }
@@ -91,7 +102,7 @@
       goto kvazaar_open_failure;
     }
 
-    encoder->states[i].global->QP = (int8_t)cfg->qp;
+    encoder->states[i].frame->QP = (int8_t)cfg->qp;
   }
 
   for (int i = 0; i < encoder->num_encoder_states; ++i) {
@@ -103,7 +114,7 @@
     kvz_encoder_state_match_children_of_previous_frame(&encoder->states[i]);
   }
 
-  encoder->states[encoder->cur_state_num].global->frame = -1;
+  encoder->states[encoder->cur_state_num].frame->num = -1;
 
   return encoder;
 
@@ -115,10 +126,10 @@
 
 static void set_frame_info(kvz_frame_info *const info, const encoder_state_t *const state)
 {
-  info->poc = state->global->poc,
-  info->qp = state->global->QP;
-  info->nal_unit_type = state->global->pictype;
-  info->slice_type = state->global->slicetype;
+  info->poc = state->frame->poc,
+  info->qp = state->frame->QP;
+  info->nal_unit_type = state->frame->pictype;
+  info->slice_type = state->frame->slicetype;
   kvz_encoder_get_ref_lists(state, info->ref_list_len, info->ref_list);
 }
 
@@ -203,18 +214,19 @@
   encoder_state_t *state = &enc->states[enc->cur_state_num];
 
   if (!state->prepared) {
-    kvz_encoder_next_frame(state);
+    kvz_encoder_prepare(state);
   }
 
   if (pic_in != NULL) {
     // FIXME: The frame number printed here is wrong when GOP is enabled.
-    CHECKPOINT_MARK("read source frame: %d", state->global->frame + enc->control->cfg->seek);
+    CHECKPOINT_MARK("read source frame: %d", state->frame->num + enc->control->cfg->seek);
   }
 
-  if (kvz_encoder_feed_frame(&enc->input_buffer, state, pic_in)) {
-    assert(state->global->frame == enc->frames_started);
+  kvz_picture* frame = kvz_encoder_feed_frame(&enc->input_buffer, state, pic_in);
+  if (frame) {
+    assert(state->frame->num == enc->frames_started);
     // Start encoding.
-    kvz_encode_one_frame(state);
+    kvz_encode_one_frame(state, frame);
     enc->frames_started += 1;
   }
 
@@ -274,14 +286,14 @@
   struct {
     kvz_data_chunk* data_out;
     uint32_t len_out;
-  } first = { 0 }, second = { 0 };
+  } first = { 0, 0 }, second = { 0, 0 };
 
   if (pic_in != NULL) {
-    first_field = kvz_image_alloc(state->encoder_control->in.width, state->encoder_control->in.height);
+    first_field = kvz_image_alloc(state->encoder_control->chroma_format, state->encoder_control->in.width, state->encoder_control->in.height);
     if (first_field == NULL) {
       goto kvazaar_field_encoding_adapter_failure;
     }
-    second_field = kvz_image_alloc(state->encoder_control->in.width, state->encoder_control->in.height);
+    second_field = kvz_image_alloc(state->encoder_control->chroma_format, state->encoder_control->in.width, state->encoder_control->in.height);
     if (second_field == NULL) {
       goto kvazaar_field_encoding_adapter_failure;
     }
@@ -345,7 +357,7 @@
   .config_destroy = kvz_config_destroy,
   .config_parse = kvz_config_parse,
 
-  .picture_alloc = kvz_image_alloc,
+  .picture_alloc = kvz_image_alloc_420,
   .picture_free = kvz_image_free,
 
   .chunk_free = kvz_bitstream_free_chunks,
@@ -354,6 +366,8 @@
   .encoder_close = kvazaar_close,
   .encoder_headers = kvazaar_headers,
   .encoder_encode = kvazaar_field_encoding_adapter,
+
+  .picture_alloc_csp = kvz_image_alloc,
 };

kvazaar-0.8.3.tar.gz/src/kvazaar.h -> kvazaar-1.0.0.tar.gz/src/kvazaar.h Changed

@@ -26,9 +26,9 @@
  * This file defines the public API of Kvazaar when used as a library.
  */
 
-#include <stddef.h>
 #include <stdint.h>
 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -88,6 +88,10 @@
   KVZ_IME_HEXBS = 0,
   KVZ_IME_TZ = 1,
   KVZ_IME_FULL = 2,
+  KVZ_IME_FULL8 = 3, //! \since 3.6.0
+  KVZ_IME_FULL16 = 4, //! \since 3.6.0
+  KVZ_IME_FULL32 = 5, //! \since 3.6.0
+  KVZ_IME_FULL64 = 6, //! \since 3.6.0
 };
 
 /**
@@ -102,6 +106,92 @@
 };
 
 /**
+* \brief Constraints that can be placed on motion vectors.
+* \since 3.3.0
+*/
+enum kvz_mv_constraint
+{
+  KVZ_MV_CONSTRAIN_NONE = 0,
+  KVZ_MV_CONSTRAIN_FRAME = 1,  // Don't refer outside the frame.
+  KVZ_MV_CONSTRAIN_TILE = 2,  // Don't refer to other tiles.
+  KVZ_MV_CONSTRAIN_FRAME_AND_TILE = 3,  // Don't refer outside the tile.
+  KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN = 4,  // As above, but also keep enough margin that fractional pixel references stay inside the tile.
+};
+
+/**
+* \brief Picture hash algorithm to use (none, checksum or MD5).
+* \since 3.5.0
+*/
+enum kvz_hash
+{
+  KVZ_HASH_NONE = 0,
+  KVZ_HASH_CHECKSUM = 1,
+  KVZ_HASH_MD5 = 2,
+};
+
+/**
+* \brief CU split termination mode.
+* \since 3.8.0
+*/
+enum kvz_cu_split_termination
+{
+  KVZ_CU_SPLIT_TERMINATION_ZERO = 0,
+  KVZ_CU_SPLIT_TERMINATION_OFF = 1
+};
+
+/**
+* \brief Enable and disable crypto features. Each feature is a separate bit.
+* \since 3.7.0
+*/
+enum kvz_crypto_features {
+  KVZ_CRYPTO_OFF = 0,  // No feature bits set.
+  KVZ_CRYPTO_MVs = (1 << 0),
+  KVZ_CRYPTO_MV_SIGNS = (1 << 1),
+  KVZ_CRYPTO_TRANSF_COEFFS = (1 << 2),
+  KVZ_CRYPTO_TRANSF_COEFF_SIGNS = (1 << 3),
+  KVZ_CRYPTO_ON = (1 << 4) - 1,  // All four feature bits above set.
+};
+
+/**
+* \brief Motion estimation (ME) early termination mode.
+* \since 3.8.0
+*/
+enum kvz_me_early_termination
+{
+  KVZ_ME_EARLY_TERMINATION_OFF = 0,
+  KVZ_ME_EARLY_TERMINATION_ON = 1,
+  KVZ_ME_EARLY_TERMINATION_SENSITIVE = 2
+};
+
+
+/**
+ * \brief Format the pixels are read in.
+ * This is separate from chroma subsampling, because we might want to read
+ * interleaved formats in the future.
+ * \since 3.12.0
+ */
+enum kvz_input_format {  // KVZ_FORMAT2CSP uses these values as an index into a 4-entry table.
+  KVZ_FORMAT_P400 = 0,
+  KVZ_FORMAT_P420 = 1,
+  KVZ_FORMAT_P422 = 2,
+  KVZ_FORMAT_P444 = 3,
+};
+
+/**
+* \brief Chroma subsampling format used for encoding.
+* \since 3.12.0
+*/
+enum kvz_chroma_format {
+  KVZ_CSP_400 = 0,  // 4:0:0
+  KVZ_CSP_420 = 1,  // 4:2:0
+  KVZ_CSP_422 = 2,  // 4:2:2
+  KVZ_CSP_444 = 3,  // 4:4:4
+};
+
+// Map from input format to chroma format by indexing a string literal used as a byte table; format must be an enum kvz_input_format value (0..3).
+#define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format])
+
+/**
  * \brief GoP picture configuration.
  */
 typedef struct kvz_gop_config {
@@ -204,6 +294,31 @@
 
   int8_t mv_rdo;            /*!< \brief MV RDO calculation in search (0: estimation, 1: RDO). */
   int8_t calc_psnr;         /*!< \since 3.1.0 \brief Print PSNR in CLI. */
+
+  enum kvz_mv_constraint mv_constraint;  /*!< \since 3.3.0 \brief Constrain movement vectors. */
+  enum kvz_hash hash;  /*!< \since 3.5.0 \brief What hash algorithm to use. */
+
+  enum kvz_cu_split_termination cu_split_termination; /*!< \since 3.8.0 \brief Mode of cu split termination. */
+
+  enum kvz_crypto_features crypto_features; /*!< \since 3.7.0 */
+
+  enum kvz_me_early_termination me_early_termination; /*!< \since 3.8.0 \brief Mode of me early termination. */
+
+  int32_t lossless; /*!< \brief Use lossless coding. */
+
+  int32_t tmvp_enable; /*!< \brief Use Temporal Motion Vector Predictors. */
+
+  int32_t rdoq_skip; /*!< \brief Mode of rdoq skip */
+
+  enum kvz_input_format input_format; /*!< \brief Format the input pixels are read in. */
+  int32_t input_bitdepth; /*!< \brief Bit depth of the input pixels. */
+
+  struct {
+    unsigned d;  // depth
+    unsigned t;  // temporal
+  } gop_lp_definition;
+
+  int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */
 } kvz_config;
 
 /**
@@ -231,6 +346,7 @@
   int64_t dts;             //!< \brief Decompression timestamp.
 
   enum kvz_interlacing interlacing; //!< \since 3.2.0 \brief Field order for interlaced pictures.
+  enum kvz_chroma_format chroma_format;
 } kvz_picture;
 
 /**
@@ -508,6 +624,19 @@
                                   kvz_picture **pic_out,
                                   kvz_picture **src_out,
                                   kvz_frame_info *info_out);
+
+  /**
+   * \brief Allocate a kvz_picture.
+   *
+   * The returned kvz_picture should be deallocated by calling picture_free.
+   *
+   * \since 3.12.0
+   * \param chroma_fomat  Chroma subsampling to use.
+   * \param width   width of luma pixel array to allocate
+   * \param height  height of luma pixel array to allocate
+   * \return        allocated picture, or NULL if allocation failed.
+   */
+  kvz_picture * (*picture_alloc_csp)(enum kvz_chroma_format chroma_fomat, int32_t width, int32_t height);
 } kvz_api;

kvazaar-0.8.3.tar.gz/src/kvazaar_internal.h -> kvazaar-1.0.0.tar.gz/src/kvazaar_internal.h Changed

kvazaar-1.0.0.tar.gz/src/kvz_math.h Added

@@ -0,0 +1,55 @@
+#ifndef MATH_H_
+#define MATH_H_
+/*****************************************************************************
+* This file is part of Kvazaar HEVC encoder.
+*
+* Copyright (C) 2013-2015 Tampere University of Technology and others (see
+* COPYING file).
+*
+* Kvazaar is free software: you can redistribute it and/or modify it under
+* the terms of the GNU Lesser General Public License as published by the
+* Free Software Foundation; either version 2.1 of the License, or (at your
+* option) any later version.
+*
+* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+* FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+* more details.
+*
+* You should have received a copy of the GNU General Public License along
+* with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+****************************************************************************/
+
+/**
+* \file
+* Generic math functions
+*/
+
+#include "global.h" // IWYU pragma: keep
+
+
+static INLINE unsigned kvz_math_floor_log2(unsigned value) // Returns floor(log2(value)), i.e. the index of the highest set bit.
+{
+  assert(value > 0); // The logarithm of zero is undefined.
+  // Binary search over bit widths: test the upper 16 bits, then 8, 4, 2 and 1.
+  unsigned result = 0;
+  // Each pass that finds value >= 2^bits adds bits to the result and drops that many low bits.
+  for (int i = 4; i >= 0; --i) {
+    unsigned bits = 1ull << i; // 16, 8, 4, 2, 1
+    unsigned shift = value >= (1 << bits) ? bits : 0;
+    result += shift;
+    value >>= shift;
+  }
+
+  return result;
+}
+
+static INLINE unsigned kvz_math_ceil_log2(unsigned value) // Returns ceil(log2(value)); value must be non-zero.
+{
+  assert(value > 0);
+  // (value & (value - 1)) clears the lowest set bit, so it is zero only for exact powers of 2.
+  // The ceil_log2 is just floor_log2 + 1, except for exact powers of 2.
+  return kvz_math_floor_log2(value) + ((value & (value - 1)) ? 1 : 0);
+}
+
+#endif //MATH_H_

kvazaar-0.8.3.tar.gz/src/nal.c -> kvazaar-1.0.0.tar.gz/src/nal.c Changed

@@ -19,15 +19,10 @@
  ****************************************************************************/
 
 #include "nal.h"
-#include "strategyselector.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
 
 #include "bitstream.h"
-#include "cabac.h"
-#include "encoder.h"
+#include "strategies/strategies-nal.h"
+
 
 /**
  * \brief Write a Network Abstraction Layer (NAL) packet to the output.
@@ -72,6 +67,25 @@
   kvz_array_checksum(im->y, im->height, im->width, im->width, checksum_out[0], bitdepth);
 
   /* The number of chroma pixels is half that of luma. */
-  kvz_array_checksum(im->u, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[1], bitdepth);
-  kvz_array_checksum(im->v, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[2], bitdepth);
+  if (im->chroma_format != KVZ_CSP_400) {
+    kvz_array_checksum(im->u, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[1], bitdepth);
+    kvz_array_checksum(im->v, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[2], bitdepth);
+  }
+}
+
+/*!
+\brief Calculate md5 separately for each color plane of the picture.
+\param im The image that md5 is calculated for.
+\param checksum_out Result of the calculation: [0] for y, [1] for u, [2] for v; [1] and [2] are not written for KVZ_CSP_400.
+\param bitdepth Bit depth, passed through to kvz_array_md5.
+*/
+void kvz_image_md5(const kvz_picture *im, unsigned char checksum_out[][SEI_HASH_MAX_LENGTH], const uint8_t bitdepth)
+{
+  kvz_array_md5(im->y, im->height, im->width, im->width, checksum_out[0], bitdepth);
+  /* NOTE(review): every non-400 format is hashed with 4:2:0 dimensions; confirm 4:2:2 and 4:4:4 never reach this function. */
+  /* Chroma planes are hashed at half the luma width and half the luma height. */
+  if (im->chroma_format != KVZ_CSP_400) {
+    kvz_array_md5(im->u, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[1], bitdepth);
+    kvz_array_md5(im->v, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[2], bitdepth);
+  }
 }

kvazaar-0.8.3.tar.gz/src/nal.h -> kvazaar-1.0.0.tar.gz/src/nal.h Changed

kvazaar-0.8.3.tar.gz/src/rate_control.c -> kvazaar-1.0.0.tar.gz/src/rate_control.c Changed

@@ -22,6 +22,10 @@
 
 #include <math.h>
 
+#include "encoder.h"
+#include "kvazaar.h"
+
+
 static const int SMOOTHING_WINDOW = 40;
 
 /**
@@ -38,19 +42,19 @@
   const double bpp = state->stats_bitstream_length * 8 / pixels_per_picture;
   const double log_bpp = log(bpp);
 
-  const double alpha_old = state->global->rc_alpha;
-  const double beta_old = state->global->rc_beta;
+  const double alpha_old = state->frame->rc_alpha;
+  const double beta_old = state->frame->rc_beta;
   // lambda computed from real bpp
   const double lambda_comp = CLIP(0.1, 10000, alpha_old * pow(bpp, beta_old));
   // lambda used in encoding
-  const double lambda_real = state->global->cur_lambda_cost;
+  const double lambda_real = state->frame->cur_lambda_cost;
   const double lambda_log_ratio = log(lambda_real) - log(lambda_comp);
 
   const double alpha = alpha_old + 0.1 * lambda_log_ratio * alpha_old;
-  state->global->rc_alpha = CLIP(0.05, 20, alpha);
+  state->frame->rc_alpha = CLIP(0.05, 20, alpha);
 
   const double beta = beta_old + 0.05 * lambda_log_ratio * CLIP(-5, 1, log_bpp);
-  state->global->rc_beta = CLIP(-3, -0.1, beta);
+  state->frame->rc_beta = CLIP(-3, -0.1, beta);
 }
 
 /**
@@ -67,14 +71,14 @@
 
   // At this point, total_bits_coded of the current state contains the
   // number of bits written encoder->owf frames before the current frame.
-  int bits_coded = state->global->total_bits_coded;
-  int pictures_coded = MAX(0, state->global->frame - encoder->owf);
+  uint64_t bits_coded = state->frame->total_bits_coded;
+  int pictures_coded = MAX(0, state->frame->num - encoder->owf);
 
-  int gop_offset = (state->global->gop_offset - encoder->owf) % MAX(1, encoder->cfg->gop_len);
+  int gop_offset = (state->frame->gop_offset - encoder->owf) % MAX(1, encoder->cfg->gop_len);
   // Only take fully coded GOPs into account.
   if (encoder->cfg->gop_len > 0 && gop_offset != encoder->cfg->gop_len - 1) {
     // Subtract number of bits in the partially coded GOP.
-    bits_coded -= state->global->cur_gop_bits_coded;
+    bits_coded -= state->frame->cur_gop_bits_coded;
     // Subtract number of pictures in the partially coded GOP.
     pictures_coded -= gop_offset + 1;
   }
@@ -82,7 +86,7 @@
   double gop_target_bits =
     (encoder->target_avg_bppic * (pictures_coded + SMOOTHING_WINDOW) - bits_coded)
     * MAX(1, encoder->cfg->gop_len) / SMOOTHING_WINDOW;
-  state->global->cur_gop_target_bits = MAX(200, gop_target_bits);
+  state->frame->cur_gop_target_bits = MAX(200, gop_target_bits);
 }
 
 /**
@@ -95,12 +99,12 @@
   const encoder_control_t * const encoder = state->encoder_control;
 
   if (encoder->cfg->gop_len <= 0) {
-    return state->global->cur_gop_target_bits;
+    return state->frame->cur_gop_target_bits;
   }
 
   const double pic_weight = encoder->gop_layer_weights[
-    encoder->cfg->gop[state->global->gop_offset].layer - 1];
-  double pic_target_bits = state->global->cur_gop_target_bits * pic_weight;
+    encoder->cfg->gop[state->frame->gop_offset].layer - 1];
+  double pic_target_bits = state->frame->cur_gop_target_bits * pic_weight;
   return MAX(100, pic_target_bits);
 }
 
@@ -118,17 +122,20 @@
 
   assert(encoder->cfg->target_bitrate > 0);
 
-  if (state->global->frame > encoder->owf) {
+  if (state->frame->num > encoder->owf) {
     // At least one frame has been written.
     update_rc_parameters(state);
   }
 
-  if (encoder->cfg->gop_len == 0 || state->global->gop_offset == 0) {
+  if (encoder->cfg->gop_len == 0 ||
+      state->frame->gop_offset == 0 ||
+      state->frame->num == 0)
+  {
     // A new GOP begins at this frame.
     gop_allocate_bits(state);
   } else {
-    state->global->cur_gop_target_bits =
-      state->previous_encoder_state->global->cur_gop_target_bits;
+    state->frame->cur_gop_target_bits =
+      state->previous_encoder_state->frame->cur_gop_target_bits;
   }
 
   // TODO: take the picture headers into account
@@ -136,7 +143,7 @@
   const double target_bits_per_pixel =
     target_bits_current_picture / encoder->in.pixels_per_pic;
   const double lambda =
-    state->global->rc_alpha * pow(target_bits_per_pixel, state->global->rc_beta);
+    state->frame->rc_alpha * pow(target_bits_per_pixel, state->frame->rc_beta);
   return CLIP(0.1, 10000, lambda);
 }
 
@@ -160,9 +167,9 @@
   const int intra_period = state->encoder_control->cfg->intra_period;
   const int keyframe_period = gop_len > 0 ? gop_len : intra_period;
   
-  double lambda = pow(2.0, (state->global->QP - 12) / 3.0);
+  double lambda = pow(2.0, (state->frame->QP - 12) / 3.0);
 
-  if (state->global->slicetype == KVZ_SLICE_I) {
+  if (state->frame->slicetype == KVZ_SLICE_I) {
     lambda *= 0.57;
     
     // Reduce lambda for I-frames according to the number of references.
@@ -172,14 +179,14 @@
       lambda *= 1.0 - CLIP(0.0, 0.5, 0.05 * (keyframe_period - 1));
     }
   } else if (gop_len > 0) {
-    lambda *= state->global->QP_factor;
+    lambda *= state->frame->QP_factor;
   } else {
     lambda *= 0.4624;
   }
 
   // Increase lambda if not key-frame.
-  if (keyframe_period > 0 && state->global->poc % keyframe_period != 0) {
-    lambda *= CLIP(2.0, 4.0, (state->global->QP - 12) / 6.0);
+  if (keyframe_period > 0 && state->frame->poc % keyframe_period != 0) {
+    lambda *= CLIP(2.0, 4.0, (state->frame->QP - 12) / 6.0);
   }
   
   return lambda;

kvazaar-0.8.3.tar.gz/src/rate_control.h -> kvazaar-1.0.0.tar.gz/src/rate_control.h Changed

kvazaar-0.8.3.tar.gz/src/rdo.c -> kvazaar-1.0.0.tar.gz/src/rdo.c Changed

@@ -1,4 +1,4 @@
-/*****************************************************************************
+/*****************************************************************************
  * This file is part of Kvazaar HEVC encoder.
  *
  * Copyright (C) 2013-2015 Tampere University of Technology and others (see
@@ -18,17 +18,20 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include <stdio.h>
+#include "rdo.h"
+
 #include <stdlib.h>
 #include <string.h>
 
-#include "rdo.h"
-#include "transform.h"
-#include "context.h"
 #include "cabac.h"
-#include "transform.h"
-#include "strategies/strategies-quant.h"
+#include "context.h"
+#include "encode_coding_tree.h"
+#include "encoder.h"
+#include "imagelist.h"
 #include "inter.h"
+#include "scalinglist.h"
+#include "tables.h"
+#include "transform.h"
 
 
 #define QUANT_SHIFT          14
@@ -39,8 +42,6 @@
 const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
 const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
 
-
-#define CTX_ENTROPY_BITS(ctx,val) kvz_entropy_bits[(ctx)->uc_state ^ val]
 /**
  * Entropy bits to estimate coded bits in RDO / RDOQ (From HM 12.0)
  */
@@ -126,97 +127,6 @@
 };
 
 
-/**
- * \brief Function to compare RDO costs
- * \param rdo_costs array of current costs
- * \param cost new cost to check
- * \returns -1 if cost is worse than the one in the array or array position for worst cost
-
- This function derives the prediction samples for planar mode (intra coding).
-*/
-int kvz_intra_rdo_cost_compare(uint32_t *rdo_costs,int8_t rdo_modes_to_check, uint32_t cost)
-{
-  int i;
-  int found = 0;
-
-  for(i = 0; i < rdo_modes_to_check; i++) {
-    if(rdo_costs[i] > cost) {
-      found = 1;
-      break;
-    }
-  }
-  // Search for worst cost
-  if(found) {
-    uint32_t worst_cost = 0;
-    int worst_mode = -1;
-    for(i = 0; i < rdo_modes_to_check; i++) {
-      if(rdo_costs[i] > worst_cost) {
-        worst_cost = rdo_costs[i];
-        worst_mode = i;
-      }
-    }
-    return worst_mode;
-  }
-
-  return -1;
-}
-
-
-/**
- * \brief RDO function to calculate cost for intra
- * \returns cost to code pred block
-
- ** Only for luma
- */
-uint32_t kvz_rdo_cost_intra(encoder_state_t * const state, kvz_pixel *pred, kvz_pixel *orig_block, int width, int8_t mode, int tr_depth)
-{
-    const encoder_control_t * const encoder = state->encoder_control;
-    coeff_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-    int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-    int16_t temp_block[LCU_WIDTH*LCU_WIDTH>>2];
-    coeff_t temp_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-    int8_t luma_scan_mode = SCAN_DIAG;
-
-    int i = 0,x,y;
-    for (y = 0; y < width; y++) {
-      for (x = 0; x < width; x++) {
-        block[i++] = orig_block[x + y*width]- pred[x + y*width];
-      }
-    }
-    // Scan mode is diagonal, except for 4x4 and 8x8, where:
-    // - angular 6-14 = vertical
-    // - angular 22-30 = horizontal
-    if (width <= 8) {
-      if (mode >= 6 && mode <= 14) {
-        luma_scan_mode = SCAN_VER;
-      } else if (mode >= 22 && mode <= 30) {
-        luma_scan_mode = SCAN_HOR;
-      }
-    }
-    kvz_transform2d(encoder, block,pre_quant_coeff,width,0);
-    if(encoder->rdoq_enable) {
-      kvz_rdoq(state, pre_quant_coeff, temp_coeff, width, width, 0, luma_scan_mode, CU_INTRA, tr_depth);
-    } else {
-      kvz_quant(state, pre_quant_coeff, temp_coeff, width, width, 0, luma_scan_mode, CU_INTRA);
-    }
-    kvz_dequant(state, temp_coeff, pre_quant_coeff, width, width, 0, CU_INTRA);
-    kvz_itransform2d(encoder, temp_block,pre_quant_coeff,width,0);
-
-    unsigned ssd = 0;
-    // SSD between original and reconstructed
-    for (i = 0; i < width*width; i++) {
-      //int diff = temp_block[i]-block[i];
-      int diff = orig_block[i] - CLIP(0, PIXEL_MAX, pred[i] + temp_block[i]);
-
-      ssd += diff*diff;
-    }
-
-    double coeff_bits = kvz_get_coeff_cost(state, temp_coeff, width, 0, luma_scan_mode);
-
-    return (uint32_t)(0.5 + ssd + coeff_bits * state->global->cur_lambda_cost);
-}
-
-
 /** Calculate actual (or really close to actual) bitcost for coding coefficients
  * \param coeff coefficient array
  * \param width coeff block width
@@ -347,7 +257,7 @@
   cabac_ctx_t* base_sig_model = type?(cabac->ctx.cu_sig_model_chroma):(cabac->ctx.cu_sig_model_luma);
 
   if( !last && max_abs_level < 3 ) {
-    *coded_cost_sig = state->global->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
+    *coded_cost_sig = state->frame->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
     *coded_cost     = *coded_cost0 + *coded_cost_sig;
     if (max_abs_level == 0) return best_abs_level;
   } else {
@@ -355,13 +265,13 @@
   }
 
   if( !last ) {
-    cur_cost_sig = state->global->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
+    cur_cost_sig = state->frame->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
   }
 
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
     double err       = (double)(level_double - ( abs_level << q_bits ) );
-    double cur_cost  = err * err * temp + state->global->cur_lambda_cost *
+    double cur_cost  = err * err * temp + state->frame->cur_lambda_cost *
                        kvz_get_ic_rate( state, abs_level, ctx_num_one, ctx_num_abs,
                                     abs_go_rice, c1_idx, c2_idx, type);
     cur_cost        += cur_cost_sig;
@@ -398,7 +308,7 @@
   if( ctx_y > 3 ) {
     uiCost += 32768.0 * ((ctx_y-2)>>1);
   }
-  return state->global->cur_lambda_cost*uiCost;
+  return state->frame->cur_lambda_cost*uiCost;
 }
 
 static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t height, int8_t type,
@@ -432,6 +342,7 @@
   last_y_bits[ctx] = bits_y;
 }
 
+
 void kvz_rdoq_sign_hiding(const encoder_state_t *const state,
                       const int32_t qp_scaled,
                       const uint32_t *const scan,
@@ -444,36 +355,34 @@
                       coeff_t *const dest_coeff)
 {
   const encoder_control_t * const encoder = state->encoder_control;
-  const int32_t size = width * width;
-  
+
   int64_t rd_factor = (int64_t)(
     kvz_g_inv_quant_scales[qp_scaled % 6] * kvz_g_inv_quant_scales[qp_scaled % 6] * (1 << (2 * (qp_scaled / 6)))
-    / state->global->cur_lambda_cost / 16 / (1 << (2 * (encoder->bitdepth - 8)))
+    / state->frame->cur_lambda_cost / 16 / (1 << (2 * (encoder->bitdepth - 8)))
     + 0.5);
   int32_t lastCG = -1;
   int32_t absSum = 0;
-  int32_t n, subset;
 
-  for (subset = (size - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) {
+  for (int32_t subset = (width - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) {

kvazaar-0.8.3.tar.gz/src/rdo.h -> kvazaar-1.0.0.tar.gz/src/rdo.h Changed

@@ -26,23 +26,19 @@
  * Rate-Distortion Optimization related functionality.
  */
 
-#include "global.h"
-
-#include "encoder.h"
+#include "cabac.h"
+#include "cu.h"
 #include "encoderstate.h"
-#include "inter.h"
+#include "global.h" // IWYU pragma: keep
+#include "search_inter.h"
 
 
 extern const uint32_t kvz_g_go_rice_range[5];
 extern const uint32_t kvz_g_go_rice_prefix_len[5];
 
-int kvz_intra_rdo_cost_compare(uint32_t *rdo_costs,int8_t rdo_modes_to_check, uint32_t cost);
-
 void  kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
            int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);
 
-uint32_t kvz_rdo_cost_intra(encoder_state_t *state, kvz_pixel* pred, kvz_pixel* orig_block, int width, int8_t mode, int tr_depth);
-
 int32_t kvz_get_coeff_cost(const encoder_state_t *state, coeff_t *coeff, int32_t width, int32_t type, int8_t scan_mode);
 
 int32_t kvz_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_one, uint16_t ctx_num_abs,
@@ -54,12 +50,16 @@
                          uint32_t c1_idx, uint32_t c2_idx,
                          int32_t q_bits,double temp, int8_t last, int8_t type);
 
-int kvz_calc_mvd_cost_cabac(const encoder_state_t * const state, int x, int y, int mv_shift,
-  int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-  int16_t num_cand, int32_t ref_idx, uint32_t *bitcost);
-uint32_t kvz_get_mvd_coding_cost_cabac(vector2d_t *mvd, cabac_data_t* cabac);
+kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;
+
+uint32_t kvz_get_mvd_coding_cost_cabac(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac);
+
+// Fixed points fractional bits, 16b.16b
+extern const uint32_t kvz_entropy_bits[128];
+#define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(ctx)->uc_state ^ (val)]
 
+// Floating point fractional bits, derived from kvz_entropy_bits
 extern const float kvz_f_entropy_bits[128];
-#define CTX_ENTROPY_FBITS(ctx,val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]
+#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]
 
 #endif

kvazaar-0.8.3.tar.gz/src/sao.c -> kvazaar-1.0.0.tar.gz/src/sao.c Changed

@@ -19,97 +19,18 @@
  ****************************************************************************/
 
 #include "sao.h"
-#include "rdo.h"
-#include "strategies/strategies-picture.h"
 
-#include <string.h>
+#include <limits.h>
 #include <stdlib.h>
-#include <assert.h>
-
-// Offsets of a and b in relation to c.
-// dir_offset[dir][a or b]
-// |       |   a   | a     |     a |
-// | a c b |   c   |   c   |   c   |
-// |       |   b   |     b | b     |
-static const vector2d_t g_sao_edge_offsets[SAO_NUM_EO][2] = {
-  { { -1, 0 }, { 1, 0 } },
-  { { 0, -1 }, { 0, 1 } },
-  { { -1, -1 }, { 1, 1 } },
-  { { 1, -1 }, { -1, 1 } }
-};
-
-// Mapping of edge_idx values to eo-classes.
-
-
-static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c)
-{
-  // Mapping relationships between a, b and c to eo_idx.
-  static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
-
-  int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b);
-
-  return sao_eo_idx_to_eo_category[eo_idx];
-}
-
-
-int kvz_sao_band_ddistortion(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                         int block_width, int block_height,
-                         int band_pos, int sao_bands[4])
-{
-  int y, x;
-  int shift = state->encoder_control->bitdepth-5;
-  int sum = 0;
-
-  for (y = 0; y < block_height; ++y) {
-    for (x = 0; x < block_width; ++x) {
-      int band = (rec_data[y * block_width + x] >> shift) - band_pos;
-      int offset = 0;
-      if (band >= 0 && band < 4) {
-        offset = sao_bands[band];
-      }
-      if (offset != 0) {
-        int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x];
-        // Offset is applied to reconstruction, so it is subtracted from diff.
-        sum += (diff - offset) * (diff - offset) - diff * diff;
-      }
-    }
-  }
-
-  return sum;
-}
-
-
-int kvz_sao_edge_ddistortion(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                         int block_width, int block_height,
-                         int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES])
-{
-  int y, x;
-  int sum = 0;
-  vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0];
-  vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1];
-
-  for (y = 1; y < block_height - 1; ++y) {
-    for (x = 1; x < block_width - 1; ++x) {
-      const kvz_pixel *c_data = &rec_data[y * block_width + x];
-      kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
-      kvz_pixel c = c_data[0];
-      kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
-
-      int offset = offsets[sao_calc_eo_cat(a, b, c)];
-
-      if (offset != 0) {
-        int diff = orig_data[y * block_width + x] - c;
-        // Offset is applied to reconstruction, so it is subtracted from diff.
-        sum += (diff - offset) * (diff - offset) - diff * diff;
-      }
-    }
-  }
+#include <string.h>
 
-  return sum;
-}
+#include "cabac.h"
+#include "image.h"
+#include "rdo.h"
+#include "strategies/strategies-sao.h"
 
 
-void kvz_init_sao_info(sao_info_t *sao) {
+static void init_sao_info(sao_info_t *sao) {
   sao->type = SAO_TYPE_NONE;
   sao->merge_left_flag = 0;
   sao->merge_up_flag = 0;
@@ -240,7 +161,7 @@
 /**
  * \brief calculate an array of intensity correlations for each intensity value
  */
-static void calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i)
+void kvz_calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i)
 {
   int val;
   int values = (1<<encoder->bitdepth);
@@ -341,78 +262,6 @@
 
 
 /**
- * \param orig_data  Original pixel data. 64x64 for luma, 32x32 for chroma.
- * \param rec_data  Reconstructed pixel data. 64x64 for luma, 32x32 for chroma.
- * \param dir_offsets
- * \param is_chroma  0 for luma, 1 for chroma. Indicates
- */
-static void calc_sao_edge_dir(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                              int eo_class, int block_width, int block_height,
-                              int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
-{
-  int y, x;
-  vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0];
-  vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1];
-  // Arrays orig_data and rec_data are quarter size for chroma.
-
-  // Don't sample the edge pixels because this function doesn't have access to
-  // their neighbours.
-  for (y = 1; y < block_height - 1; ++y) {
-    for (x = 1; x < block_width - 1; ++x) {
-      const kvz_pixel *c_data = &rec_data[y * block_width + x];
-      kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
-      kvz_pixel c = c_data[0];
-      kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
-
-      int eo_cat = sao_calc_eo_cat(a, b, c);
-
-      cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c;
-      cat_sum_cnt[1][eo_cat] += 1;
-    }
-  }
-}
-
-static void sao_reconstruct_color(const encoder_control_t * const encoder, 
-                                  const kvz_pixel *rec_data, kvz_pixel *new_rec_data,
-                                  const sao_info_t *sao,
-                                  int stride, int new_stride,
-                                  int block_width, int block_height,
-                                  color_t color_i)
-{
-  int y, x;
-  // Arrays orig_data and rec_data are quarter size for chroma.
-  int offset_v = color_i == COLOR_V ? 5 : 0;
-
-  if(sao->type == SAO_TYPE_BAND) {
-    int offsets[1<<KVZ_BIT_DEPTH];
-    calc_sao_offset_array(encoder, sao, offsets, color_i);
-    for (y = 0; y < block_height; ++y) {
-      for (x = 0; x < block_width; ++x) {
-        new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
-      }
-    }
-  } else {
-    // Don't sample the edge pixels because this function doesn't have access to
-    // their neighbours.
-    for (y = 0; y < block_height; ++y) {
-      for (x = 0; x < block_width; ++x) {
-        vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
-        vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
-        const kvz_pixel *c_data = &rec_data[y * stride + x];
-        kvz_pixel *new_data = &new_rec_data[y * new_stride + x];
-        kvz_pixel a = c_data[a_ofs.y * stride + a_ofs.x];
-        kvz_pixel c = c_data[0];
-        kvz_pixel b = c_data[b_ofs.y * stride + b_ofs.x];
-
-        int eo_cat = sao_calc_eo_cat(a, b, c);
-
-        new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]);
-      }
-    }
-  }
-}
-
-/**
  * \brief Calculate dimensions of the buffer used by sao reconstruction.
 
  * \param pic  Picture.
@@ -575,7 +424,7 @@
                       tl.y + block.y + br.y,
                       pic_stride, buf_stride);
 
-  sao_reconstruct_color(encoder, &buf_rec[tl.y * buf_stride + tl.x],
+  kvz_sao_reconstruct_color(encoder, &buf_rec[tl.y * buf_stride + tl.x],
                         &new_rec[(ofs.y + tl.y) * lcu_stride + ofs.x + tl.x],

kvazaar-0.8.3.tar.gz/src/sao.h -> kvazaar-1.0.0.tar.gz/src/sao.h Changed

@@ -26,14 +26,14 @@
  * Sample Adaptive Offset filter.
  */
 
-#include "global.h"
-
 #include "checkpoint.h"
-#include "global.h"
-#include "videoframe.h"
+#include "cu.h"
 #include "encoder.h"
 #include "encoderstate.h"
-#include "math.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "videoframe.h"
+
 
 typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
 typedef enum { SAO_EO0 = 0, SAO_EO1, SAO_EO2, SAO_EO3, SAO_NUM_EO } sao_eo_class;
@@ -50,6 +50,20 @@
   int offsets[NUM_SAO_EDGE_CATEGORIES * 2];
 } sao_info_t;
 
+
+// Offsets of a and b in relation to c.
+// g_sao_edge_offsets[eo_class][0 for a, 1 for b]
+// |       |   a   | a     |     a |
+// | a c b |   c   |   c   |   c   |
+// |       |   b   |     b | b     |
+static const vector2d_t g_sao_edge_offsets[SAO_NUM_EO][2] = {
+  { { -1, 0 }, { 1, 0 } },   // SAO_EO0: a left of c, b right of c
+  { { 0, -1 }, { 0, 1 } },   // SAO_EO1: a above c, b below c
+  { { -1, -1 }, { 1, 1 } },  // SAO_EO2: a top-left, b bottom-right
+  { { 1, -1 }, { -1, 1 } }   // SAO_EO3: a top-right, b bottom-left
+};
+
+
 #define CHECKPOINT_SAO_INFO(prefix_str, sao) CHECKPOINT(prefix_str " type=%d eo_class=%d ddistortion=%d " \
   "merge_left_flag=%d merge_up_flag=%d band_position=%d " \
   "offsets[0]=%d offsets[1]=%d offsets[2]=%d offsets[3]=%d offsets[4]=%d", \
@@ -58,12 +72,11 @@
   (sao).offsets[0], (sao).offsets[1], (sao).offsets[2], (sao).offsets[3], (sao).offsets[4])
 
 
-void kvz_init_sao_info(sao_info_t *sao);
-void kvz_sao_search_chroma(const encoder_state_t * state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]);
-void kvz_sao_search_luma(const encoder_state_t * state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]);
 void kvz_sao_reconstruct(const encoder_control_t * encoder, videoframe_t *frame, const kvz_pixel *old_rec,
                      unsigned x_ctb, unsigned y_ctb,
                      const sao_info_t *sao, color_t color_i);
 void kvz_sao_reconstruct_frame(encoder_state_t *state);
+void kvz_sao_search_lcu(const encoder_state_t* const state, int lcu_x, int lcu_y);
+void kvz_calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i);
 
 #endif

kvazaar-0.8.3.tar.gz/src/scalinglist.c -> kvazaar-1.0.0.tar.gz/src/scalinglist.c Changed

kvazaar-0.8.3.tar.gz/src/scalinglist.h -> kvazaar-1.0.0.tar.gz/src/scalinglist.h Changed

kvazaar-0.8.3.tar.gz/src/search.c -> kvazaar-1.0.0.tar.gz/src/search.c Changed

@@ -20,18 +20,22 @@
 
 #include "search.h"
 
-#include <stdio.h>
-#include <stdlib.h>
+#include <limits.h>
 #include <string.h>
-#include <assert.h>
 
-#include "intra.h"
+#include "cabac.h"
+#include "encoder.h"
+#include "imagelist.h"
 #include "inter.h"
+#include "intra.h"
+#include "kvazaar.h"
 #include "rdo.h"
-#include "transform.h"
 #include "search_inter.h"
 #include "search_intra.h"
-#include "strategies/strategies-picture.h"
+#include "threadqueue.h"
+#include "transform.h"
+#include "videoframe.h"
+
 
 #define IN_FRAME(x, y, width, height, block_width, block_height) \
   ((x) >= 0 && (y) >= 0 \
@@ -43,10 +47,7 @@
 # define INTRA_TRESHOLD 20
 #endif
 
-// Disable early cu-split pruning.
-#ifndef FULL_CU_SPLIT_SEARCH
-#  define FULL_CU_SPLIT_SEARCH false
-#endif
+
 // Modify weight of luma SSD.
 #ifndef LUMA_MULT
 # define LUMA_MULT 0.8
@@ -66,14 +67,13 @@
 
   // Copy non-reference CUs.
   {
-    const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
-    const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
-    const int width_cu = LCU_WIDTH >> MAX_DEPTH >> depth;
-    int x, y;
-    for (y = y_cu; y < y_cu + width_cu; ++y) {
-      for (x = x_cu; x < x_cu + width_cu; ++x) {
-        const cu_info_t *from_cu = LCU_GET_CU(&work_tree[depth + 1], x, y);
-        cu_info_t *to_cu = LCU_GET_CU(&work_tree[depth], x, y);
+    const int x_orig = SUB_SCU(x_px);
+    const int y_orig = SUB_SCU(y_px);
+    const int width_cu = LCU_WIDTH >> depth;
+    for (int y = y_orig; y < y_orig + width_cu; y += SCU_WIDTH) {
+      for (int x = x_orig; x < x_orig + width_cu; x += SCU_WIDTH) {
+        const cu_info_t *from_cu = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x, y);
+        cu_info_t *to_cu = LCU_GET_CU_AT_PX(&work_tree[depth], x, y);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
     }
@@ -94,20 +94,24 @@
     lcu_coeff_t *to_coeff = &work_tree[depth].coeff;
 
     kvz_pixels_blit(&from->y[luma_index], &to->y[luma_index],
-                        width_px, width_px, LCU_WIDTH, LCU_WIDTH);
-    kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index],
-                        width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-    kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index],
-                        width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+                    width_px, width_px, LCU_WIDTH, LCU_WIDTH);
+    if (from->chroma_format != KVZ_CSP_400) {
+      kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index],
+                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+      kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index],
+                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+    }
 
     // Copy coefficients up. They do not have to be copied down because they
     // are not used for the search.
     kvz_coefficients_blit(&from_coeff->y[luma_index], &to_coeff->y[luma_index],
-                        width_px, width_px, LCU_WIDTH, LCU_WIDTH);
-    kvz_coefficients_blit(&from_coeff->u[chroma_index], &to_coeff->u[chroma_index],
-                        width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-    kvz_coefficients_blit(&from_coeff->v[chroma_index], &to_coeff->v[chroma_index],
-                        width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+                          width_px, width_px, LCU_WIDTH, LCU_WIDTH);
+    if (from->chroma_format != KVZ_CSP_400) {
+      kvz_coefficients_blit(&from_coeff->u[chroma_index], &to_coeff->u[chroma_index],
+                            width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+      kvz_coefficients_blit(&from_coeff->v[chroma_index], &to_coeff->v[chroma_index],
+                            width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+    }
   }
 }
 
@@ -125,15 +129,13 @@
   int d;
 
   for (d = depth + 1; d < MAX_PU_DEPTH + 1; ++d) {
-    const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
-    const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
-    const int width_cu = width_px >> MAX_DEPTH;
-
-    int x, y;
-    for (y = y_cu; y < y_cu + width_cu; ++y) {
-      for (x = x_cu; x < x_cu + width_cu; ++x) {
-        const cu_info_t *from_cu = LCU_GET_CU(&work_tree[depth], x, y);
-        cu_info_t *to_cu = LCU_GET_CU(&work_tree[d], x, y);
+    const int x_orig = SUB_SCU(x_px);
+    const int y_orig = SUB_SCU(y_px);
+
+    for (int y = y_orig; y < y_orig + width_px; y += SCU_WIDTH) {
+      for (int x = x_orig; x < x_orig + width_px; x += SCU_WIDTH) {
+        const cu_info_t *from_cu = LCU_GET_CU_AT_PX(&work_tree[depth], x, y);
+        cu_info_t *to_cu = LCU_GET_CU_AT_PX(&work_tree[d], x, y);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
     }
@@ -151,27 +153,28 @@
     lcu_yuv_t *to = &work_tree[d].rec;
 
     kvz_pixels_blit(&from->y[luma_index], &to->y[luma_index],
-                        width_px, width_px, LCU_WIDTH, LCU_WIDTH);
-    kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index],
-                        width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-    kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index],
-                        width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+                    width_px, width_px, LCU_WIDTH, LCU_WIDTH);
+    if (from->chroma_format != KVZ_CSP_400) {
+      kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index],
+                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+      kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index],
+                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+    }
   }
 }
 
 
 void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth)
 {
-  const int width_cu = LCU_CU_WIDTH >> depth;
-  const vector2d_t lcu_cu = { SUB_SCU(x_px) / 8, SUB_SCU(y_px) / 8 };
-  int x, y;
+  const int width = LCU_WIDTH >> depth;
+  const vector2d_t lcu_cu = { SUB_SCU(x_px), SUB_SCU(y_px) };
 
   // Depth 4 doesn't go inside the loop. Set the top-left CU.
-  LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y)->tr_depth = tr_depth;
+  LCU_GET_CU_AT_PX(lcu, lcu_cu.x, lcu_cu.y)->tr_depth = tr_depth;
 
-  for (y = 0; y < width_cu; ++y) {
-    for (x = 0; x < width_cu; ++x) {
-      cu_info_t *cu = LCU_GET_CU(lcu, lcu_cu.x + x, lcu_cu.y + y);
+  for (unsigned y = 0; y < width; y += SCU_WIDTH) {
+    for (unsigned x = 0; x < width; x += SCU_WIDTH) {
+      cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, lcu_cu.x + x, lcu_cu.y + y);
       cu->tr_depth = tr_depth;
     }
   }
@@ -180,48 +183,41 @@
 
 static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pred_mode, int chroma_mode, int part_mode)
 {
-  const int width_cu = LCU_CU_WIDTH >> depth;
-  const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
-  const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
-  int x, y;
+  const int width = LCU_WIDTH >> depth;
+  const int x_cu  = SUB_SCU(x_px);
+  const int y_cu  = SUB_SCU(y_px);
 
-  // NxN can only be applied to a single CU at a time.
   if (part_mode == SIZE_NxN) {
-    cu_info_t *cu = LCU_GET_CU(lcu, x_cu, y_cu);
-    cu->depth = MAX_DEPTH;
-    cu->type = CU_INTRA;
-    cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode = pred_mode;
-    cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode_chroma = chroma_mode;
-    cu->part_size = part_mode;
-    return;
+    assert(depth == MAX_DEPTH + 1);
+    assert(width == SCU_WIDTH);
+  }
+
+  if (depth > MAX_DEPTH) {
+    depth = MAX_DEPTH;
+    assert(part_mode == SIZE_NxN);
   }
 
   // Set mode in every CU covered by part_mode in this depth.
-  for (y = y_cu; y < y_cu + width_cu; ++y) {
-    for (x = x_cu; x < x_cu + width_cu; ++x) {
-      cu_info_t *cu = LCU_GET_CU(lcu, x, y);
+  for (int y = y_cu; y < y_cu + width; y += SCU_WIDTH) {
+    for (int x = x_cu; x < x_cu + width; x += SCU_WIDTH) {
+      cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y);
       cu->depth = depth;

kvazaar-0.8.3.tar.gz/src/search.h -> kvazaar-1.0.0.tar.gz/src/search.h Changed

kvazaar-0.8.3.tar.gz/src/search_inter.c -> kvazaar-1.0.0.tar.gz/src/search_inter.c Changed

@@ -20,62 +20,197 @@
 
 #include "search_inter.h"
 
+#include <limits.h>
 #include <stdlib.h>
 
+#include "cabac.h"
+#include "encoder.h"
+#include "image.h"
+#include "imagelist.h"
 #include "inter.h"
-#include "strategies/strategies-picture.h"
-#include "strategies/strategies-ipol.h"
+#include "kvazaar.h"
 #include "rdo.h"
+#include "strategies/strategies-ipol.h"
+#include "strategies/strategies-picture.h"
+#include "videoframe.h"
+
+
+/**
+ * \return  True if the block referred to by (x, y), given in quarter-pixel units, satisfies the configured MV constraint and the WPP limit.
+ */
+static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit)
+{
+  if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_NONE) {
+    return (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2));
+  };
+
+  int margin = 0;
+  if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) {
+    // Enforce a margin of 16 quarter-pixel units (4 full pixels) from the frame edges checked below.
+    margin = 4 * 4;
+  }
+
+  // TODO: implement KVZ_MV_CONSTRAIN_FRAME and KVZ_MV_CONSTRAIN_TILE.
+  const vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y };
+
+  // Check that both margin and wpp_limit constraints are satisfied.
+  if (abs_mv.x >= margin && abs_mv.x + (width << 2) <= (state->tile->frame->width << 2) - margin &&
+      abs_mv.y >= margin && abs_mv.y + (height << 2) <= (state->tile->frame->height << 2) - margin &&
+      (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2)))
+  {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+
+static INLINE int get_wpp_limit(const encoder_state_t *state, const vector2d_t* orig)
+{
+  const encoder_control_t *ctrl = state->encoder_control;
+  if (ctrl->owf && ctrl->wpp) {
+    // Limit motion vectors to the LCU-row below this row.
+    // To avoid fractional pixel interpolation depending on things outside
+    // this range, add a margin of 4 pixels.
+    // - fme needs 4 pixels
+    // - odd chroma interpolation needs 4 pixels
+    int wpp_limit = 2 * LCU_WIDTH - 4 - orig->y % LCU_WIDTH;
+    if (ctrl->deblock_enable && !ctrl->sao_enable) {
+      // As a special case, when deblocking is enabled but SAO is not, we have
+      // to avoid the possibility of interpolation filters reaching the
+      // non-deblocked pixels. The deblocking for the horizontal edge on the
+      // LCU boundary can reach 4 pixels. If SAO is enabled, this WPP-row
+      // depends on the SAO job, which depends on the deblocking having
+      // already been done.
+      wpp_limit -= 4;
+    }
+    return wpp_limit;
+  } else {
+    return -1;
+  }
+}
+
+
+/**
+ * \return  Result of fracmv_within_tile() for the integer-pixel MV (x, y), which is scaled to quarter-pixel units first.
+ */
+static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit)
+{
+  return fracmv_within_tile(state, orig, x << 2, y << 2, width, height, wpp_limit);
+}
+
+
+static unsigned get_ep_ex_golomb_bitcost(unsigned symbol)
+{
+  // Calculate 2 * log2(symbol + 2)
+
+  unsigned bins = 0;
+  symbol += 2;
+  if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; }
+  if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; }
+  if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; }
+  if (symbol >= 1 << 1) { bins += 2; }
+
+  // TODO: It might be a good idea to put a small slope on this function to
+  // make sure any search function that follows the gradient heads towards
+  // a smaller MVD, but that would require fractional costs and bits being
+  // used everywhere in inter search.
+  // return bins + 0.001 * symbol;
+
+  return bins;
+}
 
 
-static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
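+/** \brief Check whether mv equals the MV (shifted right by 2) of any merge candidate; candidates with dir == 3 are skipped.
+ * \return  True if a matching candidate is found, otherwise false.
+ */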
+static bool mv_in_merge(const inter_merge_cand_t* merge_cand, int16_t num_cand, const vector2d_t* mv)
 {
-  int32_t num_bins = 0;
-  while (symbol >= (uint32_t)(1 << count)) {
-    ++num_bins;
-    symbol -= 1 << count;
-    ++count;
+  for (int i = 0; i < num_cand; ++i) {
+    if (merge_cand[i].dir == 3) continue;
+    const vector2d_t merge_mv = {
+      merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2,
+      merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2
+    };
+    if (merge_mv.x == mv->x && merge_mv.y == mv->y) {
+      return true;
+    }
   }
-  num_bins ++;
+  return false;
+}
+
+
+static unsigned select_starting_point(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state,
+                                      const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref,
+                                      int16_t mv_cand[2][2], int32_t ref_idx, unsigned best_cost, unsigned *best_index, uint32_t *best_bitcost,
+                                      kvz_mvd_cost_func *calc_mvd){
+  // Go through candidates
+  for (unsigned i = 0; i < num_cand; ++i) {
+    if (merge_cand[i].dir == 3) continue;
+    mv->x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
+    mv->y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
 
-  return num_bins;
+    if (mv->x == 0 && mv->y == 0) continue;
+    if (!intmv_within_tile(state, orig, mv->x, mv->y, width, height, wpp_limit)) {
+      continue;
+    }
+
+    uint32_t bitcost = 0;
+    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
+      (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x,
+      (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y,
+      width, height, -1);
+    cost += calc_mvd(state, mv->x, mv->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+
+    if (cost < best_cost) {
+      best_cost = cost;
+      *best_index = i;
+      *best_bitcost = bitcost;
+    }
+  }  
+  if (*best_index < num_cand) {
+    mv->x = merge_cand[*best_index].mv[merge_cand[*best_index].dir - 1][0] >> 2;
+    mv->y = merge_cand[*best_index].mv[merge_cand[*best_index].dir - 1][1] >> 2;
+  } else if (*best_index == num_cand) {
+    mv->x = mv_in_out->x >> 2;
+    mv->y = mv_in_out->y >> 2;
+  } else {
+    mv->x = 0;
+    mv->y = 0;
+  }
+  return best_cost;
 }
 
 
-static uint32_t get_mvd_coding_cost(vector2d_t *mvd, cabac_data_t* cabac)
+static uint32_t get_mvd_coding_cost(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac)
 {
-  uint32_t bitcost = 0;
-  const int32_t mvd_hor = mvd->x;
-  const int32_t mvd_ver = mvd->y;
-  const int8_t hor_abs_gr0 = mvd_hor != 0;
-  const int8_t ver_abs_gr0 = mvd_ver != 0;
-  const uint32_t mvd_hor_abs = abs(mvd_hor);
-  const uint32_t mvd_ver_abs = abs(mvd_ver);
-
-  // Greater than 0 for x/y
-  bitcost += 2;
-
-  if (hor_abs_gr0) {
-    if (mvd_hor_abs > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs-2, 1) - 2; // TODO: tune the costs
+  unsigned bitcost = 0;
+  const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) };
+
+  bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.x > 0);
+  if (abs_mvd.x > 0) {
+    bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.x > 1);
+    if (abs_mvd.x > 1) {

kvazaar-0.8.3.tar.gz/src/search_inter.h -> kvazaar-1.0.0.tar.gz/src/search_inter.h Changed

@@ -26,16 +26,51 @@
  * Inter prediction parameter search.
  */
 
-#include "global.h"
-
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "inter.h"
+#include "kvazaar.h"
+
+#define FILTER_SIZE 8
+#define HALF_FILTER (FILTER_SIZE>>1)
+
+// Maximum extra width a block needs to filter 
+// a fractional pixel with positive fractional mv.x and mv.y
+#define KVZ_EXT_PADDING (FILTER_SIZE - 1)
+
+// Maximum block width for extended block
+#define KVZ_EXT_BLOCK_W (LCU_WIDTH + KVZ_EXT_PADDING)
+
+typedef kvz_pixel frac_search_block[(LCU_WIDTH + 1) * (LCU_WIDTH + 1)];
+
+enum hpel_position {
+  HPEL_POS_HOR = 0,
+  HPEL_POS_VER = 1,
+  HPEL_POS_DIA = 2
+};
+
+typedef int kvz_mvd_cost_func(encoder_state_t * const state,
+                              int x, int y,
+                              int mv_shift,
+                              int16_t mv_cand[2][2],
+                              inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                              int16_t num_cand,
+                              int32_t ref_idx,
+                              uint32_t *bitcost);
 
-int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu);
+void kvz_search_cu_inter(encoder_state_t * const state,
+                         int x, int y, int depth,
+                         lcu_t *lcu,
+                         double *inter_cost,
+                         uint32_t *inter_bitcost);
 
-int kvz_search_cu_smp(const encoder_state_t * const state,
-                      int x, int y,
-                      int depth,
-                      part_mode_t part_mode,
-                      lcu_t *lcu);
+void kvz_search_cu_smp(encoder_state_t * const state,
+                       int x, int y,
+                       int depth,
+                       part_mode_t part_mode,
+                       lcu_t *lcu,
+                       double *inter_cost,
+                       uint32_t *inter_bitcost);
 
 #endif // SEARCH_INTER_H_

kvazaar-0.8.3.tar.gz/src/search_intra.c -> kvazaar-1.0.0.tar.gz/src/search_intra.c Changed

@@ -20,12 +20,18 @@
 
 #include "search_intra.h"
 
+#include <limits.h>
+
+#include "cabac.h"
+#include "encoder.h"
 #include "encoderstate.h"
-#include "videoframe.h"
-#include "strategies/strategies-picture.h"
+#include "image.h"
+#include "intra.h"
+#include "kvazaar.h"
 #include "rdo.h"
 #include "search.h"
-#include "intra.h"
+#include "strategies/strategies-picture.h"
+#include "videoframe.h"
 
 
 // Normalize SAD for comparison against SATD to estimate transform skip
@@ -105,10 +111,13 @@
     // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
     const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
     double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
-    ctx = &state->cabac.ctx.transform_skip_model_chroma;
-    trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
 
-    double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + state->global->cur_lambda_cost_sqrt * trskip_bits;
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      ctx = &state->cabac.ctx.transform_skip_model_chroma;
+      trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+    }
+
+    double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + state->frame->cur_lambda_cost_sqrt * trskip_bits;
     if (sad_cost < satd_cost) {
       return sad_cost;
     }
@@ -145,14 +154,17 @@
     // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
     const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
     double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
-    ctx = &state->cabac.ctx.transform_skip_model_chroma;
-    trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      ctx = &state->cabac.ctx.transform_skip_model_chroma;
+      trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+    }
 
     unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 };
     double sad_costs[PARALLEL_BLKS] = { 0 };
     sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs);
     for (int i = 0; i < PARALLEL_BLKS; ++i) {
-      sad_costs[i] = TRSKIP_RATIO * (double)unsigned_sad_costs[i] + state->global->cur_lambda_cost_sqrt * trskip_bits;
+      sad_costs[i] = TRSKIP_RATIO * (double)unsigned_sad_costs[i] + state->frame->cur_lambda_cost_sqrt * trskip_bits;
       if (sad_costs[i] < (double)satd_costs[i]) {
         costs_out[i] = sad_costs[i];
       }
@@ -189,14 +201,14 @@
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
-  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
+  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4) && state->encoder_control->chroma_format != KVZ_CSP_400;
 
   struct {
     kvz_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH];
     kvz_pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH];
     kvz_pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH];
   } nosplit_pixels;
-  cu_cbf_t nosplit_cbf = { .y = 0, .u = 0, .v = 0 };
+  uint16_t nosplit_cbf = 0;
 
   double split_cost = INT32_MAX;
   double nosplit_cost = INT32_MAX;
@@ -207,14 +219,14 @@
 
     nosplit_cost = 0.0;
 
-    cbf_clear(&pred_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4));
+    cbf_clear(&pred_cu->cbf, depth, COLOR_Y);
 
     kvz_intra_recon_lcu_luma(state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
     nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
 
     if (reconstruct_chroma) {
-      cbf_clear(&pred_cu->cbf.u, depth);
-      cbf_clear(&pred_cu->cbf.v, depth);
+      cbf_clear(&pred_cu->cbf, depth, COLOR_U);
+      cbf_clear(&pred_cu->cbf, depth, COLOR_V);
 
       kvz_intra_recon_lcu_chroma(state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
       nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
@@ -242,7 +254,7 @@
   //     max_depth.
   // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->global->cur_lambda_cost;
+    split_cost = 3 * state->frame->cur_lambda_cost;
 
     split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
     if (split_cost < nosplit_cost) {
@@ -271,20 +283,20 @@
     // if this and any previous transform block has no chroma coefficients.
     // When searching the first block we don't actually know the real values,
     // so this will code cbf as 0 and not code the cbf at all for descendants.
-    {
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
       const uint8_t tr_depth = depth - pred_cu->depth;
 
       const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
-      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
+      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
       }
-      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth));
+      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
       }
     }
 
     double bits = tr_split_bit + cbf_bits;
-    split_cost += bits * state->global->cur_lambda_cost;
+    split_cost += bits * state->frame->cur_lambda_cost;
   } else {
     assert(width <= TR_MAX_WIDTH);
   }
@@ -337,7 +349,7 @@
   kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
     if (modes[i] == luma_mode) continue;
-    kvz_intra_predict(refs_u, log2_width_c, modes[i], COLOR_U, pred);
+    kvz_intra_predict(refs_u, log2_width_c, modes[i], COLOR_U, pred, false);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
@@ -345,7 +357,7 @@
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
     if (modes[i] == luma_mode) continue;
-    kvz_intra_predict(refs_v, log2_width_c, modes[i], COLOR_V, pred);
+    kvz_intra_predict(refs_v, log2_width_c, modes[i], COLOR_V, pred, false);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
@@ -398,6 +410,9 @@
   cost_pixel_nxn_multi_func *satd_dual_func = kvz_pixels_get_satd_dual_func(width);
   cost_pixel_nxn_multi_func *sad_dual_func = kvz_pixels_get_sad_dual_func(width);
 
+  const kvz_config *cfg = state->encoder_control->cfg;
+  const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm);
+
   // Temporary block arrays
   kvz_pixel _preds[PARALLEL_BLKS * 32 * 32 + SIMD_ALIGNMENT];
   pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT);
@@ -428,7 +443,9 @@
     
     double costs_out[PARALLEL_BLKS] = { 0 };
     for (int i = 0; i < PARALLEL_BLKS; ++i) {
-      if (mode + i * offset <= 34) kvz_intra_predict(refs, log2_width, mode + i * offset, COLOR_Y, preds[i]);
+      if (mode + i * offset <= 34) {
+        kvz_intra_predict(refs, log2_width, mode + i * offset, COLOR_Y, preds[i], filter_boundary);
+      }
     }
     
     //TODO: add generic version of get cost  multi
@@ -465,7 +482,9 @@
 
       if (mode_in_range) {
         for (int i = 0; i < PARALLEL_BLKS; ++i) {
-          if (test_modes[i] >= 2 && test_modes[i] <= 34) kvz_intra_predict(refs, log2_width, test_modes[i], COLOR_Y, preds[i]);
+          if (test_modes[i] >= 2 && test_modes[i] <= 34) {
+            kvz_intra_predict(refs, log2_width, test_modes[i], COLOR_Y, preds[i], filter_boundary);
+          }
         }
 
         //TODO: add generic version of get cost multi
@@ -501,7 +520,7 @@
     }
 
     if (!has_mode) {
-      kvz_intra_predict(refs, log2_width, mode, COLOR_Y, preds[0]);
+      kvz_intra_predict(refs, log2_width, mode, COLOR_Y, preds[0], filter_boundary);
       costs[modes_selected] = get_cost(state, preds[0], orig_block, satd_func, sad_func, width);
       modes[modes_selected] = mode;
       ++modes_selected;
@@ -510,7 +529,7 @@
 
   // Add prediction mode coding cost as the last thing. We don't want this
   // affecting the halving search.
-  int lambda_cost = (int)(state->global->cur_lambda_cost_sqrt + 0.5);
+  int lambda_cost = (int)(state->frame->cur_lambda_cost_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
     costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds);
   }
@@ -581,18 +600,15 @@

kvazaar-0.8.3.tar.gz/src/search_intra.h -> kvazaar-1.0.0.tar.gz/src/search_intra.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/altivec/picture-altivec.c -> kvazaar-1.0.0.tar.gz/src/strategies/altivec/picture-altivec.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/altivec/picture-altivec.h -> kvazaar-1.0.0.tar.gz/src/strategies/altivec/picture-altivec.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/avx2/dct-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/dct-avx2.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/avx2/dct-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/dct-avx2.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/avx2/intra-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/intra-avx2.c Changed

@@ -18,14 +18,15 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "strategies/avx2/intra-avx2.h"
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+#include <immintrin.h>
 #include <stdlib.h>
 
-#include "intra-avx2.h"
+#include "kvazaar.h"
 #include "strategyselector.h"
 
-#if COMPILE_INTEL_AVX2 && defined X86_64
-#include <immintrin.h>
-#include "strategies/strategies-common.h"
 
  /**
  * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register.
@@ -250,14 +251,14 @@
       int rx = 0;
       int ry = y;
 
-      row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-      row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-      row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-      row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-      row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-      row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-      row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-      row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+      row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
+      row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
+      row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
+      row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
+      row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
+      row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
+      row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
+      row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
 
       _mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0));
       _mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1));
@@ -341,14 +342,14 @@
       } else {
 
         //Move all filtered pixels to the lower lane to reduce memory accesses
-        row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-        row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-        row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-        row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-        row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-        row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-        row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-        row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+        row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
+        row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
+        row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
+        row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
+        row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
+        row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
+        row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
+        row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
 
         _mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0));
         _mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1));

kvazaar-0.8.3.tar.gz/src/strategies/avx2/intra-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/intra-avx2.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/avx2/ipol-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/ipol-avx2.c Changed

@@ -22,17 +22,19 @@
 * \file
 */
 
-#include "ipol-avx2.h"
-#include "strategyselector.h"
+#include "strategies/avx2/ipol-avx2.h"
 
 #if COMPILE_INTEL_AVX2
-#include <stdlib.h>
-
 #include <immintrin.h>
-
+#include <stdio.h>
+#include <string.h>
 
 #include "encoder.h"
+#include "kvazaar.h"
 #include "strategies/generic/picture-generic.h"
+#include "strategies/strategies-ipol.h"
+#include "strategyselector.h"
+#include "strategies/generic/ipol-generic.h"
 
 
 #define FILTER_OFFSET 3
@@ -62,6 +64,235 @@
   _mm_storeu_si128(dst, a);
 }
 
+static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *filter, int32_t offset23, int32_t shift23)
+{ /* Runs one 8-tap filter over eight vectors of 16-bit samples and returns the
+   * eight results as 8-bit values.
+   *
+   * row       eight vectors; vector i holds the eight 16-bit inputs of output i
+   * filter    eight signed 8-bit taps (sign-extended to 16 bits below)
+   * offset23  rounding term added to every 32-bit sum before shifting
+   * shift23   arithmetic right shift applied after adding offset23
+   *
+   * Returns outputs 0..7 in the low 8 bytes; the final pack repeats the same
+   * eight bytes in the high half of the vector. */
+  __m128i temp[8];
+  __m128i temp_lo;
+  __m128i temp_hi;
+  __m128i fir = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); // 8 taps widened to 16 bits
+
+  temp[0] = _mm_madd_epi16(row[0], fir); // four 32-bit partial sums per row
+  temp[1] = _mm_madd_epi16(row[1], fir);
+  temp_lo = _mm_unpacklo_epi32(temp[0], temp[1]);
+  temp_hi = _mm_unpackhi_epi32(temp[0], temp[1]);
+  temp[0] = _mm_add_epi32(temp_lo, temp_hi); // two partial sums each for rows 0 and 1, interleaved
+
+  temp[2] = _mm_madd_epi16(row[2], fir);
+  temp[3] = _mm_madd_epi16(row[3], fir);
+  temp_lo = _mm_unpacklo_epi32(temp[2], temp[3]);
+  temp_hi = _mm_unpackhi_epi32(temp[2], temp[3]);
+  temp[2] = _mm_add_epi32(temp_lo, temp_hi); // same for rows 2 and 3
+
+  temp[4] = _mm_madd_epi16(row[4], fir);
+  temp[5] = _mm_madd_epi16(row[5], fir);
+  temp_lo = _mm_unpacklo_epi32(temp[4], temp[5]);
+  temp_hi = _mm_unpackhi_epi32(temp[4], temp[5]);
+  temp[4] = _mm_add_epi32(temp_lo, temp_hi); // same for rows 4 and 5
+
+  temp[6] = _mm_madd_epi16(row[6], fir);
+  temp[7] = _mm_madd_epi16(row[7], fir);
+  temp_lo = _mm_unpacklo_epi32(temp[6], temp[7]);
+  temp_hi = _mm_unpackhi_epi32(temp[6], temp[7]);
+  temp[6] = _mm_add_epi32(temp_lo, temp_hi); // same for rows 6 and 7
+
+  temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]);
+  temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]);
+  temp[0] = _mm_add_epi32(temp_lo, temp_hi); // full sums in the order row 0, 2, 1, 3
+  temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); // reorder to row 0, 1, 2, 3
+
+  temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]);
+  temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]);
+  temp[4] = _mm_add_epi32(temp_lo, temp_hi); // full sums in the order row 4, 6, 5, 7
+  temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); // reorder to row 4, 5, 6, 7
+
+  __m128i add = _mm_set1_epi32(offset23);
+  temp[0] = _mm_add_epi32(temp[0], add);
+  temp[4] = _mm_add_epi32(temp[4], add);
+  temp[0] = _mm_srai_epi32(temp[0], shift23);
+  temp[4] = _mm_srai_epi32(temp[4], shift23);
+
+  temp[0] = _mm_packus_epi32(temp[0], temp[4]); // rows 0..7 as 16 bits, unsigned saturation
+  temp[0] = _mm_packus_epi16(temp[0], temp[0]); // down to 8 bits, unsigned saturation; both halves identical
+
+  return temp[0];
+}
+
+static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t *filter[2], int32_t offset23, int32_t shift23)
+{ /* 256-bit variant of the 8-tap 16-bit row filter: the low 128-bit lane is
+   * filtered with filter[0] and the high lane with filter[1].
+   *
+   * row       eight vectors; in each lane, vector i holds the eight 16-bit
+   *           inputs of output i for that lane
+   * filter    two arrays of eight signed 8-bit taps
+   * offset23  rounding term added to every 32-bit sum before shifting
+   * shift23   arithmetic right shift applied after adding offset23
+   *
+   * Every step below works within a 128-bit lane, so each lane ends up with
+   * its own outputs 0..7 as 8-bit values in its low 8 bytes, repeated in its
+   * high 8 bytes. */
+  __m256i temp[8];
+  __m256i temp_lo;
+  __m256i temp_hi;
+  __m256i fir = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)filter[0]), _mm_loadl_epi64((__m128i*)filter[1]))); // low lane: filter[0] taps, high lane: filter[1] taps
+
+  temp[0] = _mm256_madd_epi16(row[0], fir); // four 32-bit partial sums per row and lane
+  temp[1] = _mm256_madd_epi16(row[1], fir);
+  temp_lo = _mm256_unpacklo_epi32(temp[0], temp[1]);
+  temp_hi = _mm256_unpackhi_epi32(temp[0], temp[1]);
+  temp[0] = _mm256_add_epi32(temp_lo, temp_hi); // two partial sums each for rows 0 and 1, interleaved
+
+  temp[2] = _mm256_madd_epi16(row[2], fir);
+  temp[3] = _mm256_madd_epi16(row[3], fir);
+  temp_lo = _mm256_unpacklo_epi32(temp[2], temp[3]);
+  temp_hi = _mm256_unpackhi_epi32(temp[2], temp[3]);
+  temp[2] = _mm256_add_epi32(temp_lo, temp_hi); // same for rows 2 and 3
+
+  temp[4] = _mm256_madd_epi16(row[4], fir);
+  temp[5] = _mm256_madd_epi16(row[5], fir);
+  temp_lo = _mm256_unpacklo_epi32(temp[4], temp[5]);
+  temp_hi = _mm256_unpackhi_epi32(temp[4], temp[5]);
+  temp[4] = _mm256_add_epi32(temp_lo, temp_hi); // same for rows 4 and 5
+
+  temp[6] = _mm256_madd_epi16(row[6], fir);
+  temp[7] = _mm256_madd_epi16(row[7], fir);
+  temp_lo = _mm256_unpacklo_epi32(temp[6], temp[7]);
+  temp_hi = _mm256_unpackhi_epi32(temp[6], temp[7]);
+  temp[6] = _mm256_add_epi32(temp_lo, temp_hi); // same for rows 6 and 7
+
+  temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]);
+  temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]);
+  temp[0] = _mm256_add_epi32(temp_lo, temp_hi); // full sums in the order row 0, 2, 1, 3
+  temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); // reorder to row 0, 1, 2, 3
+
+  temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]);
+  temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]);
+  temp[4] = _mm256_add_epi32(temp_lo, temp_hi); // full sums in the order row 4, 6, 5, 7
+  temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); // reorder to row 4, 5, 6, 7
+
+  __m256i add = _mm256_set1_epi32(offset23);
+  temp[0] = _mm256_add_epi32(temp[0], add);
+  temp[4] = _mm256_add_epi32(temp[4], add);
+  temp[0] = _mm256_srai_epi32(temp[0], shift23);
+  temp[4] = _mm256_srai_epi32(temp[4], shift23);
+
+  temp[0] = _mm256_packus_epi32(temp[0], temp[4]); // per lane: rows 0..7 as 16 bits, unsigned saturation
+  temp[0] = _mm256_packus_epi16(temp[0], temp[0]); // per lane: down to 8 bits, unsigned saturation
+
+  return temp[0];
+}
+
+/*
+static __m128i kvz_eight_tap_filter_flip_x8_avx2(__m128i *row, int8_t *filter,  int32_t shift1)
+{
+  __m128i temp[4];
+  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter));
+  
+  temp[0] = _mm_unpacklo_epi64(row[0], row[1]);
+  temp[0] = _mm_maddubs_epi16(temp[0], fir);
+
+  temp[1] = _mm_unpacklo_epi64(row[2], row[3]);
+  temp[1] = _mm_maddubs_epi16(temp[1], fir);
+
+  temp[0] = _mm_hadd_epi16(temp[0], temp[1]);
+
+  temp[2] = _mm_unpacklo_epi64(row[4], row[5]);
+  temp[2] = _mm_maddubs_epi16(temp[2], fir);
+
+  temp[3] = _mm_unpacklo_epi64(row[6], row[7]);
+  temp[3] = _mm_maddubs_epi16(temp[3], fir);
+  
+  temp[2] = _mm_hadd_epi16(temp[2], temp[3]);
+
+  temp[0] = _mm_hadd_epi16(temp[0], temp[2]);
+
+  temp[0] = _mm_srai_epi16(temp[0], shift1);
+
+  return temp[0];
+}
+*/
+
+static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filter[2],  int32_t shift1)
+{ /* Runs an 8-tap filter over eight rows of 8-bit samples in each 128-bit
+   * lane: the low lane uses filter[0], the high lane uses filter[1].
+   *
+   * row     eight vectors; only the low 8 bytes of each lane are used, read
+   *         as unsigned bytes
+   * filter  two arrays of eight signed 8-bit taps
+   * shift1  arithmetic right shift applied to each 16-bit result
+   *
+   * Returns, per lane, the results for rows 0..7 as eight 16-bit values.
+   * The byte products are pair-summed with signed 16-bit saturation
+   * (maddubs); the later horizontal adds do not saturate. */
+  __m256i temp[4];
+  __m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1); // low lane: filter[0], high lane: filter[1]
+  fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(1, 0, 1, 0)); // repeat the 8 taps in both halves of each lane
+  
+  temp[0] = _mm256_unpacklo_epi64(row[0], row[1]); // rows 0 and 1 side by side in each lane
+  temp[0] = _mm256_maddubs_epi16(temp[0], fir);
+
+  temp[1] = _mm256_unpacklo_epi64(row[2], row[3]);
+  temp[1] = _mm256_maddubs_epi16(temp[1], fir);
+
+  temp[0] = _mm256_hadd_epi16(temp[0], temp[1]); // two partial sums each for rows 0..3
+
+  temp[2] = _mm256_unpacklo_epi64(row[4], row[5]);
+  temp[2] = _mm256_maddubs_epi16(temp[2], fir);
+
+  temp[3] = _mm256_unpacklo_epi64(row[6], row[7]);
+  temp[3] = _mm256_maddubs_epi16(temp[3], fir);
+  
+  temp[2] = _mm256_hadd_epi16(temp[2], temp[3]); // two partial sums each for rows 4..7
+
+  temp[0] = _mm256_hadd_epi16(temp[0], temp[2]); // one full sum per row, rows 0..7 in order
+
+  temp[0] = _mm256_srai_epi16(temp[0], shift1);
+
+  return temp[0];
+}
+
+/*
+static INLINE void kvz_filter_flip_shift_x8_avx2(kvz_pixel *src, int16_t src_stride, int8_t *filter, int32_t shift1, int16_t *dst){
+
+  __m128i rows[8];
+  rows[0] = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride));
+  rows[1] = _mm_loadl_epi64((__m128i*)(src + 1 * src_stride));

kvazaar-0.8.3.tar.gz/src/strategies/avx2/ipol-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/ipol-avx2.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/picture-avx2.c Changed

@@ -21,13 +21,16 @@
 /*
  * \file
  */
-#include "picture-avx2.h"
-#include "strategyselector.h"
+#include "strategies/avx2/picture-avx2.h"
 
 #if COMPILE_INTEL_AVX2
-#  include "image.h"
-#  include "strategies/strategies-common.h"
-#  include <immintrin.h>
+#include <immintrin.h>
+#include <string.h>
+
+#include "kvazaar.h"
+#include "strategies/strategies-picture.h"
+#include "strategyselector.h"
+#include "strategies/generic/picture-generic.h"
 
 
 /**
@@ -171,9 +174,9 @@
 
   row3 = _mm_add_epi16(row2, row3);
 
-  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
-  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
-  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
+  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) ));
+  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
+  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
 
   unsigned sum = _mm_extract_epi16(row3, 0);
   unsigned satd = (sum + 1) >> 1;
@@ -218,9 +221,9 @@
 
   row3 = _mm256_add_epi16(row2, row3);
 
-  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
-  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
-  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
 
   unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
   sum1 = (sum1 + 1) >> 1;
@@ -237,18 +240,18 @@
   __m128i mask_pos = _mm_set1_epi16(1);
   __m128i mask_neg = _mm_set1_epi16(-1);
   __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
-  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
+  __m128i temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 
   sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
-  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
+  temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 
   sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
-  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
-  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
+  temp = _mm_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1));
+  temp = _mm_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 }
@@ -258,18 +261,18 @@
   __m256i mask_pos = _mm256_set1_epi16(1);
   __m256i mask_neg = _mm256_set1_epi16(-1);
   __m256i sign_mask = _mm256_unpacklo_epi64(mask_pos, mask_neg);
-  __m256i temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
+  __m256i temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 
   sign_mask = _mm256_unpacklo_epi32(mask_pos, mask_neg);
-  temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
+  temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 
   sign_mask = _mm256_unpacklo_epi16(mask_pos, mask_neg);
-  temp = _mm256_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
-  temp = _mm256_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
+  temp = _mm256_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1));
+  temp = _mm256_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 }
@@ -353,8 +356,8 @@
   haddwd_accumulate_avx2(&sad, ver_row + 6);
   haddwd_accumulate_avx2(&sad, ver_row + 7);
 
-  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
-  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
+  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2)));
+  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1)));
 
   return _mm_cvtsi128_si32(sad);
 }
@@ -371,8 +374,8 @@
   haddwd_accumulate_dual_avx2(&sad, ver_row + 6);
   haddwd_accumulate_dual_avx2(&sad, ver_row + 7);
 
-  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
-  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
+  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2)));
+  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1)));
 
   *sum0 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 0));
   *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1));
@@ -451,6 +454,45 @@
   hor_transform_row_dual_avx2((*row_diff) + 7);
 }
 
+/**
+ * \brief  Compute the 8x8 SATD of two blocks against one original at once.
+ *
+ * The differences of buf1 and buf2 against orig are built by
+ * diff_blocks_dual_avx2, run through the horizontal and vertical transform
+ * helpers and summed by sum_block_dual_avx2. Each sum is then divided by
+ * four with rounding.
+ *
+ * \param buf1, stride1        first block and its stride
+ * \param buf2, stride2        second block and its stride
+ * \param orig, stride_orig    block both inputs are compared against
+ * \param sum0                 receives the result for buf1
+ * \param sum1                 receives the result for buf2
+ */
+static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned stride1,
+                                                const kvz_pixel * buf2, unsigned stride2,
+                                                const kvz_pixel * orig, unsigned stride_orig,
+                                                unsigned *sum0, unsigned *sum1)
+{
+  __m256i diff_rows[8];
+
+  // Differences first, then both transform passes in place.
+  diff_blocks_dual_avx2(&diff_rows, buf1, stride1, buf2, stride2, orig, stride_orig);
+  hor_transform_block_dual_avx2(&diff_rows);
+  ver_transform_block_dual_avx2(&diff_rows);
+
+  sum_block_dual_avx2(diff_rows, sum0, sum1);
+
+  // Rounded division by four, finishing sum0 before touching sum1.
+  *sum0 += 2;
+  *sum0 >>= 2;
+  *sum1 += 2;
+  *sum1 >>= 2;
+}
+
+/**
+ * \brief  SATD between two 4x4 blocks that live inside larger arrays.
+ *
+ * There is no AVX2 kernel for this size yet, so the work is handed to the
+ * generic implementation unchanged.
+ *
+ * \param buf1, stride1   first block and its stride
+ * \param buf2, stride2   second block and its stride
+ * \return  whatever kvz_satd_4x4_subblock_generic returns for these inputs
+ */
+static unsigned kvz_satd_4x4_subblock_8bit_avx2(const kvz_pixel * buf1,
+                                                const int32_t     stride1,
+                                                const kvz_pixel * buf2,
+                                                const int32_t     stride2)
+{
+  // TODO: AVX2 implementation
+  const unsigned satd = kvz_satd_4x4_subblock_generic(buf1, stride1, buf2, stride2);
+  return satd;
+}
+
+/**
+ * \brief  4x4 SATD of four predictions against one original block.
+ *
+ * Placeholder for a future AVX2 kernel: all arguments are passed straight
+ * through to the generic implementation, which fills costs[].
+ */
+static void kvz_satd_4x4_subblock_quad_avx2(
+    const kvz_pixel *preds[4],
+    const int strides[4],
+    const kvz_pixel *orig,
+    const int orig_stride,
+    unsigned costs[4])
+{
+  // TODO: AVX2 implementation
+  kvz_satd_4x4_subblock_quad_generic(
+      preds, strides,
+      orig, orig_stride,
+      costs);
+}
+
 static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2)
 {
   __m128i temp[8];
@@ -465,6 +507,15 @@
   return result;
 }
 
+/**
+ * \brief  8x8 SATD of four predictions against one original block.
+ *
+ * The dual kernel handles two predictions per call, so the four inputs are
+ * processed as the pairs (0, 1) and (2, 3).
+ *
+ * \param preds        four prediction blocks
+ * \param strides      stride of each prediction block
+ * \param orig         original block
+ * \param orig_stride  stride of the original block
+ * \param costs        receives one result per prediction
+ */
+static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds,
+  const int *strides,
+  const kvz_pixel *orig,
+  const int orig_stride,
+  unsigned *costs)
+{
+  for (int first = 0; first < 4; first += 2) {
+    const int second = first + 1;
+    kvz_satd_8bit_8x8_general_dual_avx2(preds[first], strides[first],
+                                        preds[second], strides[second],
+                                        orig, orig_stride,
+                                        &costs[first], &costs[second]);
+  }
+}
 
 SATD_NxN(8bit_avx2,  8)
 SATD_NxN(8bit_avx2, 16)
@@ -472,25 +523,6 @@
 SATD_NxN(8bit_avx2, 64)
 SATD_ANY_SIZE(8bit_avx2)
 
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned stride1,
-                                                const kvz_pixel * buf2, unsigned stride2,
-                                                const kvz_pixel * orig, unsigned stride_orig,
-                                                unsigned *sum0, unsigned *sum1)
-{
-  __m256i temp[8];
-
-  diff_blocks_dual_avx2(&temp, buf1, stride1, buf2, stride2, orig, stride_orig);
-  hor_transform_block_dual_avx2(&temp);
-  ver_transform_block_dual_avx2(&temp);
-  
-  sum_block_dual_avx2(temp, sum0, sum1);
-
-  *sum0 = (*sum0 + 2) >> 2;
-  *sum1 = (*sum1 + 2) >> 2;
-}

kvazaar-0.8.3.tar.gz/src/strategies/avx2/picture-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/picture-avx2.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/quant-avx2.c Changed

@@ -22,25 +22,29 @@
 * \file
 */
 
+#include "strategies/avx2/quant-avx2.h"
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+#include <immintrin.h>
 #include <stdlib.h>
 
-#include "quant-avx2.h"
-#include "../generic/quant-generic.h"
-#include "../strategies-common.h"
-#include "strategyselector.h"
+#include "cu.h"
 #include "encoder.h"
-#include "transform.h"
+#include "encoderstate.h"
+#include "kvazaar.h"
 #include "rdo.h"
+#include "scalinglist.h"
+#include "strategies/generic/quant-generic.h"
+#include "strategies/strategies-quant.h"
+#include "strategyselector.h"
+#include "tables.h"
+#include "transform.h"
 
-#if COMPILE_INTEL_AVX2 && defined X86_64
-#include <immintrin.h>
-#include <smmintrin.h>
 
 /**
-* \brief quantize transformed coefficents
-*
-*/
-
+ * \brief quantize transformed coefficients
+ *
+ */
 void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
   int32_t height, int8_t type, int8_t scan_idx, int8_t block_type)
 {
@@ -48,13 +52,13 @@
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6);
   const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
   const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6];
   const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform
   const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift;
-  const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
+  const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
   const int32_t q_bits8 = q_bits - 8;
 
   assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t
@@ -96,8 +100,8 @@
   }
 
   __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1));
-  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1)));
-  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0)));
+  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(1, 0, 3, 2)));
+  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1)));
   ac_sum += _mm_cvtsi128_si32(temp);
 
   if (!(encoder->sign_hiding && ac_sum >= 2)) return;
@@ -376,7 +380,7 @@
   }
 
   // Quantize coeffs. (coeff -> quant_coeff)
-  if (state->encoder_control->rdoq_enable) {
+  if (state->encoder_control->rdoq_enable && (width > 4 || !state->encoder_control->cfg->rdoq_skip)) {
     int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
     tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
     kvz_rdoq(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
@@ -453,7 +457,7 @@
   int32_t n;
   int32_t transform_shift = 15 - encoder->bitdepth - (kvz_g_convert_to_bit[ width ] + 2);
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth-8)*6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth-8)*6);
 
   shift = 20 - QUANT_SHIFT - transform_shift;

kvazaar-0.8.3.tar.gz/src/strategies/avx2/quant-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/quant-avx2.h Changed

kvazaar-1.0.0.tar.gz/src/strategies/avx2/sao-avx2.c Added

@@ -0,0 +1,358 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "strategies/avx2/sao-avx2.h"
+
+#if COMPILE_INTEL_AVX2
+#include <immintrin.h>
+
+#include "cu.h"
+#include "encoder.h"
+#include "encoderstate.h"
+#include "kvazaar.h"
+#include "sao.h"
+#include "strategyselector.h"
+
+
+// These optimizations are based heavily on sao-generic.c.
+// Might be useful to check that if (when) this file
+// is difficult to understand.
+
+
+// Loads six ints: offsets[0..3] go to the low 128-bit lane, offsets[4..5] to
+// the bottom of the high lane. The top two 32-bit elements are zero.
+static INLINE __m256i load_6_offsets(const int* offsets){
+
+  const __m128i v_first_four = _mm_loadu_si128((__m128i*) offsets);
+  const __m128i v_last_two = _mm_loadl_epi64((__m128i*)&(offsets[4]));
+
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(v_first_four), v_last_two, 1);
+}
+
+// Reads six bytes starting at data (one 32-bit and one 16-bit access) into
+// the low six bytes of a vector; every byte above them is zero.
+static INLINE __m128i load_6_pixels(const kvz_pixel* data){
+
+  const int32_t first_four = *(int32_t*)&(data[0]);
+  const int16_t last_two = *(int16_t*)&(data[4]);
+
+  return _mm_insert_epi16(_mm_cvtsi32_si128(first_four), last_two, 2);
+}
+
+// Loads five ints: offsets[0..3] go to the low 128-bit lane and offsets[4]
+// to the lowest element of the high lane. The remaining elements are zero.
+static INLINE __m256i load_5_offsets(const int* offsets){
+
+  const __m128i v_first_four = _mm_loadu_si128((__m128i*) offsets);
+  const __m128i v_fifth = _mm_insert_epi32(_mm_setzero_si128(), offsets[4], 0);
+
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(v_first_four), v_fifth, 1);
+}
+
+
+/**
+ * \brief  Edge offset category for eight pixels at once.
+ *
+ * The low eight bytes of a, b and c are widened to 16 bits. For every pixel
+ * the index 2 + sign(c - a) + sign(c - b) is formed (range 0..4) and mapped
+ * through the table {1, 2, 0, 3, 4}.
+ *
+ * \param a  first neighbour pixels
+ * \param b  second neighbour pixels
+ * \param c  centre pixels
+ * \return  one category byte (0..4) per pixel in the low eight bytes; the
+ *          high eight bytes repeat them
+ */
+static __m128i sao_calc_eo_cat_avx2(__m128i* a, __m128i* b, __m128i* c)
+{
+  const __m128i v_one = _mm_set1_epi16(1);
+
+  const __m128i v_a16 = _mm_cvtepu8_epi16(*a);
+  const __m128i v_c16 = _mm_cvtepu8_epi16(*c);
+  const __m128i v_b16 = _mm_cvtepu8_epi16(*b);
+
+  // -1, 0 or +1 depending on how the centre compares with each neighbour
+  const __m128i v_sign_ca = _mm_sign_epi16(v_one, _mm_sub_epi16(v_c16, v_a16));
+  const __m128i v_sign_cb = _mm_sign_epi16(v_one, _mm_sub_epi16(v_c16, v_b16));
+
+  __m128i v_idx = _mm_add_epi16(_mm_set1_epi16(2), v_sign_ca);
+  v_idx = _mm_add_epi16(v_idx, v_sign_cb);
+
+  // Narrow to bytes so the index can drive a byte shuffle of the table
+  const __m128i v_idx8 = _mm_packus_epi16(v_idx, v_idx);
+  const __m128i v_table = _mm_setr_epi8(1,2,0,3,4,0,0,0,0,0,0,0,0,0,0,0);
+
+  return _mm_shuffle_epi8(v_table, v_idx8);
+}
+
+
+/**
+ * \brief  Change in squared error caused by applying SAO edge offsets.
+ *
+ * For each processed pixel, with diff = orig - rec and offset =
+ * offsets[category], the value (diff - offset)^2 - diff^2 is accumulated.
+ * The category comes from sao_calc_eo_cat_avx2 using the two neighbours
+ * selected by eo_class. Border rows and columns are not sampled.
+ *
+ * Both buffers are indexed with block_width as the row stride. Each row is
+ * handled as 8-pixel groups starting at x = 1 plus a final group of six.
+ * NOTE(review): this layout presumably expects block_width to be a multiple
+ * of 8 - confirm against the callers.
+ *
+ * \param orig_data     original pixels
+ * \param rec_data      reconstructed pixels
+ * \param block_width   width of the block, also the stride of both buffers
+ * \param block_height  height of the block
+ * \param eo_class      index into g_sao_edge_offsets
+ * \param offsets       offset per edge category; entries 0..4 are read
+ * \return  accumulated change over the processed pixels
+ */
+int kvz_sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+                         int block_width, int block_height,
+                         int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES])
+{
+  int y, x;
+  int sum = 0;
+  vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0];
+  vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1];
+
+  __m256i v_accum = _mm256_setzero_si256();
+
+  // sao_calc_eo_cat_avx2 only produces categories 0..4, so five offsets are
+  // all the permute below can ever select. Loading exactly five keeps the
+  // read inside the array; the previous 8-int and 6-int loads reached past
+  // entry 4. The values do not change per pixel, so load them once.
+  const __m256i v_offsets = load_5_offsets(offsets);
+
+  for (y = 1; y < block_height - 1; ++y) {
+
+    for (x = 1; x < block_width - 8; x+=8) {
+      const kvz_pixel *c_data = &rec_data[y * block_width + x];
+
+      __m128i v_c = _mm_loadl_epi64((const __m128i*)c_data);
+      __m128i v_a = _mm_loadl_epi64((const __m128i*)(&c_data[a_ofs.y * block_width + a_ofs.x]));
+      __m128i v_b = _mm_loadl_epi64((const __m128i*)(&c_data[b_ofs.y * block_width + b_ofs.x]));
+
+      __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c));
+
+      // Pick offsets[category] for each of the eight pixels
+      __m256i v_offset = _mm256_permutevar8x32_epi32(v_offsets, v_cat);
+   
+      __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(orig_data[y * block_width + x])));
+      v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c));
+      __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset);
+      __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff));
+      v_accum = _mm256_add_epi32(v_accum, v_temp_sum);
+    }
+
+    //Handle last 6 pixels separately to prevent reading over boundary
+    // NOTE(review): the two unused lanes hold zero pixels, which resolve to
+    // category 0 and add offsets[0]^2 each; harmless only if offsets[0] is 0.
+    const kvz_pixel *c_data = &rec_data[y * block_width + x];
+    const kvz_pixel* a_ptr = &c_data[a_ofs.y * block_width + a_ofs.x];
+    const kvz_pixel* b_ptr = &c_data[b_ofs.y * block_width + b_ofs.x];
+    __m128i v_c = load_6_pixels(c_data);
+    __m128i v_a = load_6_pixels(a_ptr);
+    __m128i v_b = load_6_pixels(b_ptr);
+
+    __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c));
+
+    __m256i v_offset = _mm256_permutevar8x32_epi32(v_offsets, v_cat);
+   
+    const kvz_pixel* orig_ptr = &(orig_data[y * block_width + x]);
+    __m256i v_diff = _mm256_cvtepu8_epi32(load_6_pixels(orig_ptr));
+    v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c));
+
+    __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset);
+    __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff));
+    v_accum = _mm256_add_epi32(v_accum, v_temp_sum);
+  }
+
+  //Full horizontal sum
+  v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1)));
+  sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
+
+  return sum;
+}
+
+
+// For the lanes of *v_cat equal to eo_cat: adds the matching lanes of *v_diff
+// to *v_diff_accum and increases the matching lanes of *v_count by one.
+static INLINE void accum_count_eo_cat_avx2(__m256i*  __restrict v_diff_accum, __m256i* __restrict v_count, __m256i* __restrict v_cat, __m256i* __restrict v_diff, int eo_cat){
+  const __m256i v_wanted = _mm256_set1_epi32(eo_cat);
+  const __m256i v_is_cat = _mm256_cmpeq_epi32(*v_cat, v_wanted);
+
+  // Non-matching lanes are zeroed before being added
+  const __m256i v_selected = _mm256_and_si256(*v_diff, v_is_cat);
+  *v_diff_accum = _mm256_add_epi32(*v_diff_accum, v_selected);
+
+  // A matching lane is all ones (-1), so subtracting it counts up by one
+  *v_count = _mm256_sub_epi32(*v_count, v_is_cat);
+}
+
+
+#define ACCUM_COUNT_EO_CAT_AVX2(EO_CAT, V_CAT) \
+  \
+  accum_count_eo_cat_avx2(&(v_diff_accum[ EO_CAT ]), &(v_count[ EO_CAT ]), &V_CAT , &v_diff, EO_CAT); // uses v_diff_accum, v_count and v_diff from the calling scope; the expansion already ends with ';'
+
+
+void kvz_calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+                              int eo_class, int block_width, int block_height,
+                              int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
+{
+  int y, x;
+  vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0];
+  vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1];
+
+  // Don't sample the edge pixels because this function doesn't have access to
+  // their neighbours.
+
+  __m256i v_diff_accum[NUM_SAO_EDGE_CATEGORIES] = { { 0 } };
+  __m256i v_count[NUM_SAO_EDGE_CATEGORIES] = { { 0 } };
+
+  for (y = 1; y < block_height - 1; ++y) {
+
+    //Calculation for 8 pixels per round
+    for (x = 1; x < block_width - 8; x += 8) {
+      const kvz_pixel *c_data = &rec_data[y * block_width + x];
+
+      __m128i v_c_data = _mm_loadl_epi64((__m128i* __restrict)c_data);
+      __m128i v_a = _mm_loadl_epi64((__m128i* __restrict)(&c_data[a_ofs.y * block_width + a_ofs.x]));
+      __m128i v_c = v_c_data;
+      __m128i v_b = _mm_loadl_epi64((__m128i* __restrict)(&c_data[b_ofs.y * block_width + b_ofs.x]));
+
+      __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c));
+
+      __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i* __restrict)&(orig_data[y * block_width + x])));
+      v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c));
+
+      //Accumulate differences and occurrences for each category
+      ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT0, v_cat);
+      ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT1, v_cat);
+      ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT2, v_cat);
+      ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT3, v_cat);
+      ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT4, v_cat);
+    }
+
+    //Handle last 6 pixels separately to prevent reading over boundary
+    const kvz_pixel *c_data = &rec_data[y * block_width + x];
+    __m128i v_c_data = load_6_pixels(c_data);
+    const kvz_pixel* a_ptr = &c_data[a_ofs.y * block_width + a_ofs.x];
+    const kvz_pixel* b_ptr = &c_data[b_ofs.y * block_width + b_ofs.x];
+    __m128i v_a = load_6_pixels(a_ptr);
+    __m128i v_c = v_c_data;
+    __m128i v_b = load_6_pixels(b_ptr);

kvazaar-1.0.0.tar.gz/src/strategies/avx2/sao-avx2.h Added

@@ -0,0 +1,34 @@
+#ifndef STRATEGIES_SAO_AVX2_H_
+#define STRATEGIES_SAO_AVX2_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * AVX2 implementations of optimized functions.
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+
+int kvz_strategy_register_sao_avx2(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_SAO_AVX2_H_

kvazaar-0.8.3.tar.gz/src/strategies/generic/dct-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/dct-generic.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/generic/dct-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/dct-generic.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/generic/intra-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/intra-generic.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/generic/intra-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/intra-generic.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/generic/ipol-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/ipol-generic.c Changed

@@ -18,12 +18,15 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include <stdlib.h>
+#include "strategies/generic/ipol-generic.h"
+
+#include <stdio.h>
+#include <string.h>
 
-#include "ipol-generic.h"
-#include "strategyselector.h"
 #include "encoder.h"
-#include "picture-generic.h"
+#include "strategies/generic/picture-generic.h"
+#include "strategies/strategies-ipol.h"
+#include "strategyselector.h"
 
 extern int8_t kvz_g_luma_filter[4][8];
 extern int8_t kvz_g_chroma_filter[8][4];
@@ -410,6 +413,219 @@
   }
 }
 
+void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered)
+{ /* Fills filtered[HPEL_POS_HOR] and filtered[HPEL_POS_VER], each a
+   * (width + 1) x (height + 1) block stored with a row stride of LCU_WIDTH + 1.
+   * src must be readable FILTER_OFFSET rows above and FILTER_OFFSET samples
+   * left of the given pointer, because the first pass starts reading there.
+   * 'encoder' is not referenced in this function.
+   * NOTE(review): nothing here checks that (width + 1) * temp_stride fits in
+   * the flipped* scratch arrays -- presumably callers never exceed the LCU
+   * size; confirm. */
+  int x, y;
+  int16_t shift1 = KVZ_BIT_DEPTH - 8; // right shift applied to every first-pass result
+  int32_t shift2 = 6;
+  int32_t shift3 = 14 - KVZ_BIT_DEPTH;
+  int32_t offset23 = 1 << (shift2 + shift3 - 1); // rounding term for the combined shift2 + shift3 right shift
+
+  int8_t *fir0 = kvz_g_luma_filter[0]; // presumably the integer-position taps -- confirm against the table
+  int8_t *fir2 = kvz_g_luma_filter[2]; // presumably the half-sample taps -- confirm against the table
+
+  int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir0, stored transposed
+  int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir2, stored transposed
+
+  int16_t temp_stride = height + KVZ_EXT_PADDING + 1; // entries per column in the transposed scratch buffers
+  int16_t dst_stride = (LCU_WIDTH + 1);
+
+  /* First pass: filter along the rows of src. Results are written transposed
+   * (index = x * temp_stride + y) and cover KVZ_EXT_PADDING extra rows, which
+   * the second pass consumes. */
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1;
+    }
+  }
+
+  /* Second pass: the 16-bit row filter runs along y of the transposed data
+   * (i.e. vertically in picture space) and the output is written row-major.
+   *   HPEL_POS_HOR: first pass fir2, second pass fir0
+   *   HPEL_POS_VER: first pass fir0, second pass fir2 */
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + 1; ++y) {
+      filtered[HPEL_POS_HOR][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[HPEL_POS_VER][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+    }
+  }
+}
+
+void kvz_filter_hpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered)
+{ /* Fills filtered[HPEL_POS_HOR], filtered[HPEL_POS_VER] and
+   * filtered[HPEL_POS_DIA], each a (width + 1) x (height + 1) block stored
+   * with a row stride of LCU_WIDTH + 1.
+   * src must be readable FILTER_OFFSET rows above and FILTER_OFFSET samples
+   * left of the given pointer, because the first pass starts reading there.
+   * 'encoder' is not referenced in this function.
+   * NOTE(review): nothing here checks that (width + 1) * temp_stride fits in
+   * the flipped* scratch arrays -- presumably callers never exceed the LCU
+   * size; confirm. */
+  int x, y;
+  int16_t shift1 = KVZ_BIT_DEPTH - 8; // right shift applied to every first-pass result
+  int32_t shift2 = 6;
+  int32_t shift3 = 14 - KVZ_BIT_DEPTH;
+  int32_t offset23 = 1 << (shift2 + shift3 - 1); // rounding term for the combined shift2 + shift3 right shift
+
+  int8_t *fir0 = kvz_g_luma_filter[0]; // presumably the integer-position taps -- confirm against the table
+  int8_t *fir2 = kvz_g_luma_filter[2]; // presumably the half-sample taps -- confirm against the table
+
+  int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir0, stored transposed
+  int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir2, stored transposed
+
+  int16_t temp_stride = height + KVZ_EXT_PADDING + 1; // entries per column in the transposed scratch buffers
+  int16_t dst_stride = (LCU_WIDTH + 1);
+
+  /* First pass: filter along the rows of src. Results are written transposed
+   * (index = x * temp_stride + y) and cover KVZ_EXT_PADDING extra rows, which
+   * the second pass consumes. */
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1;
+    }
+  }
+
+  /* Second pass: the 16-bit row filter runs along y of the transposed data
+   * (i.e. vertically in picture space) and the output is written row-major.
+   *   HPEL_POS_HOR: first pass fir2, second pass fir0
+   *   HPEL_POS_VER: first pass fir0, second pass fir2
+   *   HPEL_POS_DIA: first pass fir2, second pass fir2 */
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + 1; ++y) {
+      filtered[HPEL_POS_HOR][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[HPEL_POS_VER][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[HPEL_POS_DIA][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+    }
+  }
+}
+
+void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered)
+{ /* Fills planes 0..10 of 'filtered', each a (width + 1) x (height + 1) block
+   * stored with a row stride of LCU_WIDTH + 1. Every plane is one combination
+   * of a first-pass (row) filter and a second-pass (column) filter taken from
+   * kvz_g_luma_filter[0..3].
+   * src must be readable FILTER_OFFSET rows above and FILTER_OFFSET samples
+   * left of the given pointer, because the first pass starts reading there.
+   * 'encoder' is not referenced in this function.
+   * NOTE(review): nothing here checks that (width + 1) * temp_stride fits in
+   * the flipped* scratch arrays -- presumably callers never exceed the LCU
+   * size; confirm. */
+  int x, y;
+  int16_t shift1 = KVZ_BIT_DEPTH - 8; // right shift applied to every first-pass result
+  int32_t shift2 = 6;
+  int32_t shift3 = 14 - KVZ_BIT_DEPTH;
+  int32_t offset23 = 1 << (shift2 + shift3 - 1); // rounding term for the combined shift2 + shift3 right shift
+
+  int8_t *fir0 = kvz_g_luma_filter[0];
+  int8_t *fir2 = kvz_g_luma_filter[2];
+  int8_t *fir1 = kvz_g_luma_filter[1];
+  int8_t *fir3 = kvz_g_luma_filter[3];
+
+  int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir0, stored transposed
+  int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir2, stored transposed
+  int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir1, stored transposed
+  int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; // first-pass output of fir3, stored transposed
+
+  int16_t temp_stride = height + KVZ_EXT_PADDING + 1; // entries per column in the transposed scratch buffers
+  int16_t dst_stride = (LCU_WIDTH + 1);
+  
+  /* First pass: filter along the rows of src with all four filters. Results
+   * are written transposed (index = x * temp_stride + y) and cover
+   * KVZ_EXT_PADDING extra rows, which the second pass consumes. */
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1;
+    }
+  }
+
+  /* Second pass: the 16-bit row filter runs along y of the transposed data
+   * (i.e. vertically in picture space) and the output is written row-major. */
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + 1; ++y) {
+      
+      /* Planes 0..2 use the same filter pairs as HPEL_POS_HOR / VER / DIA in the
+       * hpel functions (first pass / second pass: fir2/fir0, fir0/fir2,
+       * fir2/fir2); presumably the indices match those enumerators -- confirm. */
+      filtered[ 0][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 1][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 2][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      
+      // QPEL planes 3..10
+      // Planes 3..6: first pass is fir1 or fir3; second pass is fir0 (planes 3, 4) or fir2 (planes 5, 6)
+      filtered[ 3][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 4][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 5][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 6][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+
+      // Planes 7..10: second pass is fir1 (planes 7, 8) or fir3 (planes 9, 10); first pass is fir0 (planes 7, 9) or fir2 (planes 8, 10)
+      filtered[ 7][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 8][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 9][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[10][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+    }
+  }
+}
+
+void kvz_filter_qpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered)
+{
+  int x, y;
+  int16_t shift1 = KVZ_BIT_DEPTH - 8;
+  int32_t shift2 = 6;
+  int32_t shift3 = 14 - KVZ_BIT_DEPTH;
+  int32_t offset23 = 1 << (shift2 + shift3 - 1);
+
+  int8_t *fir0 = kvz_g_luma_filter[0];
+  int8_t *fir2 = kvz_g_luma_filter[2];
+  int8_t *fir1 = kvz_g_luma_filter[1];
+  int8_t *fir3 = kvz_g_luma_filter[3];
+
+  int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)];
+  int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)];
+  int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)];
+  int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)];
+
+  int16_t temp_stride = height + KVZ_EXT_PADDING + 1;
+  int16_t dst_stride = (LCU_WIDTH + 1);
+  
+  // Horizontal positions
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1;
+      flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1;
+    }
+  }
+
+  // Filter vertically and flip x and y
+  for (x = 0; x < width + 1; ++x) {
+    for (y = 0; y < height + 1; ++y) {
+      
+      // HPEL
+      filtered[ 0][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 1][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      filtered[ 2][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3);
+      
+      // QPEL
+      // Horizontal
+      filtered[ 3][y * dst_stride + x]  = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3);

kvazaar-0.8.3.tar.gz/src/strategies/generic/ipol-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/ipol-generic.h Changed

@@ -26,16 +26,13 @@
  * Generic C implementations of optimized functions.
  */
 
-#include "global.h"
-
 #include "encoder.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
 
 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
-
-//TODO: create strategies from sample functions
 void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 
-void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+
 #endif //STRATEGIES_IPOL_GENERIC_H_

kvazaar-0.8.3.tar.gz/src/strategies/generic/nal-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/nal-generic.c Changed

@@ -18,12 +18,29 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include <stdlib.h>
-#include <assert.h>
+#include "strategies/generic/nal-generic.h"
 
-#include "strategyselector.h"
+#include "extras/libmd5.h"
+#include "kvazaar.h"
 #include "nal.h"
+#include "strategyselector.h"
+
+
+/**
+ * \brief  Compute the MD5 digest of a picture plane.
+ *
+ * Only the width * height samples of the plane are hashed, in raster order.
+ * When the rows are not contiguous (stride != width) the plane is fed to the
+ * hash one row at a time, so samples between the end of one row and the
+ * start of the next never enter the digest.
+ *
+ * \param data          Pointer to the first sample of the plane.
+ * \param height        Number of rows.
+ * \param width         Number of samples per row.
+ * \param stride        Distance between the starts of two rows, in samples.
+ * \param checksum_out  Receives the digest written by kvz_md5_final.
+ * \param bitdepth      Not used; each sample is hashed as sizeof(kvz_pixel) bytes.
+ */
+static void array_md5_generic(const kvz_pixel* data,
+                              const int height, const int width,
+                              const int stride,
+                              unsigned char checksum_out[SEI_HASH_MAX_LENGTH], const uint8_t bitdepth)
+{
+  assert(SEI_HASH_MAX_LENGTH >= 16);
 
+  context_md5_t md5_ctx;
+  kvz_md5_init(&md5_ctx);
+
+  if (stride == width) {
+    // Rows are contiguous: hash the whole plane with a single call.
+    const unsigned bytes = width * height * sizeof(kvz_pixel);
+    kvz_md5_update(&md5_ctx, (const unsigned char *)data, bytes);
+  } else {
+    // Rows are separated by padding: hash row by row and skip the padding.
+    const unsigned row_bytes = width * sizeof(kvz_pixel);
+    for (int y = 0; y < height; ++y) {
+      kvz_md5_update(&md5_ctx, (const unsigned char *)&data[y * stride], row_bytes);
+    }
+  }
+
+  kvz_md5_final(checksum_out, &md5_ctx);
+}
 
 static void array_checksum_generic(const kvz_pixel* data,
                                    const int height, const int width,
@@ -150,6 +167,7 @@
 int kvz_strategy_register_nal_generic(void* opaque, uint8_t bitdepth) {
   bool success = true;
 
+  success &= kvz_strategyselector_register(opaque, "array_md5", "generic", 0, &array_md5_generic);
   success &= kvz_strategyselector_register(opaque, "array_checksum", "generic", 0, &array_checksum_generic);
   success &= kvz_strategyselector_register(opaque, "array_checksum", "generic4", 1, &array_checksum_generic4);
   success &= kvz_strategyselector_register(opaque, "array_checksum", "generic8", 2, &array_checksum_generic8);

kvazaar-0.8.3.tar.gz/src/strategies/generic/nal-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/nal-generic.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/picture-generic.c Changed

@@ -18,8 +18,11 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "strategies/generic/picture-generic.h"
+
 #include <stdlib.h>
 
+#include "strategies/strategies-picture.h"
 #include "strategyselector.h"
 
 // Function to clip int16_t to pixel. (0-255 or 0-1023)
@@ -95,19 +98,13 @@
   return sad;
 }
 
-
 /**
- * \brief  Calculate SATD between two 4x4 blocks inside bigger arrays.
+ * \brief  Transform differences between two 4x4 blocks.
  * From HM 13.0
  */
-static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
+static int32_t hadamard_4x4_generic(int32_t diff[4*4])
 {
-  int32_t k, satd = 0, diff[16], m[16], d[16];
-  for (k = 0; k < 16; ++k) {
-    diff[k] = piOrg[k] - piCur[k];
-  }
-
-  /*===== hadamard transform =====*/
+  int32_t m[4 * 4];
   m[0] = diff[0] + diff[12];
   m[1] = diff[1] + diff[13];
   m[2] = diff[2] + diff[14];
@@ -125,6 +122,7 @@
   m[14] = diff[2] - diff[14];
   m[15] = diff[3] - diff[15];
 
+  int32_t d[4 * 4];
   d[0] = m[0] + m[4];
   d[1] = m[1] + m[5];
   d[2] = m[2] + m[6];
@@ -176,8 +174,9 @@
   d[14] = m[14] + m[15];
   d[15] = m[15] - m[14];
 
-  for (k = 0; k<16; ++k) {
-    satd += abs(d[k]);
+  int32_t satd = 0;
+  for (int i = 0; i < 16; i++) {
+    satd += abs(d[i]);
   }
   satd = ((satd + 1) >> 1);
 
@@ -185,6 +184,57 @@
 }
 
 /**
+ * \brief  SATD of two 4x4 blocks, each stored as 16 consecutive samples.
+ *
+ * \param piOrg  First block (16 samples, no stride).
+ * \param piCur  Second block (16 samples, no stride).
+ * \return  Value returned by hadamard_4x4_generic for the sample differences.
+ */
+static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
+{
+  int32_t residual[4 * 4];
+  const kvz_pixel *org = piOrg;
+  const kvz_pixel *cur = piCur;
+
+  // Element-wise difference, walked with pointers instead of an index.
+  for (int32_t *out = residual; out != residual + 4 * 4; ++out) {
+    *out = *org++ - *cur++;
+  }
+
+  return hadamard_4x4_generic(residual);
+}
+
+/**
+ * \brief  SATD of two 4x4 blocks that are located inside larger buffers.
+ *
+ * \param buf1     Top-left sample of the first block.
+ * \param stride1  Row stride of buf1, in samples.
+ * \param buf2     Top-left sample of the second block.
+ * \param stride2  Row stride of buf2, in samples.
+ * \return  Value returned by hadamard_4x4_generic for buf1 - buf2.
+ */
+unsigned kvz_satd_4x4_subblock_generic(const kvz_pixel * buf1,
+                                       const int32_t     stride1,
+                                       const kvz_pixel * buf2,
+                                       const int32_t     stride2)
+{
+  int32_t residual[4 * 4];
+  int32_t *out = residual;
+
+  for (int row = 0; row < 4; ++row) {
+    const kvz_pixel *line1 = &buf1[row * stride1];
+    const kvz_pixel *line2 = &buf2[row * stride2];
+    for (int col = 0; col < 4; ++col) {
+      *out++ = line1[col] - line2[col];
+    }
+  }
+
+  return hadamard_4x4_generic(residual);
+}
+
+/**
+ * \brief  SATD of one 4x4 original block against four candidate blocks.
+ *
+ * \param preds        Top-left samples of the four candidate blocks.
+ * \param strides      Row stride of each candidate, in samples.
+ * \param orig         Top-left sample of the original block.
+ * \param orig_stride  Row stride of orig, in samples.
+ * \param costs        Receives one hadamard_4x4_generic result per candidate.
+ */
+void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4],
+                                       const int strides[4],
+                                       const kvz_pixel *orig,
+                                       const int orig_stride,
+                                       unsigned costs[4])
+{
+  int32_t residual[4][4 * 4];
+
+  // Gather all four difference blocks before any cost is written.
+  for (int k = 0; k < 4; ++k) {
+    for (int row = 0; row < 4; ++row) {
+      for (int col = 0; col < 4; ++col) {
+        residual[k][col + row * 4] = orig[col + row * orig_stride] - preds[k][col + row * strides[k]];
+      }
+    }
+  }
+
+  for (int k = 0; k < 4; ++k) {
+    costs[k] = hadamard_4x4_generic(residual[k]);
+  }
+}
+
+/**
 * \brief  Calculate SATD between two 8x8 blocks inside bigger arrays.
 */
 static unsigned satd_8x8_subblock_generic(const kvz_pixel * piOrg, const int32_t iStrideOrg,
@@ -277,6 +327,18 @@
   return sad;
 }
 
+/**
+ * \brief  SATD of one 8x8 original block against four candidate blocks.
+ *
+ * Calls satd_8x8_subblock_generic once per candidate, in index order.
+ *
+ * \param preds        Top-left samples of the four candidate blocks.
+ * \param strides      Row stride of each candidate, in samples.
+ * \param orig         Top-left sample of the original block.
+ * \param orig_stride  Row stride of orig, in samples.
+ * \param costs        Receives one cost per candidate.
+ */
+static void satd_8x8_subblock_quad_generic(const kvz_pixel **preds,
+                                       const int *strides,
+                                       const kvz_pixel *orig,
+                                       const int orig_stride,
+                                       unsigned *costs)
+{
+  for (int k = 0; k < 4; ++k) {
+    costs[k] = satd_8x8_subblock_generic(orig, orig_stride, preds[k], strides[k]);
+  }
+}
+
 // These macros define satd_16bit_NxN for N = 8, 16, 32, 64
 SATD_NxN(generic,  8)
 SATD_NxN(generic, 16)
@@ -327,6 +389,72 @@
 SATD_DUAL_NXN(32, kvz_pixel)
 SATD_DUAL_NXN(64, kvz_pixel)
 
+#define SATD_ANY_SIZE_MULTI_GENERIC(suffix, num_parallel_blocks) /* Declares and defines satd_any_size_<suffix>(): SATD of 'orig' against several predictions in one call. */ \
+  static cost_pixel_any_size_multi_func satd_any_size_## suffix; \
+  static void satd_any_size_ ## suffix ( \
+      int width, int height, \
+      const kvz_pixel **preds, \
+      const int *strides, \
+      const kvz_pixel *orig, \
+      const int orig_stride, \
+      unsigned num_modes, /* not read in this body */ \
+      unsigned *costs_out, \
+      int8_t *valid) /* not read in this body */ \
+  { \
+    unsigned sums[num_parallel_blocks] = { 0 }; \
+    const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
+    const kvz_pixel *orig_ptr = orig; \
+    costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; /* four entries are hard-coded here, independent of num_parallel_blocks */ \
+    if (width % 8 != 0) { \
+      /* Process the first column using 4x4 blocks. NOTE(review): every iteration passes the same preds/orig (y is not used) and sums is not added to costs_out in this branch -- confirm intent. */ \
+      for (int y = 0; y < height; y += 4) { \
+        kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \
+            } \
+      orig_ptr += 4; \
+      for(int blk = 0; blk < num_parallel_blocks; ++blk){\
+        pred_ptrs[blk] += 4; \
+            }\
+      width -= 4; \
+            } \
+    if (height % 8 != 0) { \
+      /* Process the first row using 4x4 blocks. NOTE(review): every iteration passes the same pred_ptrs/orig_ptr (x is not used) and sums is not added to costs_out in this branch -- confirm intent. */ \
+      for (int x = 0; x < width; x += 4 ) { \
+        kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \
+            } \
+      orig_ptr += 4 * orig_stride; \
+      for(int blk = 0; blk < num_parallel_blocks; ++blk){\
+        pred_ptrs[blk] += 4 * strides[blk]; \
+            }\
+      height -= 4; \
+        } \
+    /* The rest can now be processed with 8x8 blocks. NOTE(review): orig_ptr and pred_ptrs are re-derived from orig/preds below, which drops the 4-sample column/row advance applied above -- confirm. */ \
+    for (int y = 0; y < height; y += 8) { \
+      orig_ptr = &orig[y * orig_stride]; \
+      pred_ptrs[0] = &preds[0][y * strides[0]]; \
+      pred_ptrs[1] = &preds[1][y * strides[1]]; \
+      pred_ptrs[2] = &preds[2][y * strides[2]]; \
+      pred_ptrs[3] = &preds[3][y * strides[3]]; \
+      for (int x = 0; x < width; x += 8) { \
+        satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \
+        orig_ptr += 8; \
+        pred_ptrs[0] += 8; \
+        pred_ptrs[1] += 8; \
+        pred_ptrs[2] += 8; \
+        pred_ptrs[3] += 8; \
+        costs_out[0] += sums[0]; /* only the 8x8 results are accumulated into costs_out */ \
+        costs_out[1] += sums[1]; \
+        costs_out[2] += sums[2]; \
+        costs_out[3] += sums[3]; \
+      } \
+    } \
+    for(int i = 0; i < num_parallel_blocks; ++i){\
+      costs_out[i] = costs_out[i] >> (KVZ_BIT_DEPTH - 8); /* scale the totals down by KVZ_BIT_DEPTH - 8 bits */ \
+    } \
+    return; \
+  }
+

kvazaar-0.8.3.tar.gz/src/strategies/generic/picture-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/picture-generic.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/quant-generic.c Changed

@@ -18,13 +18,16 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "strategies/generic/quant-generic.h"
+
 #include <stdlib.h>
 
-#include "quant-generic.h"
-#include "strategyselector.h"
 #include "encoder.h"
-#include "transform.h"
 #include "rdo.h"
+#include "scalinglist.h"
+#include "strategies/strategies-quant.h"
+#include "strategyselector.h"
+#include "transform.h"
 
 #define QUANT_SHIFT 14
 /**
@@ -38,13 +41,13 @@
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6);
   const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
   const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6];
   const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform
   const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift;
-  const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
+  const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
   const int32_t q_bits8 = q_bits - 8;
 
   uint32_t ac_sum = 0;
@@ -210,7 +213,7 @@
   }
 
   // Quantize coeffs. (coeff -> quant_coeff)
-  if (state->encoder_control->rdoq_enable) {
+  if (state->encoder_control->rdoq_enable && (width > 4 || !state->encoder_control->cfg->rdoq_skip)) {
     int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
     tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
     kvz_rdoq(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
@@ -283,7 +286,7 @@
   int32_t n;
   int32_t transform_shift = 15 - encoder->bitdepth - (kvz_g_convert_to_bit[ width ] + 2);
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth-8)*6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth-8)*6);
 
   shift = 20 - QUANT_SHIFT - transform_shift;

kvazaar-0.8.3.tar.gz/src/strategies/generic/quant-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/quant-generic.h Changed

kvazaar-1.0.0.tar.gz/src/strategies/generic/sao-generic.c Added

@@ -0,0 +1,184 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "strategies/generic/sao-generic.h"
+
+#include "cu.h"
+#include "encoder.h"
+#include "encoderstate.h"
+#include "kvazaar.h"
+#include "sao.h"
+#include "strategyselector.h"
+
+
+/* Classify sample c against its two neighbours a and b and return the
+ * category stored in the lookup table below.
+ * The table index is 2 + SIGN3(c - a) + SIGN3(c - b); presumably SIGN3 yields
+ * -1, 0 or 1, which keeps the index within 0..4 -- confirm against the macro. */
+static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c)
+{
+  // Lookup from eo_idx (0..4) to category: 0 -> 1, 1 -> 2, 2 -> 0, 3 -> 3, 4 -> 4.
+  static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
+
+  int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); // casts make the subtraction explicitly signed
+
+  return sao_eo_idx_to_eo_category[eo_idx];
+}
+
+
+/**
+ * \brief  Change in squared error if the given edge offsets were applied.
+ *
+ * Only interior samples are visited (the outermost row and column on each
+ * side are skipped). Both buffers are indexed with block_width as stride.
+ *
+ * \param orig_data     Original samples.
+ * \param rec_data      Reconstructed samples.
+ * \param block_width   Block width, also used as the row stride.
+ * \param block_height  Block height.
+ * \param eo_class      Index into g_sao_edge_offsets selecting the neighbour pair.
+ * \param offsets       Offset per category, indexed by sao_calc_eo_cat's result.
+ * \return  Sum over visited samples of (diff - offset)^2 - diff^2.
+ */
+int kvz_sao_edge_ddistortion_generic(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+                         int block_width, int block_height,
+                         int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES])
+{
+  const vector2d_t ofs_a = g_sao_edge_offsets[eo_class][0];
+  const vector2d_t ofs_b = g_sao_edge_offsets[eo_class][1];
+  int ddist = 0;
+
+  for (int row = 1; row < block_height - 1; ++row) {
+    for (int col = 1; col < block_width - 1; ++col) {
+      const int pos = row * block_width + col;
+      const kvz_pixel *center = &rec_data[pos];
+      const kvz_pixel left = center[ofs_a.y * block_width + ofs_a.x];
+      const kvz_pixel cur = center[0];
+      const kvz_pixel right = center[ofs_b.y * block_width + ofs_b.x];
+
+      const int offset = offsets[sao_calc_eo_cat(left, right, cur)];
+      if (offset == 0) {
+        continue;
+      }
+
+      // The offset is applied to the reconstruction, so it is subtracted from diff.
+      const int diff = orig_data[pos] - cur;
+      ddist += (diff - offset) * (diff - offset) - diff * diff;
+    }
+  }
+
+  return ddist;
+}
+
+
+/**
+ * \brief  Accumulate per-category difference sums and sample counts for one
+ *         edge-offset class.
+ *
+ * Both buffers are indexed with block_width as their row stride, and only
+ * interior samples are visited.
+ *
+ * \param orig_data     Original pixel data. 64x64 for luma, 32x32 for chroma.
+ * \param rec_data      Reconstructed pixel data. 64x64 for luma, 32x32 for chroma.
+ * \param eo_class      Index into g_sao_edge_offsets selecting the neighbour pair.
+ * \param block_width   Block width, also used as the row stride.
+ * \param block_height  Block height.
+ * \param cat_sum_cnt   cat_sum_cnt[0][cat] accumulates orig - rec and
+ *                      cat_sum_cnt[1][cat] counts samples. The array is only
+ *                      added to here, never cleared.
+ */
+void kvz_calc_sao_edge_dir_generic(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+                              int eo_class, int block_width, int block_height,
+                              int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
+{
+  int y, x;
+  vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0];
+  vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1];
+  // Arrays orig_data and rec_data are quarter size for chroma.
+
+  // Don't sample the edge pixels because this function doesn't have access to
+  // their neighbours.
+  for (y = 1; y < block_height - 1; ++y) {
+    for (x = 1; x < block_width - 1; ++x) {
+      const kvz_pixel *c_data = &rec_data[y * block_width + x];
+      kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
+      kvz_pixel c = c_data[0];
+      kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
+
+      int eo_cat = sao_calc_eo_cat(a, b, c);
+
+      cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c;
+      cat_sum_cnt[1][eo_cat] += 1;
+    }
+  }
+}
+
+
+void kvz_sao_reconstruct_color_generic(const encoder_control_t * const encoder, 
+                                  const kvz_pixel *rec_data, kvz_pixel *new_rec_data,
+                                  const sao_info_t *sao,
+                                  int stride, int new_stride,
+                                  int block_width, int block_height,
+                                  color_t color_i)
+{
+  int y, x;
+  /* Writes the SAO-processed version of the block_width x block_height block
+   * at rec_data (row stride 'stride') to new_rec_data (row stride
+   * 'new_stride'). For COLOR_V the edge offsets are read starting at index 5
+   * of sao->offsets, otherwise at index 0. */
+  int offset_v = color_i == COLOR_V ? 5 : 0;
+
+  if(sao->type == SAO_TYPE_BAND) {
+    int offsets[1<<KVZ_BIT_DEPTH]; // one entry per possible sample value; filled by kvz_calc_sao_offset_array and used as a lookup table below
+    kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
+    for (y = 0; y < block_height; ++y) {
+      for (x = 0; x < block_width; ++x) {
+        new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
+      }
+    }
+  } else {
+    // NOTE(review): this loop visits every pixel including the block edges, so the neighbour reads below
+    // reach outside the block; presumably the caller provides a readable border -- confirm.
+    for (y = 0; y < block_height; ++y) {
+      for (x = 0; x < block_width; ++x) {
+        vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
+        vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
+        const kvz_pixel *c_data = &rec_data[y * stride + x];
+        kvz_pixel *new_data = &new_rec_data[y * new_stride + x];
+        kvz_pixel a = c_data[a_ofs.y * stride + a_ofs.x];
+        kvz_pixel c = c_data[0];
+        kvz_pixel b = c_data[b_ofs.y * stride + b_ofs.x];
+
+        int eo_cat = sao_calc_eo_cat(a, b, c);
+
+        new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]); // clip to the valid sample range
+      }
+    }
+  }
+}
+
+
+/**
+ * \brief  Change in squared error if the given band offsets were applied.
+ *
+ * Both buffers are indexed with block_width as their row stride.
+ *
+ * \param state         Encoder state; only encoder_control->bitdepth is read.
+ * \param orig_data     Original samples.
+ * \param rec_data      Reconstructed samples.
+ * \param block_width   Block width, also used as the row stride.
+ * \param block_height  Block height.
+ * \param band_pos      Index of the first of the four consecutive bands.
+ * \param sao_bands     Offsets for the four bands starting at band_pos.
+ * \return  Sum over all samples of (diff - offset)^2 - diff^2.
+ */
+int kvz_sao_band_ddistortion_generic(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+                         int block_width, int block_height,
+                         int band_pos, int sao_bands[4])
+{
+  const int shift = state->encoder_control->bitdepth-5;
+  int ddist = 0;
+
+  for (int row = 0; row < block_height; ++row) {
+    for (int col = 0; col < block_width; ++col) {
+      const int pos = row * block_width + col;
+      const int band = (rec_data[pos] >> shift) - band_pos;
+
+      // Samples outside the four signalled bands get no offset.
+      if (band < 0 || band >= 4) {
+        continue;
+      }
+      const int offset = sao_bands[band];
+      if (offset == 0) {
+        continue;
+      }
+
+      // The offset is applied to the reconstruction, so it is subtracted from diff.
+      const int diff = orig_data[pos] - rec_data[pos];
+      ddist += (diff - offset) * (diff - offset) - diff * diff;
+    }
+  }
+
+  return ddist;
+}
+
+
+int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth)
+{
+  bool success = true; // AND-ed with the result of every registration call below
+  /* Registers the four generic SAO functions of this file under the strategy
+   * name "generic". The fourth argument is 0 for all of them (presumably a
+   * priority -- confirm against kvz_strategyselector_register). 'bitdepth'
+   * is not read here. */
+  success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "generic", 0, &kvz_sao_edge_ddistortion_generic);
+  success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "generic", 0, &kvz_calc_sao_edge_dir_generic);
+  success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "generic", 0, &kvz_sao_reconstruct_color_generic);
+  success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "generic", 0, &kvz_sao_band_ddistortion_generic);
+
+  return success;
+}

kvazaar-1.0.0.tar.gz/src/strategies/generic/sao-generic.h Added

@@ -0,0 +1,33 @@
+#ifndef STRATEGIES_SAO_GENERIC_H_
+#define STRATEGIES_SAO_GENERIC_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Generic C implementations of optimized functions.
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_SAO_GENERIC_H_

kvazaar-0.8.3.tar.gz/src/strategies/sse2/picture-sse2.c -> kvazaar-1.0.0.tar.gz/src/strategies/sse2/picture-sse2.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/sse2/picture-sse2.h -> kvazaar-1.0.0.tar.gz/src/strategies/sse2/picture-sse2.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/sse41/picture-sse41.c -> kvazaar-1.0.0.tar.gz/src/strategies/sse41/picture-sse41.c Changed

@@ -18,18 +18,18 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include "picture-sse41.h"
-#include "strategyselector.h"
+#include "strategies/sse41/picture-sse41.h"
 
 #if COMPILE_INTEL_SSE41
-#  include "image.h"
-#  include <immintrin.h>
-#  include <assert.h>
-#  include <stdlib.h>
+#include <immintrin.h>
+#include <stdlib.h>
+
+#include "kvazaar.h"
+#include "strategyselector.h"
 
 
-static unsigned reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
-                        const int width, const int height, const unsigned stride1, const unsigned stride2)
+unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                           const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
   int y, x;
   unsigned sad = 0;
@@ -94,7 +94,7 @@
   bool success = true;
 #if COMPILE_INTEL_SSE41
   if (bitdepth == 8){
-    success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &reg_sad_sse41);
+    success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41);
   }
 #endif
   return success;

kvazaar-0.8.3.tar.gz/src/strategies/sse41/picture-sse41.h -> kvazaar-1.0.0.tar.gz/src/strategies/sse41/picture-sse41.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-common.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-common.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-dct.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-dct.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-dct.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-dct.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-intra.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-intra.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-intra.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-intra.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-ipol.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-ipol.c Changed

@@ -18,18 +18,23 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include "strategies-ipol.h"
+#include "strategies/strategies-ipol.h"
+
+#include "strategies/avx2/ipol-avx2.h"
+#include "strategies/generic/ipol-generic.h"
 #include "strategyselector.h"
 
+
 // Define function pointers.
 ipol_func *kvz_filter_inter_quarterpel_luma;
 ipol_func *kvz_filter_inter_halfpel_chroma;
 ipol_func *kvz_filter_inter_octpel_chroma;
+ipol_frac_blocks_func *kvz_filter_frac_blocks_luma;
 epol_func *kvz_get_extended_block;
-
-// Headers for platform optimizations.
-#include "generic/ipol-generic.h"
-#include "avx2/ipol-avx2.h"
+kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
+kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
+kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
+kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
 
 
 int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) {
@@ -41,4 +46,4 @@
     success &= kvz_strategy_register_ipol_avx2(opaque, bitdepth);
   }
   return success;
-}
\ No newline at end of file
+}

kvazaar-0.8.3.tar.gz/src/strategies/strategies-ipol.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-ipol.h Changed

@@ -26,26 +26,39 @@
  * Interface for subpixel interpolation functions.
  */
 
-#include "global.h"
-
-#include <stdint.h>
-
 #include "encoder.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "search_inter.h"
+
 
 typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block;
 
 typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst,
   int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
 
+typedef unsigned(ipol_frac_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height,
+  frac_search_block filtered_out[15], int8_t fme_level);
+
 typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
   int filter_size, int width, int height, kvz_extended_block *out);
 
+typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+
+typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 
 // Declare function pointers.
 extern ipol_func * kvz_filter_inter_quarterpel_luma;
 extern ipol_func * kvz_filter_inter_halfpel_chroma;
 extern ipol_func * kvz_filter_inter_octpel_chroma;
+extern ipol_frac_blocks_func *kvz_filter_frac_blocks_luma;
 extern epol_func * kvz_get_extended_block;
+extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
+extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
+extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
+extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
 
 
 int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
@@ -55,6 +68,11 @@
   {"filter_inter_quarterpel_luma", (void**) &kvz_filter_inter_quarterpel_luma}, \
   {"filter_inter_halfpel_chroma", (void**) &kvz_filter_inter_halfpel_chroma}, \
   {"filter_inter_octpel_chroma", (void**) &kvz_filter_inter_octpel_chroma}, \
+  {"filter_frac_blocks_luma", (void**) &kvz_filter_frac_blocks_luma}, \
+  {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \
+  {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \
+  {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
+  {"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \
   {"get_extended_block", (void**) &kvz_get_extended_block}, \

kvazaar-0.8.3.tar.gz/src/strategies/strategies-nal.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-nal.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-nal.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-nal.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-picture.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-picture.c Changed

@@ -18,9 +18,17 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include "strategies-picture.h"
+#include "strategies/strategies-picture.h"
+
+#include "strategies/altivec/picture-altivec.h"
+#include "strategies/avx2/picture-avx2.h"
+#include "strategies/generic/picture-generic.h"
+#include "strategies/sse2/picture-sse2.h"
+#include "strategies/sse41/picture-sse41.h"
+#include "strategies/x86_asm/picture-x86-asm.h"
 #include "strategyselector.h"
 
+
 // Define function pointers.
 reg_sad_func * kvz_reg_sad = 0;
 
@@ -49,17 +57,7 @@
 cost_pixel_nxn_multi_func * kvz_satd_64x64_dual = 0;
 
 cost_pixel_any_size_func * kvz_satd_any_size = 0;
-
-pixels_blit_func * kvz_pixels_blit = 0;
-
-
-// Headers for platform optimizations.
-#include "generic/picture-generic.h"
-#include "sse2/picture-sse2.h"
-#include "sse41/picture-sse41.h"
-#include "avx2/picture-avx2.h"
-#include "altivec/picture-altivec.h"
-#include "x86_asm/picture-x86-asm.h"
+cost_pixel_any_size_multi_func * kvz_satd_any_size_quad = 0;
 
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {

kvazaar-0.8.3.tar.gz/src/strategies/strategies-picture.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-picture.h Changed

@@ -26,9 +26,9 @@
  * Interface for distortion metric functions.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
 
-#include "../image.h"
 
 typedef kvz_pixel (*pred_buffer)[32 * 32];
 
@@ -66,17 +66,38 @@
       const kvz_pixel *block2, int stride2) \
   { \
     unsigned sum = 0; \
+    if (width % 8 != 0) { \
+      /* Process the first column using 4x4 blocks. */ \
+      for (int y = 0; y < height; y += 4) { \
+        sum += kvz_satd_4x4_subblock_ ## suffix(&block1[y * stride1], stride1, \
+                                                &block2[y * stride2], stride2); \
+      } \
+      block1 += 4; \
+      block2 += 4; \
+      width -= 4; \
+    } \
+    if (height % 8 != 0) { \
+      /* Process the first row using 4x4 blocks. */ \
+      for (int x = 0; x < width; x += 4) { \
+        sum += kvz_satd_4x4_subblock_ ## suffix(&block1[x], stride1, \
+                                                &block2[x], stride2); \
+      } \
+      block1 += 4 * stride1; \
+      block2 += 4 * stride2; \
+      height -= 4; \
+    } \
+    /* The rest can now be processed with 8x8 blocks. */ \
     for (int y = 0; y < height; y += 8) { \
       const kvz_pixel *row1 = &block1[y * stride1]; \
       const kvz_pixel *row2 = &block2[y * stride2]; \
       for (int x = 0; x < width; x += 8) { \
-        sum += satd_8x8_subblock_ ## suffix(&row1[x], stride1, &row2[x], stride2); \
+        sum += satd_8x8_subblock_ ## suffix(&row1[x], stride1, \
+                                            &row2[x], stride2); \
       } \
     } \
     return sum >> (KVZ_BIT_DEPTH - 8); \
   }
 
-
 typedef unsigned(reg_sad_func)(const kvz_pixel *const data1, const kvz_pixel *const data2,
   const int width, const int height,
   const unsigned stride1, const unsigned stride2);
@@ -87,10 +108,7 @@
     const kvz_pixel *block2, int stride2
 );
 typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const kvz_pixel *orig, unsigned num_modes, unsigned *costs_out);
-
-typedef void pixels_blit_func(const kvz_pixel* orig, kvz_pixel *dst,
-                         unsigned width, unsigned height,
-                         unsigned orig_stride, unsigned dst_stride);
+typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int *strides, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid);
 
 
 // Declare function pointers.
@@ -121,8 +139,7 @@
 extern cost_pixel_nxn_multi_func * kvz_satd_32x32_dual;
 extern cost_pixel_nxn_multi_func * kvz_satd_64x64_dual;
 
-extern pixels_blit_func * kvz_pixels_blit;
-
+extern cost_pixel_any_size_multi_func *kvz_satd_any_size_quad;
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
 cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
@@ -153,7 +170,7 @@
   {"satd_16x16_dual", (void**) &kvz_satd_16x16_dual}, \
   {"satd_32x32_dual", (void**) &kvz_satd_32x32_dual}, \
   {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \
-  {"pixels_blit", (void**) &kvz_pixels_blit}, \
+  {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \

kvazaar-0.8.3.tar.gz/src/strategies/strategies-quant.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-quant.c Changed

kvazaar-0.8.3.tar.gz/src/strategies/strategies-quant.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-quant.h Changed

kvazaar-1.0.0.tar.gz/src/strategies/strategies-sao.c Added

@@ -0,0 +1,44 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "strategies/strategies-sao.h"
+#include "strategies/avx2/sao-avx2.h"
+#include "strategies/generic/sao-generic.h"
+#include "strategyselector.h"
+
+
+// Define function pointers.
+sao_edge_ddistortion_func * kvz_sao_edge_ddistortion;
+calc_sao_edge_dir_func * kvz_calc_sao_edge_dir;
+sao_reconstruct_color_func * kvz_sao_reconstruct_color;
+sao_band_ddistortion_func * kvz_sao_band_ddistortion;
+
+
+int kvz_strategy_register_sao(void* opaque, uint8_t bitdepth) {
+  bool success = true;
+
+  success &= kvz_strategy_register_sao_generic(opaque, bitdepth);
+
+  if (kvz_g_hardware_flags.intel_flags.avx2) {
+    success &= kvz_strategy_register_sao_avx2(opaque, bitdepth);
+  }
+
+  return success;
+}
\ No newline at end of file

kvazaar-1.0.0.tar.gz/src/strategies/strategies-sao.h Added

@@ -0,0 +1,73 @@
+#ifndef STRATEGIES_SAO_H_
+#define STRATEGIES_SAO_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Interface for sao functions.
+ */
+
+#include "encoder.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "sao.h"
+
+
+// Declare function types.
+typedef int (sao_edge_ddistortion_func)(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+  int block_width, int block_height,
+  int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES]);
+
+typedef void (calc_sao_edge_dir_func)(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+  int eo_class, int block_width, int block_height,
+  int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]);
+
+typedef void (sao_reconstruct_color_func)(const encoder_control_t * const encoder,
+  const kvz_pixel *rec_data, kvz_pixel *new_rec_data,
+  const sao_info_t *sao,
+  int stride, int new_stride,
+  int block_width, int block_height,
+  color_t color_i);
+
+typedef int (sao_band_ddistortion_func)(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data,
+  int block_width, int block_height,
+  int band_pos, int sao_bands[4]);
+
+// Declare function pointers.
+extern sao_edge_ddistortion_func * kvz_sao_edge_ddistortion;
+extern calc_sao_edge_dir_func * kvz_calc_sao_edge_dir;
+extern sao_reconstruct_color_func * kvz_sao_reconstruct_color;
+extern sao_band_ddistortion_func * kvz_sao_band_ddistortion;
+
+int kvz_strategy_register_sao(void* opaque, uint8_t bitdepth);
+
+
+#define STRATEGIES_SAO_EXPORTS \
+  {"sao_edge_ddistortion", (void**) &kvz_sao_edge_ddistortion}, \
+  {"calc_sao_edge_dir", (void**) &kvz_calc_sao_edge_dir}, \
+  {"sao_reconstruct_color", (void**) &kvz_sao_reconstruct_color}, \
+  {"sao_band_ddistortion", (void**) &kvz_sao_band_ddistortion}, \
+
+
+
+#endif //STRATEGIES_SAO_H_

kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.asm -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.asm Changed

@@ -291,3 +291,83 @@
     vmovd eax, m4
 
     RET
+
+
+;KVZ_SAD_32x32_STRIDE
+;Calculates SAD of a 32x32 block inside a frame with stride
+;r0 address of the first value(current)
+;r1 address of the first value(reference)
+;r2 stride
+cglobal sad_32x32_stride, 3, 3, 5
+    vpxor m4, m4
+
+	; Handle 2 lines per iteration
+    %rep 16
+        vmovdqu m0, [r0]
+        vmovdqu m1, [r0 + 16]
+        vmovdqu m2, [r0 + r2]
+        vmovdqu m3, [r0 + r2 + 16]
+        lea r0, [r0 + 2 * r2]
+
+        vpsadbw m0, [r1]
+        vpsadbw m1, [r1 + 16]
+        vpsadbw m2, [r1 + r2]
+        vpsadbw m3, [r1 + r2 + 16]
+        lea r1, [r1 + 2 * r2]
+ 
+        vpaddd m4, m0
+        vpaddd m4, m1
+        vpaddd m4, m2
+        vpaddd m4, m3
+    %endrep
+
+    vmovhlps m0, m4
+    vpaddd m4, m0
+
+    vmovd eax, m4
+
+    RET
+
+
+;KVZ_SAD_64x64_STRIDE
+;Calculates SAD of a 64x64 block inside a frame with stride
+;r0 address of the first value(current)
+;r1 address of the first value(reference)
+;r2 stride
+cglobal sad_64x64_stride, 3, 4, 5
+    vpxor m4, m4 ; sum accumulation register
+	mov r3, 4 ; number of iterations in the loop
+
+Process16Lines:
+	; Intel optimization manual says to not unroll beyond 500 instructions.
+	; Didn't seem to have much of an effect on Ivy Bridge or Haswell, but
+	; smaller is better, when speed is the same, right?
+    %rep 16
+        vmovdqu m0, [r0]
+        vmovdqu m1, [r0 + 1*16]
+        vmovdqu m2, [r0 + 2*16]
+        vmovdqu m3, [r0 + 3*16]
+
+        vpsadbw m0, [r1]
+        vpsadbw m1, [r1 + 1*16]
+        vpsadbw m2, [r1 + 2*16]
+        vpsadbw m3, [r1 + 3*16]
+
+        lea r0, [r0 + r2]
+        lea r1, [r1 + r2]
+ 
+        vpaddd m4, m0
+        vpaddd m4, m1
+        vpaddd m4, m2
+        vpaddd m4, m3
+    %endrep
+
+	dec r3
+	jnz Process16Lines
+
+    vmovhlps m0, m4
+    vpaddd m4, m0
+
+    vmovd eax, m4
+
+    RET

kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.h -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.h -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.h Changed

kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm.c -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.c Changed

@@ -18,13 +18,17 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-#include <stdlib.h>
-#include "strategyselector.h"
+#include "strategies/x86_asm/picture-x86-asm.h"
 
 #if defined(KVZ_COMPILE_ASM)
+#include <stdlib.h>
+
+#include "kvazaar.h"
+#include "strategies/x86_asm/picture-x86-asm-sad.h"
+#include "strategies/x86_asm/picture-x86-asm-satd.h"
+#include "strategies/sse41/picture-sse41.h"
+#include "strategyselector.h"
 
-#include "picture-x86-asm-sad.h"
-#include "picture-x86-asm-satd.h"
 
 static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2)
 {
@@ -36,16 +40,6 @@
   return sad;
 }
 
-static unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
-{
-  unsigned sad = 0;
-  sad += kvz_sad_16x16_stride_avx(data1, data2, stride);
-  sad += kvz_sad_16x16_stride_avx(data1 + 16, data2 + 16, stride);
-  sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride, data2 + 16 * stride, stride);
-  sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
-  return sad;
-}
-
 static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2)
 {
   unsigned sad = 0;
@@ -56,52 +50,50 @@
   return sad;
 }
 
-static unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
+static unsigned kvz_sad_other_avx(const kvz_pixel *data1, const kvz_pixel *data2,
+                                  int width, int height,
+                                  unsigned stride)
 {
   unsigned sad = 0;
-  sad += kvz_sad_32x32_stride_avx(data1, data2, stride);
-  sad += kvz_sad_32x32_stride_avx(data1 + 32, data2 + 32, stride);
-  sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride, data2 + 32 * stride, stride);
-  sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
-  return sad;
-}
-
-static unsigned kvz_sad_other_avx(const kvz_pixel * const data1, const kvz_pixel * const data2,
-  const int width, const int height, const unsigned stride1, const unsigned stride2)
-{
-  int y, x;
-  unsigned sad = 0;
 
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      sad += abs(data1[y * stride + x] - data2[y * stride + x]);
     }
   }
 
   return sad;
 }
 
-static unsigned reg_sad_x86_asm(const kvz_pixel * const data1, const kvz_pixel * const data2,
-const int width, const int height, const unsigned stride1, const unsigned stride2)
+static unsigned reg_sad_x86_asm(const kvz_pixel *data1, const kvz_pixel * data2,
+                                const int width, const int height,
+                                const unsigned stride1, const unsigned stride2)
 {
-  if (width == 4 && height == 4) {
-    return kvz_sad_4x4_stride_avx(data1, data2, stride1);
-  } else if (width == 8 && height == 8) {
-    return kvz_sad_8x8_stride_avx(data1, data2, stride1);
-  } else if (width == 16 && height == 16) {
-    return kvz_sad_16x16_stride_avx(data1, data2, stride1);
-  } else if (width == 32 && height == 32) {
-    return kvz_sad_32x32_stride_avx(data1, data2, stride1);
-  } else if (width == 64 && height == 64) {
-    return kvz_sad_64x64_stride_avx(data1, data2, stride1);
+  if (width == height) {
+    if (width == 8) {
+      return kvz_sad_8x8_stride_avx(data1, data2, stride1);
+    } else if (width == 16) {
+      return kvz_sad_16x16_stride_avx(data1, data2, stride1);
+    } else if (width == 32) {
+      return kvz_sad_32x32_stride_avx(data1, data2, stride1);
+    } else if (width == 64) {
+      return kvz_sad_64x64_stride_avx(data1, data2, stride1);
+    }
+  }
+
+  if (width * height >= 16) {
+    // Call the vectorized general SAD SSE41 function when the block
+    // is big enough to make it worth it.
+    return kvz_reg_sad_sse41(data1, data2, width, height, stride1, stride2);
   } else {
-    return kvz_sad_other_avx(data1, data2, width, height, stride1, stride2);
+    return kvz_sad_other_avx(data1, data2, width, height, stride1);
   }
 }
 
 #endif //defined(KVZ_COMPILE_ASM)
 
-int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) {
+int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth)
+{
   bool success = true;
 #if defined(KVZ_COMPILE_ASM)
   if (bitdepth == 8){

kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm.h -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.h Changed

kvazaar-0.8.3.tar.gz/src/strategyselector.c -> kvazaar-1.0.0.tar.gz/src/strategyselector.c Changed

@@ -20,14 +20,22 @@
 
 #include "strategyselector.h"
 
-#include <assert.h>
-#include <string.h>
+#include <stdio.h>
 #include <stdlib.h>
-#if COMPILE_INTEL
-#include <immintrin.h>
+#include <string.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#elif MACOS
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#else
+#include <unistd.h>
 #endif
 
 hardware_flags_t kvz_g_hardware_flags;
+hardware_flags_t kvz_g_strategies_in_use;
+hardware_flags_t kvz_g_strategies_available;
 
 static void set_hardware_flags(int32_t cpuid);
 static void* strategyselector_choose_for(const strategy_list_t * const strategies, const char * const strategy_type);
@@ -75,6 +83,11 @@
     fprintf(stderr, "kvz_strategy_register_intra failed!\n");
     return 0;
   }
+
+  if (!kvz_strategy_register_sao(&strategies, bitdepth)) {
+    fprintf(stderr, "kvz_strategy_register_sao failed!\n");
+    return 0;
+  }
   
   while(cur_strategy_to_select->fptr) {
     *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
@@ -85,10 +98,118 @@
     }
     ++cur_strategy_to_select;
   }
-  
+
   //We can free the structure now, as all strategies are statically set to pointers
   if (strategies.allocated) {
-    free(strategies.strategies);
+	  //Also check what optimizations are available and what are in use
+	  //SIMD optimizations available
+	  bool strategies_available = false;
+	  fprintf(stderr, "Available: ");
+	  if (kvz_g_strategies_available.intel_flags.avx != 0){
+		  fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.avx2 != 0){
+		  fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.mmx != 0) {
+		  fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.sse != 0) {
+		  fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.sse2 != 0) {
+		  fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.sse3 != 0) {
+		  fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.sse41 != 0) {
+		  fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.sse42 != 0) {
+		  fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.intel_flags.ssse3 != 0) {
+		  fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.arm_flags.neon != 0) {
+		  fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon);
+		  strategies_available = true;
+	  }
+	  if (kvz_g_strategies_available.powerpc_flags.altivec != 0) {
+		  fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec);
+		  strategies_available = true;
+	  }
+	  //If there are no strategies available
+	  if (!strategies_available){
+		  fprintf(stderr, "no SIMD optimizations");
+	  }
+	  fprintf(stderr, "\n");
+
+	  //SIMD optimizations in use
+	  bool strategies_in_use = false;
+	  fprintf(stderr, "In use: ");
+	  if (kvz_g_strategies_in_use.intel_flags.avx != 0){
+		  fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ 
+		  fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.mmx != 0) {
+		  fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.sse != 0) {
+		  fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) {
+		  fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) {
+		  fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) {
+		  fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) {
+		  fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) {
+		  fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.arm_flags.neon != 0) {
+		  fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon);
+		  strategies_in_use = true;
+	  }
+	  if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) {
+		  fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec);
+		  strategies_in_use = true;
+	  }
+	  //If there are no strategies in use
+	  if (!strategies_in_use){
+		  fprintf(stderr, "no SIMD optimizations");
+	  }
+	  fprintf(stderr, "\n");
+
+	  //Free memory
+	  free(strategies.strategies);
   }
 
   return 1;
@@ -115,6 +236,21 @@
     new_strategy->priority = priority;
     new_strategy->fptr = fptr;
   }
+
+  //Check what strategies are available when they are registered
+  if (strcmp(strategy_name, "avx") == 0) kvz_g_strategies_available.intel_flags.avx++;
+  if (strcmp(strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_available.intel_flags.avx++;
+  if (strcmp(strategy_name, "avx2") == 0) kvz_g_strategies_available.intel_flags.avx2++;
+  if (strcmp(strategy_name, "mmx") == 0) kvz_g_strategies_available.intel_flags.mmx++;
+  if (strcmp(strategy_name, "sse") == 0) kvz_g_strategies_available.intel_flags.sse++;
+  if (strcmp(strategy_name, "sse2") == 0) kvz_g_strategies_available.intel_flags.sse2++;
+  if (strcmp(strategy_name, "sse3") == 0) kvz_g_strategies_available.intel_flags.sse3++;
+  if (strcmp(strategy_name, "sse41") == 0) kvz_g_strategies_available.intel_flags.sse41++;
+  if (strcmp(strategy_name, "sse42") == 0) kvz_g_strategies_available.intel_flags.sse42++;
+  if (strcmp(strategy_name, "ssse3") == 0) kvz_g_strategies_available.intel_flags.ssse3++;
+  if (strcmp(strategy_name, "altivec") == 0) kvz_g_strategies_available.powerpc_flags.altivec++;
+  if (strcmp(strategy_name, "neon") == 0) kvz_g_strategies_available.arm_flags.neon++;
+
 #ifdef DEBUG_STRATEGYSELECTOR
   fprintf(stderr, "Registered strategy %s:%s with priority %d (%p)\n", type, strategy_name, priority, fptr);
 #endif //DEBUG_STRATEGYSELECTOR
@@ -172,6 +308,20 @@
   if (max_priority_i == -1) {
     return NULL;
   }
+
+  //Check what strategy we are going to use
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx2") == 0) kvz_g_strategies_in_use.intel_flags.avx2++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "mmx") == 0) kvz_g_strategies_in_use.intel_flags.mmx++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse") == 0) kvz_g_strategies_in_use.intel_flags.sse++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse2") == 0) kvz_g_strategies_in_use.intel_flags.sse2++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse3") == 0) kvz_g_strategies_in_use.intel_flags.sse3++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse41") == 0) kvz_g_strategies_in_use.intel_flags.sse41++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse42") == 0) kvz_g_strategies_in_use.intel_flags.sse42++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "ssse3") == 0) kvz_g_strategies_in_use.intel_flags.ssse3++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "altivec") == 0) kvz_g_strategies_in_use.powerpc_flags.altivec++;
+  if (strcmp(strategies->strategies[max_priority_i].strategy_name, "neon") == 0) kvz_g_strategies_in_use.arm_flags.neon++;

kvazaar-0.8.3.tar.gz/src/strategyselector.h -> kvazaar-1.0.0.tar.gz/src/strategyselector.h Changed

@@ -26,77 +26,12 @@
  * Dynamic dispatch based on cpuid.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
 
 #if defined(KVZ_DEBUG) && !defined(DEBUG_STRATEGYSELECTOR)
 # define DEBUG_STRATEGYSELECTOR
 #endif
 
-//Hardware data (abstraction of defines). Extend for other compilers
-
-#if defined(_M_IX86) || defined(__i586__) || defined(__i686__) || defined(_M_X64) || defined(_M_AMD64) || defined(__amd64__) || defined(__x86_64__)
-#  define COMPILE_INTEL 1
-#else
-#  define COMPILE_INTEL 0
-#endif
-
-// Visual Studio note:
-// Because these macros are only used to guard code that is guarded by CPUID
-// at runtime, use /arch parameter to disable them, but enable all intrinsics
-// supported by VisualStudio if SSE2 (highest) is enabled.
-// AVX and AVX2 are handled by /arch directly and sse intrinsics will use VEX
-// versions if they are defined.
-#define MSC_X86_SIMD(level) (_M_X64 || (_M_IX86_FP >= (level)))
-
-#if COMPILE_INTEL
-#  if defined(__MMX__) || MSC_X86_SIMD(1)
-#    define COMPILE_INTEL_MMX 1
-#  endif
-#  if defined(__SSE__) || MSC_X86_SIMD(1)
-#    define COMPILE_INTEL_SSE 1
-#  endif
-#  if defined(__SSE2__) || MSC_X86_SIMD(2)
-#    define COMPILE_INTEL_SSE2 1
-#  endif
-#  if defined(__SSE3__)
-#    define COMPILE_INTEL_SSE3 1
-#  endif
-#  if defined(__SSSE3__) || MSC_X86_SIMD(2)
-#    define COMPILE_INTEL_SSSE3 1
-#  endif
-#  if defined(__SSE4_1__) || MSC_X86_SIMD(2)
-#    define COMPILE_INTEL_SSE41 1
-#  endif
-#  if defined(__SSE4_2__) || MSC_X86_SIMD(2)
-#    define COMPILE_INTEL_SSE42 1
-#  endif
-#  if defined(__AVX__)
-#    define COMPILE_INTEL_AVX 1
-#   endif
-#  if defined(__AVX2__)
-#    define COMPILE_INTEL_AVX2 1
-#   endif
-#endif
-
-#if defined (_M_PPC) || defined(__powerpc64__) || defined(__powerpc__)
-#  define COMPILE_POWERPC 1
-#  ifdef __ALTIVEC__
-#    define COMPILE_POWERPC_ALTIVEC 1
-#  else
-#    define COMPILE_POWERPC_ALTIVEC 0
-#  endif
-#else
-#  define COMPILE_POWERPC 0
-#endif
-
-#if defined (_M_ARM) || defined(__arm__) || defined(__thumb__)
-#  define COMPILE_ARM 1
-#else
-#  define COMPILE_ARM 0
-#endif
-
-
-
 typedef struct {
   const char *type; //Type of the function, usually its name
   const char *strategy_name; //Name of the strategy (e.g. sse2)
@@ -106,7 +41,7 @@
 
 typedef struct {
   unsigned int count;
-  unsigned int allocated;
+  unsigned int allocated;//How much memory is allocated
   strategy_t* strategies;
 } strategy_list_t;
 
@@ -128,6 +63,8 @@
     int sse42;
     int avx;
     int avx2;
+
+    bool hyper_threading;
   } intel_flags;
   
   struct {
@@ -137,10 +74,14 @@
   struct {
     int neon;
   } arm_flags;
+
+  int logical_cpu_count;
+  int physical_cpu_count;
 } hardware_flags_t;
 
 extern hardware_flags_t kvz_g_hardware_flags;
-
+extern hardware_flags_t kvz_g_strategies_in_use;
+extern hardware_flags_t kvz_g_strategies_available;
 
 int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth);
 int kvz_strategyselector_register(void *opaque, const char *type, const char *strategy_name, int priority, void *fptr);
@@ -153,6 +94,7 @@
 #include "strategies/strategies-ipol.h"
 #include "strategies/strategies-quant.h"
 #include "strategies/strategies-intra.h"
+#include "strategies/strategies-sao.h"
 
 static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_NAL_EXPORTS
@@ -161,6 +103,7 @@
   STRATEGIES_IPOL_EXPORTS
   STRATEGIES_QUANT_EXPORTS
   STRATEGIES_INTRA_EXPORTS
+  STRATEGIES_SAO_EXPORTS
   { NULL, NULL },
 };

kvazaar-0.8.3.tar.gz/src/tables.h -> kvazaar-1.0.0.tar.gz/src/tables.h Changed

kvazaar-0.8.3.tar.gz/src/threadqueue.c -> kvazaar-1.0.0.tar.gz/src/threadqueue.c Changed

@@ -17,21 +17,19 @@
  * You should have received a copy of the GNU General Public License along
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
- 
-#include <assert.h>
+
+#include "threadqueue.h"
+
+#include <errno.h> // ETIMEDOUT
 #include <pthread.h>
-#include <errno.h> //ETIMEDOUT
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef KVZ_DEBUG
-#include <string.h>
-#endif //KVZ_DEBUG
-
 #include "global.h"
-#include "threadqueue.h"
 #include "threads.h"
 
+
 typedef struct {
   threadqueue_queue_t * threadqueue;
   int worker_id;
@@ -74,36 +72,38 @@
 } while (0);
 #endif //PTHREAD_DUMP
 
-const struct timespec kvz_time_to_wait = {1, 0};
-
-static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
+static void* threadqueue_worker(void* threadqueue_worker_spec_opaque)
+{
   threadqueue_worker_spec * const threadqueue_worker_spec = threadqueue_worker_spec_opaque;
   threadqueue_queue_t * const threadqueue = threadqueue_worker_spec->threadqueue;
   threadqueue_job_t * next_job = NULL;
-  
+
 #ifdef KVZ_DEBUG
   KVZ_GET_TIME(&threadqueue->debug_clock_thread_start[threadqueue_worker_spec->worker_id]);
 #endif //KVZ_DEBUG
 
   for(;;) {
-    int i = 0;
     threadqueue_job_t * job = NULL;
-    
+
     PTHREAD_LOCK(&threadqueue->lock);
 
     while(!threadqueue->stop && threadqueue->queue_waiting_execution == 0 && !next_job) {
+      // Wait until there is something to do in the queue.
       PTHREAD_COND_WAIT(&threadqueue->cond, &threadqueue->lock);
     }
-    
+
     if(threadqueue->stop) {
       if (next_job) {
+        // Put a job we had already reserved back into the queue.
+        // FIXME: This lock should be unnecessary, as nobody else is allowed
+        // to touch this job when it's running.
         PTHREAD_LOCK(&next_job->lock);
         next_job->state = THREADQUEUE_JOB_STATE_QUEUED;
         PTHREAD_UNLOCK(&next_job->lock);
       }
       break;
     }
-    
+
     //Find a task (should be fast enough)
     job = NULL;
     if (next_job) {
@@ -113,13 +113,15 @@
       //FIXME: if not using OWF, the first is better than the second, otherwise we should use the second order
       //for (i = threadqueue->queue_count - 1; i >= threadqueue->queue_start; --i) {
       //for (i = threadqueue->queue_start; i < threadqueue->queue_count; ++i) {
-        
-      for (i = (threadqueue->fifo ? threadqueue->queue_start : threadqueue->queue_count - 1);
+
+      for (int i = (threadqueue->fifo ? threadqueue->queue_start : threadqueue->queue_count - 1);
            (threadqueue->fifo ? i < threadqueue->queue_count : i >= threadqueue->queue_start); 
            (threadqueue->fifo ? ++i : --i)) {
         threadqueue_job_t * const i_job = threadqueue->queue[i];
-        
+
         if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
+          // Once we have found a job with no dependencies, lock it and change
+          // its state to running, so nobody else can claim it.
           PTHREAD_LOCK(&i_job->lock);
           if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
             job = i_job;
@@ -130,58 +132,69 @@
         }
       }
     }
-    
-    //Ok we got a job (and we have a lock on it)
-    if (job) {
-      int queue_waiting_dependency_decr, queue_waiting_execution_incr;
 
+    if (!job) {
+      // We have no job. Probably because more threads were woken up than
+      // there were jobs to do.
+      PTHREAD_UNLOCK(&threadqueue->lock);
+    } else {
+      // We have a job with ndepends==0 and its state is running.
       assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
-      
-      //Move the queue_start "pointer" if needed
-      while (threadqueue->queue_start < threadqueue->queue_count && threadqueue->queue[threadqueue->queue_start]->state != THREADQUEUE_JOB_STATE_QUEUED) threadqueue->queue_start++;
+
+      // Advance queue_start to skip all the running jobs.
+      while (threadqueue->queue_start < threadqueue->queue_count &&
+             threadqueue->queue[threadqueue->queue_start]->state != THREADQUEUE_JOB_STATE_QUEUED)
+      {
+        threadqueue->queue_start++;
+      }
       
       if (!next_job) {
         --threadqueue->queue_waiting_execution;
         ++threadqueue->queue_running;
       }
-      
-      //Unlock the queue
+
       PTHREAD_UNLOCK(&threadqueue->lock);
-      
+
 #ifdef KVZ_DEBUG
       job->debug_worker_id = threadqueue_worker_spec->worker_id;
       KVZ_GET_TIME(&job->debug_clock_start);
 #endif //KVZ_DEBUG
-      
+
       job->fptr(job->arg);
-      
+
 #ifdef KVZ_DEBUG
       job->debug_worker_id = threadqueue_worker_spec->worker_id;
       KVZ_GET_TIME(&job->debug_clock_stop);
 #endif //KVZ_DEBUG
-      
-      //Re-lock the job to update its status and treat its dependencies
+
+      // FIXME: This lock should be unnecessary, as nobody else is allowed
+      // to touch this job when it's running.
       PTHREAD_LOCK(&job->lock);
       assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
-      
+
       job->state = THREADQUEUE_JOB_STATE_DONE;
-      
+
       next_job = NULL;
-      
-      queue_waiting_dependency_decr = 0;
-      queue_waiting_execution_incr = 0;
-      //Decrease counter of dependencies
-      for (i = 0; i < job->rdepends_count; ++i) {
+
+      int queue_waiting_dependency_decr = 0;
+      int queue_waiting_execution_incr = 0;
+
+      // Go through all the jobs that depend on this one, decreasing their ndepends.
+      for (int i = 0; i < job->rdepends_count; ++i) {
         threadqueue_job_t * const depjob = job->rdepends[i];
-        //Note that we lock the dependency AFTER locking the source. This avoids a deadlock in dep_add
+        // Note that we lock the dependency AFTER locking the source. This avoids a deadlock in dep_add.
         PTHREAD_LOCK(&depjob->lock);
-        
+
         assert(depjob->state == THREADQUEUE_JOB_STATE_QUEUED);
         assert(depjob->ndepends > 0);
         --depjob->ndepends;
-        
+
+        // Count how many jobs can now start executing so we know how many
+        // threads to wake up.
         if (depjob->ndepends == 0) {
           if (!next_job) {
+            // Avoid having to find a new job for this worker through the
+            // queue by taking one of the jobs that depended on current job.
             next_job = depjob;
             depjob->state = THREADQUEUE_JOB_STATE_RUNNING;
           } else {
@@ -189,30 +202,37 @@
           }
           ++queue_waiting_dependency_decr;
         }
-        
+
         PTHREAD_UNLOCK(&depjob->lock);
       }
-      //Unlock the job
-      PTHREAD_UNLOCK(&job->lock);
       
-      //Signal the queue that we've done a job

kvazaar-0.8.3.tar.gz/src/threadqueue.h -> kvazaar-1.0.0.tar.gz/src/threadqueue.h Changed

kvazaar-0.8.3.tar.gz/src/threads.h -> kvazaar-1.0.0.tar.gz/src/threads.h Changed

@@ -26,20 +26,24 @@
  * Abstractions for operating system specific stuff.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
 
 #include <pthread.h>
 
+#define E3 1000
+#define E9 1000000000
+#define FILETIME_TO_EPOCH 0x19DB1DED53E8000LL
+
 #if defined(__GNUC__) && !defined(__MINGW32__) 
-#include <unistd.h>
-#include <time.h>
+#include <unistd.h> // IWYU pragma: export
+#include <time.h> // IWYU pragma: export
 
 #define KVZ_CLOCK_T struct timespec
 
 #ifdef __MACH__
 // Workaround Mac OS not having clock_gettime.
-#include <mach/clock.h>
-#include <mach/mach.h>
+#include <mach/clock.h> // IWYU pragma: export
+#include <mach/mach.h> // IWYU pragma: export
 #define KVZ_GET_TIME(clock_t) { \
   clock_serv_t cclock; \
   mach_timespec_t mts; \
@@ -53,31 +57,68 @@
 #define KVZ_GET_TIME(clock_t) { clock_gettime(CLOCK_MONOTONIC, (clock_t)); }
 #endif
 
-#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)((ts).tv_sec) + (double)((ts).tv_nsec) / (double)1000000000L)
-#define KVZ_CLOCK_T_DIFF(start, stop) ((double)((stop).tv_sec - (start).tv_sec) + (double)((stop).tv_nsec - (start).tv_nsec) / (double)1000000000L)
+#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)((ts).tv_sec) + (double)((ts).tv_nsec) / 1e9)
+#define KVZ_CLOCK_T_DIFF(start, stop) ((double)((stop).tv_sec - (start).tv_sec) + (double)((stop).tv_nsec - (start).tv_nsec) / 1e9)
+
+/**
+ * \brief Fill a timespec with the moment wait_ms milliseconds after the
+ *        current KVZ_GET_TIME reading.
+ *
+ * \param result   timespec to overwrite
+ * \param wait_ms  offset from the current time in milliseconds
+ * \return         result
+ */
+static INLINE struct timespec * ms_from_now_timespec(struct timespec * result, int wait_ms)
+{
+  KVZ_GET_TIME(result);
+
+  // Split the delay into whole seconds and the leftover nanoseconds.
+  const int delay_secs = wait_ms / E3;
+  const int delay_nsecs = (wait_ms % E3) * (E9 / E3);
+
+  int64_t total_secs = result->tv_sec + delay_secs;
+  int64_t total_nsecs = result->tv_nsec + delay_nsecs;
+
+  // At most one second is carried over from the nanosecond field.
+  if (!(total_nsecs < E9)) {
+    total_nsecs -= E9;
+    total_secs += 1;
+  }
+
+  result->tv_nsec = total_nsecs;
+  result->tv_sec = total_secs;
+  return result;
+}
 
 #define KVZ_ATOMIC_INC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, 1)
 #define KVZ_ATOMIC_DEC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, -1)
-#define KVZ_SLEEP()                             usleep(0)
 
 #else //__GNUC__
 //TODO: we assume !GCC => Windows... this may be bad
-#include <windows.h>
+#include <windows.h> // IWYU pragma: export
 
 #define KVZ_CLOCK_T struct _FILETIME
 #define KVZ_GET_TIME(clock_t) { GetSystemTimeAsFileTime(clock_t); }
 // _FILETIME has 32bit low and high part of 64bit 100ns resolution timestamp (since 12:00 AM January 1, 1601)
-#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)(((uint64_t)(ts).dwHighDateTime)<<32 | (uint64_t)(ts).dwLowDateTime) / (double)10000000L)
+#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)(((uint64_t)(ts).dwHighDateTime)<<32 | (uint64_t)(ts).dwLowDateTime) / 1e7)
 #define KVZ_CLOCK_T_DIFF(start, stop) ((double)((((uint64_t)(stop).dwHighDateTime)<<32 | (uint64_t)(stop).dwLowDateTime) - \
-                                  (((uint64_t)(start).dwHighDateTime)<<32 | (uint64_t)(start).dwLowDateTime)) / (double)10000000L)
+                                  (((uint64_t)(start).dwHighDateTime)<<32 | (uint64_t)(start).dwLowDateTime)) / 1e7)
+
+/**
+ * \brief Fill a timespec with the moment wait_ms milliseconds after the
+ *        current system time.
+ *
+ * \param result   timespec to overwrite
+ * \param wait_ms  offset from the current time in milliseconds
+ * \return         result
+ */
+static INLINE struct timespec * ms_from_now_timespec(struct timespec * result, int wait_ms)
+{
+  KVZ_CLOCK_T now;
+  KVZ_GET_TIME(&now);
 
+  // Join the two 32-bit halves into a single count of 100 ns ticks, then
+  // subtract FILETIME_TO_EPOCH.
+  // NOTE(review): presumably this rebases the 1601-based FILETIME value to
+  // the Unix epoch -- confirm against the constant's definition.
+  int64_t moment_100ns = (int64_t)now.dwHighDateTime << 32 | (int64_t)now.dwLowDateTime;
+  moment_100ns -= (int64_t)FILETIME_TO_EPOCH;
+   
+  // E9 / 100 is the number of 100 ns ticks per second; the remainder is
+  // scaled by 100 to get nanoseconds.
+  int64_t secs = moment_100ns / (E9 / 100) + (wait_ms / E3);
+  int64_t nsecs = (moment_100ns % (E9 / 100))*100 + ((wait_ms % E3) * (E9 / E3));
+  
+  // Carry at most one second out of the nanosecond field.
+  if (nsecs >= E9) {
+    secs += 1;
+    nsecs -= E9;
+  }
+
+  result->tv_sec = secs;
+  result->tv_nsec = nsecs;
+
+  return result;
+}
 
 #define KVZ_ATOMIC_INC(ptr)                     InterlockedIncrement((volatile LONG*)ptr)
 #define KVZ_ATOMIC_DEC(ptr)                     InterlockedDecrement((volatile LONG*)ptr)
-// Sleep(0) results in bad performance on Windows for some reason,
-// As a work around sleep for 10ms.
-#define KVZ_SLEEP()                             Sleep(10)
 
 #endif //__GNUC__
 
+#undef E9
+#undef E3
+
 #endif //THREADS_H_

kvazaar-0.8.3.tar.gz/src/transform.c -> kvazaar-1.0.0.tar.gz/src/transform.c Changed

@@ -20,17 +20,20 @@
 
 #include "transform.h"
 
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include "nal.h"
+#include "image.h"
+#include "kvazaar.h"
 #include "rdo.h"
 #include "strategies/strategies-dct.h"
 #include "strategies/strategies-quant.h"
-#include "strategies/generic/quant-generic.h"
-#include "strategies/strategies-picture.h"
+#include "tables.h"
+
+/**
+ * \brief RDPCM direction.
+ *
+ * Selects which neighbour rdpcm() subtracts from each sample: the previous
+ * sample in the same row for RDPCM_HOR, otherwise the sample one stride
+ * earlier.
+ */
+typedef enum rdpcm_dir {
+  RDPCM_VER = 0, // vertical
+  RDPCM_HOR = 1, // horizontal
+} rdpcm_dir;
 
 //////////////////////////////////////////////////////////////////////////
 // INITIALIZATIONS
@@ -50,6 +53,76 @@
 //
 
 /**
+ * \brief Bypass transform and quantization.
+ *
+ * Copies the reference pixels directly to reconstruction and the residual
+ * directly to coefficients. Used when cu_transquant_bypass_flag is set.
+ * Parameters pred_in and rec_out may be aliased.
+ *
+ * \param width       Transform width.
+ * \param in_stride   Stride for ref_in and pred_in
+ * \param out_stride  Stride for rec_out and coeff_out.
+ * \param ref_in      Reference pixels.
+ * \param pred_in     Predicted pixels.
+ * \param rec_out     Returns the reconstructed pixels.
+ * \param coeff_out   Returns the coefficients used for reconstruction of rec_out.
+ *
+ * \returns  Whether coeff_out contains any non-zero coefficients.
+ */
+static bool bypass_transquant(const int width,
+                              const int in_stride,
+                              const int out_stride,
+                              const kvz_pixel *const ref_in,
+                              const kvz_pixel *const pred_in,
+                              kvz_pixel *rec_out,
+                              coeff_t *coeff_out)
+{
+  // Becomes true as soon as one residual sample differs from zero.
+  bool found_nonzero = false;
+
+  for (int row = 0; row < width; ++row) {
+    // Offsets of the first sample of this row in the input and output arrays.
+    const int32_t in_row  = row * in_stride;
+    const int32_t out_row = row * out_stride;
+
+    for (int col = 0; col < width; ++col) {
+      const int32_t src = in_row + col;
+      const int32_t dst = out_row + col;
+
+      // Read the prediction before storing to rec_out, since pred_in and
+      // rec_out may point to the same array.
+      const kvz_pixel original = ref_in[src];
+      const coeff_t residual   = (coeff_t)(original - pred_in[src]);
+
+      coeff_out[dst] = residual;
+      rec_out[dst]   = original;
+
+      if (residual != 0) {
+        found_nonzero = true;
+      }
+    }
+  }
+
+  return found_nonzero;
+}
+
+/**
+ * Apply DPCM to residual.
+ *
+ * \param width   width of the block
+ * \param stride  stride of coeff array
+ * \param dir     RDPCM direction
+ * \param coeff   coefficients (residual) to filter
+ */
+static void rdpcm(const int width,
+                  const int stride,
+                  const rdpcm_dir dir,
+                  coeff_t *coeff)
+{
+  const int horizontal = (dir == RDPCM_HOR);
+
+  // Distance to the sample that gets subtracted: one position back in the
+  // row for horizontal, one stride back for vertical.
+  const int neighbour = horizontal ? 1 : stride;
+  // The first column (horizontal) or first row (vertical) is left untouched.
+  const int first_col = horizontal ? 1 : 0;
+  const int first_row = horizontal ? 0 : 1;
+
+  // Walk backwards so that every sample is differenced against a neighbour
+  // that has not been modified yet.
+  for (int row = width; row-- > first_row; ) {
+    coeff_t *const line = &coeff[row * stride];
+    for (int col = width; col-- > first_col; ) {
+      line[col] -= line[col - neighbour];
+    }
+  }
+}
+
+/**
  * \brief Get scaled QP used in quantization
  *
  */
@@ -158,7 +231,7 @@
     int has_coeffs;
   } skip, noskip, *best;
 
-  const int bit_cost = (int)(state->global->cur_lambda_cost+0.5);
+  const int bit_cost = (int)(state->frame->cur_lambda_cost+0.5);
   
   noskip.has_coeffs = kvz_quantize_residual(
       state, cur_cu, width, color, scan_order,
@@ -212,13 +285,12 @@
  * - lcu->cbf  coded block flags for the area
  * - lcu->cu.intra[].tr_skip  for the area
  */
-void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu)
+void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_pu, lcu_t* lcu)
 {
   // we have 64>>depth transform size
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
-  const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
-  if (cur_cu == NULL) {
-    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
+  if (cur_pu == NULL) {
+    cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
   const int8_t width = LCU_WIDTH>>depth;
   
@@ -227,7 +299,7 @@
   assert(width == 4 || width == 8 || width == 16 || width == 32 || width == 64);
 
   // Split transform and increase depth
-  if (depth == 0 || cur_cu->tr_depth > depth) {
+  if (depth == 0 || cur_pu->tr_depth > depth) {
     int offset = width / 2;
     kvz_quantize_lcu_luma_residual(state, x,          y,          depth+1, NULL, lcu);
     kvz_quantize_lcu_luma_residual(state, x + offset, y,          depth+1, NULL, lcu);
@@ -235,13 +307,13 @@
     kvz_quantize_lcu_luma_residual(state, x + offset, y + offset, depth+1, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
-    if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
-      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
-      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
-      if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) {
-        cbf_set(&cur_cu->cbf.y, depth);
-      }
+    if (depth <= MAX_DEPTH) {
+      uint16_t child_cbfs[3] = {
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
+        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
+      };
+      cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y);
     }
 
     return;
@@ -257,7 +329,7 @@
     // Pointers to current location in arrays with quantized coefficients.
     coeff_t *orig_coeff_y = &lcu->coeff.y[luma_offset];
 
-    coeff_scan_order_t scan_idx_luma = kvz_get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
+    coeff_scan_order_t scan_idx_luma = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth);
 
     #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
     uint32_t residual_sum = 0;
@@ -266,30 +338,45 @@
     // Clear coded block flag structures for depths lower than current depth.
     // This should ensure that the CBF data doesn't get corrupted if this function
     // is called more than once.
-    cbf_clear(&cur_cu->cbf.y, depth + pu_index);
+    cbf_clear(&cur_pu->cbf, depth, COLOR_Y);
 
-    if (width == 4 && 
-        state->encoder_control->trskip_enable)
-    {
+
+    if (state->encoder_control->cfg->lossless) {
+      if (bypass_transquant(width,
+                            LCU_WIDTH, LCU_WIDTH,
+                            base_y, recbase_y,
+                            recbase_y, orig_coeff_y)) {
+        cbf_set(&cur_pu->cbf, depth, COLOR_Y);
+      }
+      if (state->encoder_control->cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) {
+        // implicit rdpcm for horizontal and vertical intra modes
+        if (cur_pu->intra.mode == 10) {
+          rdpcm(width, LCU_WIDTH, RDPCM_HOR, orig_coeff_y);
+
+        } else if (cur_pu->intra.mode == 26) {
+          rdpcm(width, LCU_WIDTH, RDPCM_VER, orig_coeff_y);
+        }
+      }
+    } else if (width == 4 && state->encoder_control->trskip_enable) {
       // Try quantization with trskip and use it if it's better.

kvazaar-0.8.3.tar.gz/src/transform.h -> kvazaar-1.0.0.tar.gz/src/transform.h Changed

kvazaar-0.8.3.tar.gz/src/videoframe.c -> kvazaar-1.0.0.tar.gz/src/videoframe.c Changed

@@ -18,19 +18,23 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "videoframe.h"
+
 #include <stdlib.h>
-#include <string.h>
 
+#include "image.h"
 #include "sao.h"
-#include "threads.h"
-#include "videoframe.h"
+
 
 /**
  * \brief Allocate new frame
  * \param pic picture pointer
  * \return picture pointer
  */
-videoframe_t *kvz_videoframe_alloc(const int32_t width, const int32_t height, const int32_t poc) {
+videoframe_t * kvz_videoframe_alloc(int32_t width,
+                                    int32_t height,
+                                    enum kvz_chroma_format chroma_format)
+{
   videoframe_t *frame = MALLOC(videoframe_t, 1);
 
   if (!frame) return 0;
@@ -45,16 +49,17 @@
   if (frame->height_in_lcu * LCU_WIDTH < frame->height) frame->height_in_lcu++;
 
   {
-    // Allocate height_in_scu x width_in_scu x sizeof(CU_info)
-    unsigned height_in_scu = frame->height_in_lcu << MAX_DEPTH;
-    unsigned width_in_scu = frame->width_in_lcu << MAX_DEPTH;
-    frame->cu_array = kvz_cu_array_alloc(width_in_scu, height_in_scu);
+    unsigned cu_array_width  = frame->width_in_lcu  * LCU_WIDTH;
+    unsigned cu_array_height = frame->height_in_lcu * LCU_WIDTH;
+    frame->cu_array = kvz_cu_array_alloc(cu_array_width, cu_array_height);
   }
 
   frame->coeff_y = NULL; frame->coeff_u = NULL; frame->coeff_v = NULL;
 
   frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
-  frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+  if (chroma_format != KVZ_CSP_400) {
+    frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+  }
 
   return frame;
 }
@@ -89,18 +94,16 @@
   frame->poc = poc;
 }
 
-const cu_info_t* kvz_videoframe_get_cu_const(const videoframe_t * const frame, unsigned int x_in_scu, unsigned int y_in_scu)
+const cu_info_t* kvz_videoframe_get_cu_const(const videoframe_t * const frame,
+                                             unsigned int x_in_scu,
+                                             unsigned int y_in_scu)
 {
-  assert(x_in_scu < (frame->width_in_lcu << MAX_DEPTH));
-  assert(y_in_scu < (frame->height_in_lcu << MAX_DEPTH));
-  
-  return &frame->cu_array->data[x_in_scu + y_in_scu * (frame->width_in_lcu << MAX_DEPTH)];
+  return kvz_cu_array_at_const(frame->cu_array, x_in_scu << 3, y_in_scu << 3);
 }
 
-cu_info_t* kvz_videoframe_get_cu(videoframe_t * const frame, const unsigned int x_in_scu, const unsigned int y_in_scu)
+cu_info_t* kvz_videoframe_get_cu(videoframe_t * const frame,
+                                 const unsigned int x_in_scu,
+                                 const unsigned int y_in_scu)
 {
-  assert(x_in_scu < (frame->width_in_lcu << MAX_DEPTH));
-  assert(y_in_scu < (frame->height_in_lcu << MAX_DEPTH));
-  
-  return &frame->cu_array->data[x_in_scu + y_in_scu * (frame->width_in_lcu << MAX_DEPTH)];
+  return kvz_cu_array_at(frame->cu_array, x_in_scu << 3, y_in_scu << 3);
 }

kvazaar-0.8.3.tar.gz/src/videoframe.h -> kvazaar-1.0.0.tar.gz/src/videoframe.h Changed

kvazaar-0.8.3.tar.gz/src/yuv_io.c -> kvazaar-1.0.0.tar.gz/src/yuv_io.c Changed

@@ -42,7 +42,7 @@
 
 
 static int read_and_fill_frame_data(FILE *file,
-                                    unsigned width, unsigned height,
+                                    unsigned width, unsigned height, unsigned bytes_per_sample,
                                     unsigned array_width, kvz_pixel *data)
 {
   kvz_pixel* p = data;
@@ -52,7 +52,7 @@
 
   while (p < end) {
     // Read the beginning of the line from input.
-    if (width != fread(p, sizeof(unsigned char), width, file))
+    if (width != fread(p, bytes_per_sample, width, file))
       return 0;
 
     // Fill the rest with the last pixel value.
@@ -68,21 +68,104 @@
 }
 
 
-/**
-* \brief Convert 8 bit (single byte per pixel) to 10bit (two bytes per pixel) array
-*
-* \param input   input/output buffer
-* \return        1
-*/
-int frame_8bit_to_10bit(kvz_pixel* input, int width, int height) {
-  uint8_t* temp_buffer = (uint8_t*)input;
-  const uint32_t pixels = width*height;
-  for (int i = pixels - 1; i >= 0; i--) {
-    input[i] = temp_buffer[i] << 2;
+/**
+ * \brief Swap the low and high byte of each sample in a buffer, in place.
+ *
+ * \param input  samples to modify
+ * \param size   number of samples in input
+ */
+static void swap_16b_buffer_bytes(kvz_pixel* input, int size)
+{
+  int idx = 0;
+  while (idx < size) {
+    const int sample    = input[idx];
+    const int low_byte  = sample & 0xff;
+    const int high_byte = (sample & 0xff00) >> 8;
+
+    // The two halves occupy disjoint bits after being moved.
+    input[idx] = (low_byte << 8) | high_byte;
+    ++idx;
+  }
+}
+
+
+/**
+ * \brief Rescale samples in place from one bit depth to another.
+ *
+ * Samples are shifted left when to_bitdepth is larger than from_bitdepth
+ * and shifted right by the difference otherwise.
+ *
+ * \param input          samples to modify
+ * \param size           number of samples in input
+ * \param from_bitdepth  bit depth of the samples on entry
+ * \param to_bitdepth    bit depth of the samples on return
+ */
+static void shift_to_bitdepth(kvz_pixel* input, int size, int from_bitdepth, int to_bitdepth)
+{
+  int shift = to_bitdepth - from_bitdepth;
+  for (int i = 0; i < size; ++i) {
+    // Shifting by a negative number is undefined, so when reducing the bit
+    // depth shift right by the magnitude of the difference.
+    if (shift > 0) {
+      input[i] <<= shift;
+    } else {
+      input[i] >>= -shift;
+    }
+  }
+}
+
+
+/**
+ * \brief Shift and copy 1-byte samples into a 2-byte per sample array.
+ *
+ * The buffer holds size 1-byte samples packed at its start on entry and
+ * size kvz_pixel samples on return.
+ *
+ * \param input          buffer to convert in place
+ * \param size           number of samples
+ * \param from_bitdepth  bit depth of the 1-byte samples
+ * \param to_bitdepth    bit depth of the samples on return
+ */
+static void shift_to_bitdepth_and_spread(kvz_pixel *input,
+                                         int size,
+                                         int from_bitdepth,
+                                         int to_bitdepth)
+{
+  assert(sizeof(kvz_pixel) > 1);
+  int shift = to_bitdepth - from_bitdepth;
+  unsigned char *byte_buf = (unsigned char *)input;
+  
+  // Starting from the back of the 1-byte samples, copy each sample to its
+  // place in the 2-byte per sample array, overwriting the bytes that have
+  // already been copied in the process.
+  // Even though the two pointers are aliased, this should work because the
+  // future values read through the byte_buf pointer never change as a
+  // result of writing through the input pointer.
+  for (int i = size - 1; i >= 0; --i) {
+    // Shifting by a negative number is undefined, so shift right by the
+    // magnitude of the difference when it is not positive.
+    if (shift > 0) {
+      input[i] = byte_buf[i] << shift;
+    } else {
+      input[i] = byte_buf[i] >> -shift;
+    }
   }
+}
+
+
+/**
+ * \brief Check the byte order of the machine at run time.
+ *
+ * \return true if the most significant byte of a 16-bit integer is stored
+ *         first in memory, false otherwise
+ */
+bool machine_is_big_endian()
+{
+  uint16_t number = 1;
+  char first_byte = *(char*)&number;
+
+  // The value 1 has a zero high byte, so the first byte in memory is zero
+  // only when the high byte comes first.
+  return (first_byte == 0);
+}
+
+
+/**
+ * \brief Read one plane of samples from a file into out_buf.
+ *
+ * Reads in_width x in_height samples, extends the plane when the output
+ * dimensions differ from the input dimensions, and converts the samples
+ * from in_bitdepth to out_bitdepth.
+ *
+ * \param file          file to read from
+ * \param in_width      width of the plane in the file
+ * \param in_height     height of the plane in the file
+ * \param in_bitdepth   bit depth of the samples in the file
+ * \param out_width     width of the plane in out_buf
+ * \param out_height    height of the plane in out_buf
+ * \param out_bitdepth  bit depth of the samples in out_buf
+ * \param out_buf       destination buffer
+ * \return              1 on success, 0 if reading from the file failed
+ */
+static int yuv_io_read_plane(
+    FILE* file,
+    unsigned in_width, unsigned in_height, unsigned in_bitdepth,
+    unsigned out_width, unsigned out_height, unsigned out_bitdepth,
+    kvz_pixel *out_buf)
+{
+  // Samples deeper than 8 bits take two bytes each in the file.
+  unsigned bytes_per_sample = in_bitdepth > 8 ? 2 : 1;
+  unsigned buf_length = in_width * in_height;
+  unsigned buf_bytes = buf_length * bytes_per_sample;
+
+  if (in_width == out_width) {
+    // No need to extend pixels.
+    const size_t pixel_size = sizeof(unsigned char);
+    if (fread(out_buf, pixel_size, buf_bytes, file) != buf_bytes)  return 0;
+  } else {
+    // Need to copy pixels to fill the image in horizontal direction.
+    if (!read_and_fill_frame_data(file, in_width, in_height, bytes_per_sample, out_width, out_buf)) return 0;
+  }
+
+  if (in_height != out_height) {
+    // Need to copy pixels to fill the image in vertical direction.
+    fill_after_frame(in_height, out_width, out_height, out_buf);
+  }
+
+  // NOTE(review): the byte swap and bit depth conversion below cover
+  // buf_length = in_width * in_height samples, which is fewer than
+  // out_width * out_height when the plane was extended -- confirm that
+  // this is intended.
+  if (in_bitdepth > 8) {
+    if (machine_is_big_endian()) {
+      swap_16b_buffer_bytes(out_buf, buf_length);
+    }
+  }
+
+  // 1-byte input going to a deeper output has to be widened as well as
+  // shifted; any other depth mismatch is shifted in place.
+  if (in_bitdepth <= 8 && out_bitdepth > 8) {
+    shift_to_bitdepth_and_spread(out_buf, buf_length, in_bitdepth, out_bitdepth);
+  } else if (in_bitdepth != out_bitdepth) {
+    shift_to_bitdepth(out_buf, buf_length, in_bitdepth, out_bitdepth);
+  }
+  
   return 1;
 }
 
+
 /**
  * \brief Read a single frame from a file.
  *
@@ -97,46 +180,43 @@
  * \return              1 on success, 0 on failure
  */
 int yuv_io_read(FILE* file,
-                unsigned input_width, unsigned input_height,
+                unsigned in_width, unsigned out_width,
+                unsigned in_bitdepth, unsigned out_bitdepth,
                 kvz_picture *img_out)
 {
-  assert(input_width % 2 == 0);
-  assert(input_height % 2 == 0);
+  assert(in_width % 2 == 0);
+  assert(out_width % 2 == 0);
 
-  const unsigned y_size = input_width * input_height;
-  const unsigned uv_input_width  = input_width  / 2;
-  const unsigned uv_input_height = input_height / 2;
-  const unsigned uv_size = uv_input_width * uv_input_height;
+  int ok;
 
-  const unsigned uv_array_width  = img_out->width  / 2;
-  const unsigned uv_array_height = img_out->height  / 2;
+  ok = yuv_io_read_plane(
+      file, 
+      in_width, out_width, in_bitdepth,
+      img_out->width, img_out->height, out_bitdepth,
+      img_out->y);
+  if (!ok) return 0;
 
-  if (input_width == img_out->width) {
-    // No need to extend pixels.
-    const size_t pixel_size = sizeof(unsigned char);
-    if (fread(img_out->y, pixel_size, y_size,  file) != y_size)  return 0;
-    if (fread(img_out->u, pixel_size, uv_size, file) != uv_size) return 0;
-    if (fread(img_out->v, pixel_size, uv_size, file) != uv_size) return 0;
-  } else {
-    // Need to copy pixels to fill the image in horizontal direction.
-    if (!read_and_fill_frame_data(file, input_width,    input_height,    img_out->width, img_out->y)) return 0;
-    if (!read_and_fill_frame_data(file, uv_input_width, uv_input_height, uv_array_width, img_out->u)) return 0;
-    if (!read_and_fill_frame_data(file, uv_input_width, uv_input_height, uv_array_width, img_out->v)) return 0;
-  }
-
-  if (input_height != img_out->height) {
-    // Need to copy pixels to fill the image in vertical direction.
-    fill_after_frame(input_height,    img_out->width, img_out->height,    img_out->y);
-    fill_after_frame(uv_input_height, uv_array_width, uv_array_height, img_out->u);
-    fill_after_frame(uv_input_height, uv_array_width, uv_array_height, img_out->v);
-  }
+  if (img_out->chroma_format != KVZ_CSP_400) {
+    unsigned uv_width_in = in_width / 2;
+    unsigned uv_height_in = out_width / 2;
+    unsigned uv_width_out = img_out->width / 2;
+    unsigned uv_height_out = img_out->height / 2;
 
-#if KVZ_BIT_DEPTH == 10
-  frame_8bit_to_10bit(img_out->y, img_out->width, img_out->height);
-	frame_8bit_to_10bit(img_out->u, img_out->width >> 1, img_out->height >> 1);
-	frame_8bit_to_10bit(img_out->v, img_out->width >> 1, img_out->height >> 1);
-#endif
+    ok = yuv_io_read_plane(
+        file,
+        uv_width_in, uv_height_in, in_bitdepth,
+        uv_width_out, uv_height_out, out_bitdepth,
+        img_out->u);
+    if (!ok) return 0;

kvazaar-0.8.3.tar.gz/src/yuv_io.h -> kvazaar-1.0.0.tar.gz/src/yuv_io.h Changed

kvazaar-0.8.3.tar.gz/tests/mv_cand_tests.c -> kvazaar-1.0.0.tar.gz/tests/mv_cand_tests.c Changed

@@ -17,46 +17,206 @@
  * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "src/inter.c"
+
 #include <string.h>
 
 #include "greatest/greatest.h"
 
-#include "src/cu.h"
-#include "src/inter.h"
-
 TEST test_get_spatial_merge_cand(void)
 {
   lcu_t lcu;
   memset(&lcu, 0, sizeof(lcu));
   for (int i = 0; i < sizeof(lcu.cu) / sizeof(cu_info_t); i++) {
-    lcu.cu[i].coded = 1;
     lcu.cu[i].type = CU_INTER;
   }
-  lcu.cu[20].coded = 1;
-  lcu.cu[22].coded = 1;
-  lcu.cu[23].coded = 1;
-  lcu.cu[56].coded = 1;
-  lcu.cu[65].coded = 1;
 
   cu_info_t *mv_cand[5] = { NULL };
-  kvz_inter_get_spatial_merge_candidates(16, 16, // x, y
-                                         16, 32, // width, height
-                                         &mv_cand[0], // b0
-                                         &mv_cand[1], // b1
-                                         &mv_cand[2], // b2
-                                         &mv_cand[3], // a0
-                                         &mv_cand[4], // a1
-                                         &lcu);
-
-  ASSERT_EQ(mv_cand[0], &lcu.cu[23]); // b0
-  ASSERT_EQ(mv_cand[1], &lcu.cu[22]); // b1
-  ASSERT_EQ(mv_cand[2], &lcu.cu[20]); // b2
-  ASSERT_EQ(mv_cand[3], &lcu.cu[65]); // a0
-  ASSERT_EQ(mv_cand[4], &lcu.cu[56]); // a1
+  get_spatial_merge_candidates(64 + 32, 64, // x, y
+                               32, 24,      // width, height
+                               1920, 1080,  // picture size
+                               &mv_cand[0], // b0
+                               &mv_cand[1], // b1
+                               &mv_cand[2], // b2
+                               &mv_cand[3], // a0
+                               &mv_cand[4], // a1
+                               &lcu);
+
+  ASSERT_EQ(mv_cand[0], &lcu.cu[289]); // b0
+  ASSERT_EQ(mv_cand[1], &lcu.cu[ 16]); // b1
+  ASSERT_EQ(mv_cand[2], &lcu.cu[  8]); // b2
+  ASSERT_EQ(mv_cand[3], &lcu.cu[127]); // a0
+  ASSERT_EQ(mv_cand[4], &lcu.cu[110]); // a1
+
+  PASS();
+}
+
+TEST test_is_a0_cand_coded()
+{
+  // +--+--+
+  // |##|  |
+  // +--+--+
+  // |  |  |
+  // +--+--+
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 16, 16), true);
+  // Same as above with a 2NxN block
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 32, 16), true);
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 32, 8), true);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 32, 24), true);
+
+  // +--+--+
+  // |  |##|
+  // +--+--+
+  // |  |  |
+  // +--+--+
+  ASSERT_EQ(is_a0_cand_coded(16, 0, 16, 16), false);
+
+  // +--+--+
+  // |  |  |
+  // +--+--+
+  // |  |##|
+  // +--+--+
+  ASSERT_EQ(is_a0_cand_coded(48, 16, 16, 16), false);
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_a0_cand_coded(48, 0, 16, 32), false);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_a0_cand_coded(40, 0, 24, 32), false);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_a0_cand_coded(56, 0, 8, 32), false);
+
+  // +-----+--+--+
+  // |     |  |  |
+  // |     +--+--+
+  // |     |##|  |
+  // +-----+--+--+
+  // |     |     |
+  // |     |     |
+  // |     |     |
+  // +-----+-----+
+  ASSERT_EQ(is_a0_cand_coded(32, 16, 16, 16), false);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_a0_cand_coded(32, 8, 32, 24), false);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_a0_cand_coded(32, 24, 32, 8), false);
+
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 16, 32), false);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 8, 32), false);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 24, 32), false);
+
+  // +--+--+-----+
+  // |  |  |     |
+  // +--+--+     |
+  // |##|  |     |
+  // +--+--+-----+
+  // |     |     |
+  // |     |     |
+  // |     |     |
+  // +-----+-----+
+  ASSERT_EQ(is_a0_cand_coded(32, 8, 8, 8), true);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_a0_cand_coded(32, 4, 16, 12), true);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_a0_cand_coded(32, 12, 16, 4), true);
+
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 8, 16), true);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 4, 16), true);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 12, 16), true);
+
+  PASS();
+}
+
+TEST test_is_b0_cand_coded()
+{
+  // +--+--+
+  // |##|  |
+  // +--+--+
+  // |  |  |
+  // +--+--+
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 16, 16), true);
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 16, 32), true);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 24, 32), true);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 8, 32), true);
+
+  // +--+--+
+  // |  |  |
+  // +--+--+
+  // |##|  |
+  // +--+--+
+  ASSERT_EQ(is_b0_cand_coded(32, 16, 16, 16), true);
+
+  // +--+--+
+  // |  |  |
+  // +--+--+
+  // |  |##|
+  // +--+--+
+  ASSERT_EQ(is_b0_cand_coded(48, 16, 16, 16), false);
+  // Same as above with a 2NxN block
+  ASSERT_EQ(is_b0_cand_coded(32, 16, 32, 16), false);
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_b0_cand_coded(32, 8, 32, 24), false);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_b0_cand_coded(32, 24, 32, 8), false);
+
+  // +-----+-----+
+  // |     |     |
+  // |     |     |
+  // |     |     |
+  // +-----+--+--+
+  // |     |  |##|
+  // |     +--+--+
+  // |     |  |  |
+  // +-----+--+--+
+  ASSERT_EQ(is_b0_cand_coded(48, 32, 16, 16), false);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_b0_cand_coded(32, 32, 32, 8), false);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_b0_cand_coded(32, 32, 32, 24), false);
+
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_b0_cand_coded(56, 32, 8, 32), false);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_b0_cand_coded(40, 32, 24, 32), false);

kvazaar-0.8.3.tar.gz/tests/sad_tests.c -> kvazaar-1.0.0.tar.gz/tests/sad_tests.c Changed

@@ -59,26 +59,60 @@
 
 static kvz_picture *g_pic = 0;
 static kvz_picture *g_ref = 0;
+static kvz_picture *g_big_pic = 0;
+static kvz_picture *g_big_ref = 0;
+static kvz_picture *g_64x64_zero = 0;
+static kvz_picture *g_64x64_max = 0;
+
+static struct sad_test_env_t {
+  int width;
+  int height;
+  void * tested_func;
+  const strategy_t * strategy;
+  char msg[255];
+} sad_test_env;
 
 //////////////////////////////////////////////////////////////////////////
 // SETUP, TEARDOWN AND HELPER FUNCTIONS
 static void setup_tests()
 {
-  g_pic = kvz_image_alloc(8, 8);
+  g_pic = kvz_image_alloc(KVZ_CSP_420, 8, 8);
   for (int i = 0; i < 64; ++i) {
     g_pic->y[i] = pic_data[i] + 48;
   }
 
-  g_ref = kvz_image_alloc(8, 8);
+  g_ref = kvz_image_alloc(KVZ_CSP_420, 8, 8);
   for (int i = 0; i < 64; ++i) {
     g_ref->y[i] = ref_data[i] + 48;
   }
+
+  g_big_pic = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  for (int i = 0; i < 64*64; ++i) {
+    g_big_pic->y[i] = (i*i / 32 + i) % 255;
+    //g_big_pic->y[i] = i % 255;
+  }
+
+  g_big_ref = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  for (int i = 0; i < 64 * 64; ++i) {
+    g_big_ref->y[i] = (i*i / 16 + i) % 255;
+    //g_big_ref->y[i] = (i / 2) % 255;
+  }
+
+  g_64x64_zero = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  memset(g_64x64_zero->y, 0, 64 * 64 * sizeof(kvz_pixel));
+  
+  g_64x64_max = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  memset(g_64x64_max->y, PIXEL_MAX, 64 * 64 * sizeof(kvz_pixel));
 }
 
 static void tear_down_tests()
 {
   kvz_image_free(g_pic);
   kvz_image_free(g_ref);
+  kvz_image_free(g_big_pic);
+  kvz_image_free(g_big_ref);
+  kvz_image_free(g_64x64_zero);
+  kvz_image_free(g_64x64_max);
 }
 
 
@@ -224,11 +258,66 @@
   PASS();
 }
 
+static unsigned simple_sad(const kvz_pixel* buf1, const kvz_pixel* buf2, unsigned stride,
+                           unsigned width, unsigned height)
+{
+  unsigned sum = 0;
+  for (unsigned y = 0; y < height; ++y) {
+    for (unsigned x = 0; x < width; ++x) {
+      sum += abs((int)buf1[y * stride + x] - (int)buf2[y * stride + x]);
+    }
+  }
+  return sum;
+}
 
-struct sad_test_env_t {
-  kvz_picture *g_pic;
-  kvz_picture *g_ref;
-};
+TEST test_reg_sad(void)
+{
+  unsigned width = sad_test_env.width;
+  unsigned height = sad_test_env.height;
+  unsigned stride = 64;
+
+  unsigned correct_result = simple_sad(g_big_pic->y, g_big_ref->y, stride, width, height);
+  
+  unsigned(*tested_func)(const kvz_pixel *, const kvz_pixel *, int, int, unsigned, unsigned) = sad_test_env.tested_func;
+  unsigned result = tested_func(g_big_pic->y, g_big_ref->y, width, height, stride, stride);
+  
+  sprintf(sad_test_env.msg, "%s(%ux%u):%s",
+          sad_test_env.strategy->type,
+          width,
+          height,
+          sad_test_env.strategy->strategy_name);
+
+  if (result != correct_result) {
+    FAILm(sad_test_env.msg);
+  }
+
+  PASSm(sad_test_env.msg);
+}
+
+
+TEST test_reg_sad_overflow(void)
+{
+  unsigned width = sad_test_env.width;
+  unsigned height = sad_test_env.height;
+  unsigned stride = 64;
+
+  unsigned correct_result = simple_sad(g_64x64_zero->y, g_64x64_max->y, stride, width, height);
+
+  unsigned(*tested_func)(const kvz_pixel *, const kvz_pixel *, int, int, unsigned, unsigned) = sad_test_env.tested_func;
+  unsigned result = tested_func(g_64x64_zero->y, g_64x64_max->y, width, height, stride, stride);
+
+  sprintf(sad_test_env.msg, "overflow %s(%ux%u):%s",
+    sad_test_env.strategy->type,
+    width,
+    height,
+    sad_test_env.strategy->strategy_name);
+
+  if (result != correct_result) {
+    FAILm(sad_test_env.msg);
+  }
+
+  PASSm(sad_test_env.msg);
+}
 
 
 //////////////////////////////////////////////////////////////////////////
@@ -272,6 +361,29 @@
     RUN_TEST(test_bottomleft_out);
     RUN_TEST(test_bottom_out);
     RUN_TEST(test_bottomright_out);
+
+    struct dimension {
+      int width;
+      int height;
+    };
+    static const struct dimension tested_dims[] = {
+      // Square motion partitions
+      {64, 64}, {32, 32}, {16, 16}, {8, 8},
+      // Symmetric motion partitions
+      {64, 32}, {32, 64}, {32, 16}, {16, 32}, {16, 8}, {8, 16}, {8, 4}, {4, 8},
+      // Asymmetric motion partitions
+      {48, 16}, {16, 48}, {24, 16}, {16, 24}, {12, 4}, {4, 12}
+    };
+
+    sad_test_env.tested_func = strategies.strategies[i].fptr;
+    sad_test_env.strategy = &strategies.strategies[i];
+    int num_dim_tests = sizeof(tested_dims) / sizeof(tested_dims[0]);
+    for (int dim_test = 0; dim_test < num_dim_tests; ++dim_test) {
+      sad_test_env.width = tested_dims[dim_test].width;
+      sad_test_env.height = tested_dims[dim_test].height;
+      RUN_TEST(test_reg_sad);
+      RUN_TEST(test_reg_sad_overflow);
+    }
   }
   
   tear_down_tests();

kvazaar-0.8.3.tar.gz/tests/speed_tests.c -> kvazaar-1.0.0.tar.gz/tests/speed_tests.c Changed

@@ -43,11 +43,18 @@
 static kvz_pixel * bufs[NUM_TESTS]; // SIMD aligned pointers.
 static kvz_pixel * actual_bufs[NUM_TESTS]; // pointers returned by malloc.
 
+#define WIDTH_4K 3840 
+#define HEIGHT_4K 2160
+
 static struct test_env_t {
-  int log_width; // for selecting dim from bufs
+  int width;
+  int height;
   void * tested_func;
   const strategy_t * strategy;
   char msg[1024];
+  
+  kvz_picture *inter_a;
+  kvz_picture *inter_b;
 } test_env;
 
 
@@ -83,6 +90,16 @@
       init_gradient(width - x, y, width, 255 / width, &bufs[test][chunk * 64*64]);
     }
   }
+
+  test_env.inter_a = kvz_image_alloc(KVZ_CSP_420, WIDTH_4K, HEIGHT_4K);
+  test_env.inter_b = kvz_image_alloc(KVZ_CSP_420, WIDTH_4K, HEIGHT_4K);
+  for (unsigned i = 0; i < WIDTH_4K * HEIGHT_4K; ++i) {
+    kvz_pixel pattern1 = ((i*i >> 10) % 255) >> 2;
+    kvz_pixel pattern2 = ((i*i >> 15) % 255) >> 2;
+    kvz_pixel gradient = (i >> 12) + i;
+    test_env.inter_a->y[i] = (pattern1 + gradient) % PIXEL_MAX;
+    test_env.inter_b->y[i] = (pattern2 + gradient) % PIXEL_MAX;
+  }
 }
 
 static void tear_down_tests()
@@ -90,6 +107,8 @@
   for (int test = 0; test < NUM_TESTS; ++test) {
     free(actual_bufs[test]);
   }
+  kvz_image_free(test_env.inter_a);
+  kvz_image_free(test_env.inter_b);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -126,55 +145,107 @@
     KVZ_GET_TIME(&clock_now)
   }
 
+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
   sprintf(test_env.msg, "%.3fM x %s:%s",
-    (double)call_cnt / 1000000.0,
+    (double)call_cnt / 1000000.0 / test_time,
     test_env.strategy->type,
     test_env.strategy->strategy_name);
   PASSm(test_env.msg);
 }
 
 
-TEST test_inter_speed(const int width)
+TEST test_intra_dual_speed(const int width)
 {
   const int size = width * width;
-  unsigned call_cnt = 0;
+  uint64_t call_cnt = 0;
   KVZ_CLOCK_T clock_now;
   KVZ_GET_TIME(&clock_now);
   double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;
 
   // Loop until time allocated for test has passed.
   for (unsigned i = 0;
-      test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
-      ++i)
+    test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
+    ++i)
   {
     int test = i % NUM_TESTS;
     uint64_t sum = 0;
     for (int offset = 0; offset < NUM_CHUNKS * 64 * 64; offset += NUM_CHUNKS * size) {
-      // Treat 4 consecutive chunks as one chunk with double width and height,
-      // and do a 8x8 grid search against the first chunk to simulate real usage.
+      // Compare the first chunk against the 35 other chunks to simulate real usage.
       kvz_pixel * buf1 = &bufs[test][offset];
-      for (int chunk = 0; chunk < NUM_CHUNKS; chunk += 4) {
-        kvz_pixel * buf2 = &bufs[test][chunk * size + offset];
-        for (int y = 0; y < 8; ++y) {
-          for (int x = 0; x < 8; ++x) {
-            const int stride1 = 2 * 64;
-            const int stride2 = 2 * 64;
-            reg_sad_func *tested_func = test_env.tested_func;
-            sum += tested_func(buf1, &buf2[y * stride2 + x], width, width, stride1, stride2);
-            ++call_cnt;
-          }
-        }
+      for (int chunk = 0; chunk < NUM_CHUNKS; chunk += 2) {
+        cost_pixel_nxn_multi_func *tested_func = test_env.tested_func;
+        const kvz_pixel *buf_pair[2] = { &bufs[test][chunk * size + offset], &bufs[test][(chunk + 1) * size + offset] };
+        unsigned costs[2] = { 0, 0 };
+        tested_func((pred_buffer)buf_pair, buf1, 2, costs);
+        sum += costs[0] + costs[1];
+        ++call_cnt;
       }
     }
+
     ASSERT(sum > 0);
     KVZ_GET_TIME(&clock_now)
   }
 
+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
+  sprintf(test_env.msg, "%.3fM x %s:%s",
+    (double)call_cnt / 1000000.0 / test_time,
+    test_env.strategy->type,
+    test_env.strategy->strategy_name);
+  PASSm(test_env.msg);
+}
+
+
+TEST test_inter_speed(const int width, const int height)
+{
+  unsigned call_cnt = 0;
+  KVZ_CLOCK_T clock_now;
+  KVZ_GET_TIME(&clock_now);
+  double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;
+
+  const vector2d_t dims_lcu = { WIDTH_4K / 64 - 2, HEIGHT_4K / 64 - 2 };
+  const int step = 3;
+  const int range = 2 * step;
+
+  // Loop until time allocated for test has passed.
+  for (uint64_t i = 0;
+      test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
+      ++i)
+  {
+    // Do a sparse full search on the first CU of every LCU.
+    
+    uint64_t sum = 0;
+
+    // Go through the non-edge LCUs in raster-like order (NOTE: y advances every dims_lcu.y steps, not dims_lcu.x).
+    const vector2d_t lcu = {
+      1 + i % dims_lcu.x,
+      1 + (i / dims_lcu.y) % dims_lcu.y,
+    };
+
+    vector2d_t mv;
+    for (mv.y = -range; mv.y <= range; mv.y += step) {
+      for (mv.x = -range; mv.x <= range; mv.x += step) {
+        reg_sad_func *tested_func = test_env.tested_func;
+
+        int lcu_index = lcu.y * 64 * WIDTH_4K + lcu.x * 64;
+        int mv_index = mv.y * WIDTH_4K + mv.x;
+        kvz_pixel *buf1 = &test_env.inter_a->y[lcu_index];
+        kvz_pixel *buf2 = &test_env.inter_a->y[lcu_index + mv_index];
+
+        sum += tested_func(buf1, buf2, width, height, WIDTH_4K, WIDTH_4K);
+        ++call_cnt;
+      }
+    }
+
+    ASSERT(sum > 0);
+    KVZ_GET_TIME(&clock_now)
+  }
+
+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
   sprintf(test_env.msg, "%.3fM x %s(%ix%i):%s",
-    (double)call_cnt / 1000000.0,
+    (double)call_cnt / 1000000.0 / test_time,
     test_env.strategy->type,
     width,
-    width,
+    height,
     test_env.strategy->strategy_name);
   PASSm(test_env.msg);
 }
@@ -221,8 +292,9 @@
     KVZ_GET_TIME(&clock_now)
   }
   
+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
   sprintf(test_env.msg, "%.3fM x %s:%s",
-    (double)call_cnt / 1000000.0,
+    (double)call_cnt / 1000000.0 / test_time,
     test_env.strategy->type,
     test_env.strategy->strategy_name);
   PASSm(test_env.msg);
@@ -231,36 +303,43 @@
 
 TEST intra_sad(void)
 {
-  const int width = 1 << test_env.log_width;
-  return test_intra_speed(width);
+  return test_intra_speed(test_env.width);
+}
+
+
+TEST intra_sad_dual(void)
+{
+  return test_intra_dual_speed(test_env.width);
 }

kvazaar-1.0.0.tar.gz/tools/genmanpage.sh Added

kvazaar-1.0.0.tar.gz/tools/update_readme.sh Added

@@ -0,0 +1,37 @@
+#!/bin/sh
+# This file is part of Kvazaar HEVC encoder.
+#
+# Copyright (C) 2013-2016 Tampere University of Technology and others (see
+# COPYING file).
+#
+# Kvazaar is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License version 2.1 as
+# published by the Free Software Foundation.
+#
+# Kvazaar is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+
+# This script updates the parameter documentation in the ../README.md file.
+
+LANG=C
+set -e
+
+cd "$(dirname "$0")"
+
+tmpfile="$(mktemp)"
+readme_file="../README.md"
+
+{
+    sed '/BEGIN KVAZAAR HELP MESSAGE/q' -- "$readme_file";
+    printf '```\n';
+    ../src/kvazaar --help;
+    printf '```\n';
+    sed -n '/END KVAZAAR HELP MESSAGE/{:a;p;n;ba}' -- "$readme_file";
+} >> "$tmpfile"
+
+mv -- "$tmpfile" "../README.md"

kvazaar-1.0.0.tar.gz/tools/version.sh Added