kvazaar
We truncated the diff of some files because they were too big.
Changes of Revision 14
kvazaar.changes
Changed
@@ -1,4 +1,47 @@
 -------------------------------------------------------------------
+Tue Jul 9 20:15:25 UTC 2019 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 1.3.0
+  Features:
+  * Add release notes like this (#159, cf85d52)
+  * Changed --rd=2 to use SSD metric for CU mode decision
+    (662430d)
+  * Changed inter search to check the cost of flushing residual
+    to zero (75a8700)
+  * Changed rectangular and asymmetric blocks to use a transform
+    split (774c666)
+  * Added diamond search ME algorithm (4e13608)
+  * Enabled low delay B GOP structure with --bipred
+    --gop=lp-g4d3t1 (7155dd0)
+  * Added termination of intra search at zero residual with
+    --intra-rdo-et (4fb1c16)
+  Optimization:
+  * Made TZ search faster and slightly better (c136044)
+  * Optimized bi-prediction (69756e2)
+  Fixes:
+  * Fixed transform skip with rectangular inter blocks (fb462b2)
+  * Fixed accidental inter search for 4x4 blocks (649113a)
+  User Interface:
+  * Changed options for all preset levels (f033ad0)
+  * Added an option for limiting the number of steps in motion
+    estimation with --me-steps (39ed368)
+  * Added --me=dia (4e13608)
+  * Added --level, --force-level and --high-tier for setting
+    bitstream level and tier (bac0745)
+  Building:
+  * Fixed issue with struct timespec redefinition with Visual
+    Studio 2015 and later (713e694)
+  * Fixed building .asm files in Visual Studio 2017 (6be8195)
+  * Fixed compatibility with crypto++ 6.0 (4b24cd0)
+  * Added support for crypto++ with the name libcryptopp
+    (411276d)
+  * Dockerfile base image was updated to Ubuntu 18.04 (8380b6c)
+  * Enabled -Wextra by default (ff17e0b)
+  Refactoring:
+  * Inter motion vector cost functions (c73cce3)
+  * Dockerfile (0164291)
+
+-------------------------------------------------------------------
 Fri Nov 17 14:01:40 UTC 2017 - aloisio@gmx.com
 
 - Update to version 1.2.0
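Several of the new 1.3.0 options listed in this changelog can be combined on one command line. A purely illustrative invocation (the input file name and resolution are made up; the options themselves are the ones named above) might look like:

```shell
# Hypothetical input; --input-res is required for raw YUV input.
kvazaar -i input_1280x720.yuv --input-res 1280x720 \
        --me dia --me-steps 32 \
        --bipred --gop lp-g4d3t1 \
        --level 4.1 --high-tier \
        -o output.265
```

Here `--me dia` selects the new diamond search, `--me-steps` caps motion estimation steps, `--bipred --gop lp-g4d3t1` enables the new low-delay B GOP structure, and `--level`/`--high-tier` set the bitstream level and tier (high tier requires level 4 or higher).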
kvazaar.spec
Changed
@@ -1,8 +1,8 @@
 #
 # spec file for package kvazaar
 #
+# Copyright (c) 2019 Packman Team <packman@links2linux.de>
 # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany.
-# Copyright (c) 2017 Packman Team <packman@links2linux.de>
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -13,19 +13,19 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.
-# Please submit bugfixes or comments via http://bugs.opensuse.org/
+# Please submit bugfixes or comments via https://bugs.links2linux.org/
 #
 %define libname libkvazaar
 %define libmver 4
 Name:           kvazaar
-Version:        1.2.0
+Version:        1.3.0
 Release:        0
 Summary:        HEVC encoder
-License:        LGPL-2.1
+License:        LGPL-2.1-or-later
 Group:          Productivity/Multimedia/Video/Editors and Convertors
-Url:            http://ultravideo.cs.tut.fi/#encoder
+URL:            http://ultravideo.cs.tut.fi/#encoder
 Source0:        https://github.com/ultravideo/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz
 Patch0:         kvazaar.memset.patch
 BuildRequires:  automake
@@ -33,7 +33,6 @@
 BuildRequires:  gcc >= 4.4
 BuildRequires:  gcc-c++
 BuildRequires:  libtool
-BuildRequires:  make
 BuildRequires:  pkgconfig
 Requires:       %{libname}%{libmver} = %{version}
 %ifnarch %{arm}
@@ -66,34 +65,32 @@
 autoreconf -fvi
 %configure \
   --disable-static \
-  --disable-silent-rules
+  --disable-silent-rules \
+  --docdir=%{_defaultdocdir}/%{name}
 make %{?_smp_mflags}
 
 %install
 %make_install
 find %{buildroot} -type f -name "*.la" -delete -print
+rm %{buildroot}%{_defaultdocdir}/%{name}/COPYING
 
 %post -n %{libname}%{libmver} -p /sbin/ldconfig
 %postun -n %{libname}%{libmver} -p /sbin/ldconfig
 
 %files
-%defattr(-,root,root)
-%dir %{_datadir}/doc/%{name}
-%doc %{_datadir}/doc/%{name}/COPYING
-%doc %{_datadir}/doc/%{name}/CREDITS
-%doc %{_datadir}/doc/%{name}/README.md
-%{_bindir}/kvazaar
-%{_mandir}/man1/kvazaar.1%{ext_man}
+%license COPYING
+%doc CREDITS README.md
+%{_bindir}/%{name}
+%{_mandir}/man1/%{name}.1%{ext_man}
 
 %files -n %{libname}%{libmver}
-%defattr(-,root,root)
-%doc COPYING CREDITS README.md
+%license COPYING
+%doc CREDITS README.md
 %{_libdir}/%{libname}.so.%{libmver}*
 
 %files -n %{libname}-devel
-%defattr(-,root,root)
-%{_includedir}/kvazaar.h
+%{_includedir}/%{name}.h
 %{_libdir}/%{libname}.so
-%{_libdir}/pkgconfig/kvazaar.pc
+%{_libdir}/pkgconfig/%{name}.pc
 
 %changelog
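The spec diff switches the license text from `%doc` to `%license`. In modern RPM, `%license` installs the file under the license directory and keeps it even when documentation is excluded, while `%doc` entries may be dropped by `--nodocs` installs; `%defattr(-,root,root)` is also no longer needed because it is the default. A minimal `%files` sketch following this convention:

```
%files
%license COPYING
%doc CREDITS README.md
%{_bindir}/%{name}
```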
kvazaar-1.2.0.tar.gz/build/kvazaar_VS2013.sln
Deleted
@@ -1,55 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2013
-VisualStudioVersion = 12.0.30723.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{50AB7A17-4885-4D20-BF01-376DE4417FCD}"
-  ProjectSection(SolutionItems) = preProject
-    kvazaar_VS2010.vsd = kvazaar_VS2010.vsd
-    kvazaar_VS2010.vsmdi = kvazaar_VS2010.vsmdi
-    Local.testsettings = Local.testsettings
-    TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings
-  EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_tests", "kvazaar_tests\kvazaar_tests.vcxproj", "{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\kvazaar_cli.vcxproj", "{C755308D-9B3E-4712-99AB-7F6F4E2DA567}"
-  ProjectSection(ProjectDependencies) = postProject
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} = {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}
-  EndProjectSection
-EndProject
-Global
-  GlobalSection(SolutionConfigurationPlatforms) = preSolution
-    Debug|Win32 = Debug|Win32
-    Debug|x64 = Debug|x64
-    Release|Win32 = Release|Win32
-    Release|x64 = Release|x64
-  EndGlobalSection
-  GlobalSection(ProjectConfigurationPlatforms) = postSolution
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.ActiveCfg = Debug|Win32
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.Build.0 = Debug|Win32
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.ActiveCfg = Debug|x64
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.Build.0 = Debug|x64
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.ActiveCfg = Release|Win32
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.Build.0 = Release|Win32
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.ActiveCfg = Release|x64
-    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.Build.0 = Release|x64
-    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|Win32.ActiveCfg = Debug|Win32
-    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|x64.ActiveCfg = Debug|x64
-    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|Win32.ActiveCfg = Release|Win32
-    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|x64.ActiveCfg = Release|x64
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.ActiveCfg = Debug|Win32
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.Build.0 = Debug|Win32
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.ActiveCfg = Debug|x64
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.Build.0 = Debug|x64
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.ActiveCfg = Release|Win32
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.Build.0 = Release|Win32
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.ActiveCfg = Release|x64
-    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.Build.0 = Release|x64
-  EndGlobalSection
-  GlobalSection(SolutionProperties) = preSolution
-    HideSolutionNode = FALSE
-  EndGlobalSection
-EndGlobal
kvazaar-1.2.0.tar.gz/.gitignore -> kvazaar-1.3.0.tar.gz/.gitignore
Changed
@@ -42,6 +42,7 @@
 *.lo
 *.o
 *.trs
+.*.swp
 *.log
 
 .kdev4
kvazaar-1.3.0.tar.gz/.gitlab-ci.yml
Added
@@ -0,0 +1,47 @@
+# Use Kvazaar CI base image which includes the build tools and ffmpeg + hmdec in ${HOME}/bin
+image: ultravideo/kvazaar_ci_base:latest
+
+# Build and test kvazaar
+test-kvazaar: &test-template
+  stage: test
+  script:
+    - export PATH="${HOME}/bin:${PATH}"
+    - ./autogen.sh
+    - ./configure --enable-werror || (cat config.log && false)
+    - make --jobs=8
+    - make check --jobs=8 VERBOSE=1
+  artifacts:
+    paths:
+      - src/kvazaar
+      - src/.libs
+    expire_in: 1 week
+
+test-asan:
+  <<: *test-template
+  variables:
+    CFLAGS: '-fsanitize=address'
+    # LeakSanitizer doesn't work inside the container because it requires
+    # ptrace so we disable it.
+    ASAN_OPTIONS: 'detect_leaks=0'
+    # AddressSanitizer adds some extra symbols so we expect a failure from
+    # the external symbols test.
+    XFAIL_TESTS: test_external_symbols.sh
+
+test-tsan:
+  <<: *test-template
+  variables:
+    CFLAGS: '-fsanitize=thread'
+
+test-ubsan:
+  <<: *test-template
+  variables:
+    CFLAGS: '-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment'
+
+test-valgrind:
+  <<: *test-template
+  variables:
+    KVAZAAR_OVERRIDE_angular_pred: generic
+    KVAZAAR_OVERRIDE_sao_band_ddistortion: generic
+    KVAZAAR_OVERRIDE_sao_edge_ddistortion: generic
+    KVAZAAR_OVERRIDE_calc_sao_edge_dir: generic
+    KVZ_TEST_VALGRIND: 1
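The CI script above runs `./configure --enable-werror || (cat config.log && false)`. The point of the trailing `&& false` is that a successful `cat` would otherwise mask the configure failure and let the job continue. A self-contained sketch of the idiom, using a stand-in failing command and a fabricated `config.log`:

```shell
# Demonstrate the "dump the log on failure" idiom: if the command fails,
# print its log for debugging, then still report failure via 'false'.
echo "configure: error: no acceptable C compiler found" > config.log
if sh -c 'exit 1' || (cat config.log && false); then
  result=passed
else
  result=failed
fi
rm -f config.log
```

Without the `&& false`, the `if` branch would take the `passed` path because `cat` exits 0.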
kvazaar-1.2.0.tar.gz/.travis.yml -> kvazaar-1.3.0.tar.gz/.travis.yml
Changed
@@ -19,7 +19,16 @@
   include:
     - compiler: clang
+      env: KVZ_TEST_VALGRIND=1
+
+    - compiler: clang
+      env: CFLAGS='-fsanitize=thread'
+
+    - compiler: clang
+      env: CFLAGS='-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment'
+
     - compiler: gcc-4.8
+      env: CFLAGS='-fsanitize=address'
 
     # We have some Mac specific code and Mac sometimes has odd build issues.
     - os: osx
@@ -27,14 +36,15 @@
       install: true
       script:
         - ./autogen.sh
-        - ./configure --enable-werror
+        - ./configure --enable-werror || (cat config.log && false)
         - make --jobs=2 V=1
+        - make check TESTS=kvazaar_tests
 
 install: bash .travis-install.bash
 
 script:
   - ./autogen.sh
-  - ./configure --enable-werror
+  - ./configure --enable-werror || (cat config.log && false)
  - make --jobs=2 V=1
  - make check VERBOSE=1
kvazaar-1.2.0.tar.gz/Dockerfile -> kvazaar-1.3.0.tar.gz/Dockerfile
Changed
@@ -9,34 +9,35 @@
 #
 # RESOLUTION=`avconv -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
 # avconv -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
-# or
+# or
 # RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
 # ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
 #
-# Use Ubuntu 15.10 as a base for now, it's around 136MB
-FROM ubuntu:15.10
+# Use Ubuntu 18.04 as a base for now, it's around 88MB
+FROM ubuntu:18.04
 MAINTAINER Marko Viitanen <fador@iki.fi>
 
- # List of needed packages to be able to build kvazaar with autotools
- ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf
-
- # Run all the commands in one RUN so we don't have any extra history
- # data in the image.
- RUN apt-get update \
+# List of needed packages to be able to build kvazaar with autotools
+ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf
+
+ADD . kvazaar
+# Run all the commands in one RUN so we don't have any extra history
+# data in the image.
+RUN apt-get update \
    && apt-get install -y $REQUIRED_PACKAGES \
    && apt-get clean \
-   && git clone --depth=1 git://github.com/ultravideo/kvazaar.git; \
-   cd kvazaar; \
-   ./autogen.sh; \
-   ./configure --disable-shared;\
-   make;\
-   make install; \
-   AUTOINSTALLED_PACKAGES=`apt-mark showauto`; \
-   apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES; \
-   apt-get clean autoclean; \
-   apt-get autoremove -y; \
-   rm -rf /var/lib/{apt,dpkg,cache,log}/
+   && cd kvazaar \
+   && ./autogen.sh \
+   && ./configure --disable-shared \
+   && make\
+   && make install \
+   && AUTOINSTALLED_PACKAGES=`apt-mark showauto` \
+   && apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES \
+   && apt-get clean autoclean \
+   && apt-get autoremove -y \
+   && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
 ENTRYPOINT ["kvazaar"]
 CMD ["--help"]
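The Dockerfile chains every build and cleanup step into a single `RUN` so the removed build dependencies never persist in an intermediate image layer; the `&&` chaining (replacing the old `;` separators) also makes the build abort on the first failing step. A simplified sketch of the pattern (package list and paths abbreviated from the chain above):

```dockerfile
# One RUN layer: install build deps, build, then purge the deps, so the
# final layer keeps only the installed binary. '&&' aborts on first failure.
RUN apt-get update \
 && apt-get install -y $REQUIRED_PACKAGES \
 && ./configure --disable-shared \
 && make && make install \
 && apt-get remove --purge -y $REQUIRED_PACKAGES \
 && rm -rf /var/lib/{apt,dpkg,cache,log}/
```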
kvazaar-1.2.0.tar.gz/README.md -> kvazaar-1.3.0.tar.gz/README.md
Changed
@@ -11,6 +11,29 @@
 - Linux/Mac [![Build Status](https://travis-ci.org/ultravideo/kvazaar.svg?branch=master)](https://travis-ci.org/ultravideo/kvazaar)
 - Windows [![Build status](https://ci.appveyor.com/api/projects/status/88sg1h25lp0k71pu?svg=true)](https://ci.appveyor.com/project/Ultravideo/kvazaar)
 
+## Table of Contents
+
+- [Using Kvazaar](#using-kvazaar)
+  - [Example:](#example)
+  - [Parameters](#parameters)
+  - [LP-GOP syntax](#lp-gop-syntax)
+- [Presets](#presets)
+- [Kvazaar library](#kvazaar-library)
+- [Compiling Kvazaar](#compiling-kvazaar)
+  - [Required libraries](#required-libraries)
+  - [Autotools](#autotools)
+  - [OS X](#os-x)
+  - [Visual Studio](#visual-studio)
+  - [Docker](#docker)
+  - [Visualization (Windows only)](#visualization-windows-only)
+- [Paper](#paper)
+- [Contributing to Kvazaar](#contributing-to-kvazaar)
+  - [Code documentation](#code-documentation)
+  - [For version control we try to follow these conventions:](#for-version-control-we-try-to-follow-these-conventions)
+  - [Testing](#testing)
+    - [Unit tests](#unit-tests)
+  - [Code style](#code-style)
+
 ## Using Kvazaar
 
 ### Example:
@@ -31,14 +54,14 @@
     kvazaar -i <input> --input-res <width>x<height> -o <output>
 
 Required:
-  -i, --input                : Input file
+  -i, --input <filename>     : Input file
       --input-res <res>      : Input resolution [auto]
-                                 - auto: detect from file name
-                                 - <int>x<int>: width times height
-  -o, --output               : Output file
+                                 - auto: Detect from file name.
+                                 - <int>x<int>: width times height
+  -o, --output <filename>    : Output file
 
 Presets:
-      --preset=<preset>      : Set options to a preset [medium]
+      --preset <preset>      : Set options to a preset [medium]
                                  - ultrafast, superfast, veryfast, faster,
                                    fast, medium, slow, slower, veryslow
                                    placebo
@@ -46,144 +69,190 @@
 Input:
   -n, --frames <integer>     : Number of frames to code [all]
       --seek <integer>       : First frame to code [0]
-      --input-fps <num>/<denom> : Framerate of the input video [25.0]
-      --source-scan-type <string> : Set source scan type [progressive].
-                                 - progressive: progressive scan
-                                 - tff: top field first
-                                 - bff: bottom field first
-      --input-format         : P420 or P400
-      --input-bitdepth       : 8-16
-      --loop-input           : Re-read input file forever
+      --input-fps <num>[/<denom>] : Frame rate of the input video [25]
+      --source-scan-type <string> : Source scan type [progressive]
+                                 - progressive: Progressive scan
+                                 - tff: Top field first
+                                 - bff: Bottom field first
+      --input-format <string> : P420 or P400 [P420]
+      --input-bitdepth <int> : 8-16 [8]
+      --loop-input           : Re-read input file forever.
 
 Options:
-      --help                 : Print this help message and exit
-      --version              : Print version information and exit
-      --aud                  : Use access unit delimiters
-      --debug <string>       : Output encoders reconstruction.
-      --cpuid <integer>      : Disable runtime cpu optimizations with value 0.
-      --hash                 : Decoded picture hash [checksum]
+      --help                 : Print this help message and exit.
+      --version              : Print version information and exit.
+      --(no-)aud             : Use access unit delimiters. [disabled]
+      --debug <filename>     : Output internal reconstruction.
+      --(no-)cpuid           : Enable runtime CPU optimizations. [enabled]
+      --hash <string>        : Decoded picture hash [checksum]
                                  - none: 0 bytes
                                  - checksum: 18 bytes
                                  - md5: 56 bytes
-      --no-psnr              : Don't calculate PSNR for frames
-      --no-info              : Don't add encoder info SEI.
+      --(no-)psnr            : Calculate PSNR for frames. [enabled]
+      --(no-)info            : Add encoder info SEI. [enabled]
+      --crypto <string>      : Selective encryption. Crypto support must be
+                               enabled at compile-time. Can be 'on' or 'off' or
+                               a list of features separated with a '+'. [off]
+                                 - on: Enable all encryption features.
+                                 - off: Disable selective encryption.
+                                 - mvs: Motion vector magnitudes.
+                                 - mv_signs: Motion vector signs.
+                                 - trans_coeffs: Coefficient magnitudes.
+                                 - trans_coeff_signs: Coefficient signs.
+                                 - intra_pred_modes: Intra prediction modes.
+      --key <string>         : Encryption key [16,213,27,56,255,127,242,112,
+                               97,126,197,204,25,59,38,30]
 
 Video structure:
-  -q, --qp <integer>         : Quantization Parameter [32]
-  -p, --period <integer>     : Period of intra pictures [0]
-                                 - 0: only first picture is intra
-                                 - 1: all pictures are intra
-                                 - 2-N: every Nth picture is intra
-      --vps-period <integer> : Specify how often the video parameter set is
-                               re-sent. [0]
-                                 - 0: only send VPS with the first frame
-                                 - N: send VPS with every Nth intra frame
-  -r, --ref <integer>        : Reference frames, range 1..15 [3]
-      --gop <string>         : Definition of GOP structure [0]
-                                 - 0: disabled
+  -q, --qp <integer>         : Quantization parameter [22]
+  -p, --period <integer>     : Period of intra pictures [64]
+                                 - 0: Only first picture is intra.
+                                 - 1: All pictures are intra.
+                                 - N: Every Nth picture is intra.
+      --vps-period <integer> : How often the video parameter set is re-sent [0]
+                                 - 0: Only send VPS with the first frame.
+                                 - N: Send VPS with every Nth intra frame.
+  -r, --ref <integer>        : Number of reference frames, in range 1..15 [4]
+      --gop <string>         : GOP structure [8]
+                                 - 0: Disabled
                                  - 8: B-frame pyramid of length 8
-                                 - lp-<string>: lp-gop definition (e.g. lp-g8d4t2, see README)
-      --cqmfile <string>     : Custom Quantization Matrices from a file
-      --bitrate <integer>    : Target bitrate. [0]
-                                 - 0: disable rate-control
-                                 - N: target N bits per second
-      --lossless             : Use lossless coding
-      --mv-constraint        : Constrain movement vectors
-                                 - none: no constraint
-                                 - frametile: constrain within the tile
-                                 - frametilemargin: constrain even more
-      --roi <string>         : Use a delta QP map for region of interest
-                               Read an array of delta QP values from
-                               a file, where the first two values are the
-                               width and height, followed by width*height
-                               delta QP values in raster order.
-                               The delta QP map can be any size or aspect
-                               ratio, and will be mapped to LCU's.
-      --(no-)erp-aqp         : Use adaptive QP for 360 video with
-                               equirectangular projection
+                                 - lp-<string>: Low-delay P-frame GOP
+                                   (e.g. lp-g8d4t2, see README)
+      --(no-)open-gop        : Use open GOP configuration. [enabled]
+      --cqmfile <filename>   : Read custom quantization matrices from a file.
+      --scaling-list <string>: Set scaling list mode. [off]
+                                 - off: Disable scaling lists.
+                                 - custom: use custom list (with --cqmfile).
+                                 - default: Use default lists.
+      --bitrate <integer>    : Target bitrate [0]
+                                 - 0: Disable rate control.
+                                 - N: Target N bits per second.
+      --(no-)lossless        : Use lossless coding. [disabled]
+      --mv-constraint <string> : Constrain movement vectors. [none]
+                                 - none: No constraint
+                                 - frametile: Constrain within the tile.
+                                 - frametilemargin: Constrain even more.
+      --roi <filename>       : Use a delta QP map for region of interest.
+                               Reads an array of delta QP values from a text
+                               file. The file format is: width and height of
+                               the QP delta map followed by width*height delta
+                               QP values in raster order. The map can be of any
+                               size and will be scaled to the video size.
+      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.
+                               in PPS and slice_qp_delta in slize header zero.
+      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with
+                               equirectangular projection. [disabled]
+      --level <number>       : Use the given HEVC level in the output and give
+                               an error if level limits are exceeded. [6.2]
+                                 - 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6,
+                                   6.1, 6.2
+      --force-level <number> : Same as --level but warnings instead of errors.
+      --high-tier            : Used with --level. Use high tier bitrate limits
+                               instead of the main tier limits during encoding.
+                               High tier requires level 4 or higher.
 
 Compression tools:
-      --deblock [<beta:tc>]  : Deblocking
-                                 - beta: between -6 and 6
-                                 - tc: between -6 and 6
-      --(no-)sao             : Sample Adaptive Offset
-      --(no-)rdoq            : Rate-Distortion Optimized Quantization
-      --(no-)signhide        : Sign Hiding
-      --(no-)smp             : Symmetric Motion Partition
-      --(no-)amp             : Asymmetric Motion Partition
-      --rd <integer>         : Intra mode search complexity
-                                 - 0: skip intra if inter is good enough
-                                 - 1: rough intra mode search with SATD
-                                 - 2: refine intra mode search with SSE
-      --(no-)mv-rdo          : Rate-Distortion Optimized motion vector costs
-      --(no-)full-intra-search
-                             : Try all intra modes during rough search.
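The new `--roi` help text above describes a plain-text map format: the first two values are the width and height of the delta-QP map, followed by width*height delta values in raster order, scaled to the video size. A minimal made-up example for a 2x2 map that spends more bits on the left half of the frame (negative delta lowers QP, i.e. higher quality) could look like:

```
2 2
-5 0
-5 0
```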
kvazaar-1.2.0.tar.gz/appveyor.yml -> kvazaar-1.3.0.tar.gz/appveyor.yml
Changed
@@ -1,28 +1,85 @@
+# Only the whitelisted branches get built, regardless of build config
 branches:
   only:
     - master
-    - appveyor
 
+# Email the author if their commit either failed to build or fixed a failed build
+# good -> bad, bad -> bad, bad -> good but not good -> good
+notifications:
+  - provider: Email
+    to:
+      - '{{commitAuthorEmail}}'
+    on_build_success: false
+    on_build_failure: true
+    on_build_status_changed: true
+
+# Skip commits that don't affect the code / compiling the code
+skip_commits:
+  files:
+    - .gitignore
+    - .gitlab-ci.yml
+    - .travis-install.bash
+    - .travis.yml
+    - COPYING
+    - CREDITS
+    - README.md
+    - docs.doxy
+
+# Download only a zip file of the latest commit
+# Downloading the whole history of the repository would be unnecessary
+shallow_clone: true
+
+# Only try building the app, don't run any tests
+test: off
+
+# Don't bother with debug builds
+configuration:
+  - Release
+
+# Build with multiple compilers / build suites
+image: Visual Studio 2015
 environment:
   matrix:
-    - MSYSTEM: MINGW64
+    - platform: Win32
+    - platform: x64
     - MSYSTEM: MINGW32
+    - MSYSTEM: MINGW64
 
-shallow_clone: true
-test: off
+for:
+-
+  # MinGW builds need all kinds of build scripts
+  matrix:
+    only:
+      - MSYSTEM: MINGW32
+      - MSYSTEM: MINGW64
+
+  install:
+    # Update core packages
+    - C:\msys64\usr\bin\pacman -Syyuu --noconfirm --noprogressbar
+    # Update non-core packages
+    - C:\msys64\usr\bin\pacman -Suu --noconfirm --noprogressbar
+    # Install required MSYS2 packages
+    - C:\msys64\usr\bin\pacman -S --noconfirm --noprogressbar --needed automake-wrapper make
+    # Now MSYS2 is up to date, do the rest of the install from a bash script
+    - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-install.sh"
+
+  build_script:
+    - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-build.sh"
+
+  cache:
+    - C:\msys64\var\cache\pacman\pkg
+-
+  # MSVC builds only need vsyasm and the solution file
+  matrix:
+    except:
+      - MSYSTEM: MINGW32
+      - MSYSTEM: MINGW64
+
+  install:
+    - ps: $url = "http://ultravideo.cs.tut.fi/vsyasm.exe"
+    - ps: $output = "C:\Tools\vsyasm.exe"
+    - ps: "(New-Object System.Net.WebClient).DownloadFile($url, $output)"
+    - ps: '$env:Path += ";$output\.."'
 
-install:
-  # Update core packages
-  - C:\msys64\usr\bin\pacman -Syyuu --noconfirm --noprogressbar
-  # Update non-core packages
-  - C:\msys64\usr\bin\pacman -Suu --noconfirm --noprogressbar
-  # Install required MSYS2 packages
-  - C:\msys64\usr\bin\pacman -S --noconfirm --noprogressbar --needed automake-wrapper make
-  # Now MSYS2 is up to date, do the rest of the install from a bash script
-  - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-install.sh"
-
-build_script:
-  - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-build.sh"
-
-cache:
-  - C:\msys64\var\cache\pacman\pkg
+  build:
+    project: .\build\kvazaar_VS2015.sln
kvazaar-1.2.0.tar.gz/autogen.sh -> kvazaar-1.3.0.tar.gz/autogen.sh
Changed
@@ -1,5 +1,4 @@
 #!/bin/sh
-git submodule init
-git submodule update
+git submodule update --init --depth 1
 autoreconf -if
kvazaar-1.2.0.tar.gz/build/C_Properties.props -> kvazaar-1.3.0.tar.gz/build/C_Properties.props
Changed
@@ -13,7 +13,7 @@
     <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
     <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
     <PreprocessorDefinitions>KVZ_DLL_EXPORTS;KVZ_COMPILE_ASM;WIN32_LEAN_AND_MEAN;WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-    <AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    <AdditionalIncludeDirectories>$(SolutionDir)..\src\threadwrapper\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     <DisableSpecificWarnings>4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459;4706;4214;4127;4201</DisableSpecificWarnings>
     <OpenMPSupport>false</OpenMPSupport>
     <TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700;4020;4021;4133</TreatSpecificWarningsAsErrors>
kvazaar-1.3.0.tar.gz/build/kvazaar_VS2015.sln
Added
@@ -0,0 +1,55 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 14
+VisualStudioVersion = 12.0.30723.0
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{50AB7A17-4885-4D20-BF01-376DE4417FCD}"
+  ProjectSection(SolutionItems) = preProject
+    kvazaar_VS2010.vsd = kvazaar_VS2010.vsd
+    kvazaar_VS2010.vsmdi = kvazaar_VS2010.vsmdi
+    Local.testsettings = Local.testsettings
+    TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings
+  EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_tests", "kvazaar_tests\kvazaar_tests.vcxproj", "{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\kvazaar_cli.vcxproj", "{C755308D-9B3E-4712-99AB-7F6F4E2DA567}"
+  ProjectSection(ProjectDependencies) = postProject
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} = {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}
+  EndProjectSection
+EndProject
+Global
+  GlobalSection(SolutionConfigurationPlatforms) = preSolution
+    Debug|Win32 = Debug|Win32
+    Debug|x64 = Debug|x64
+    Release|Win32 = Release|Win32
+    Release|x64 = Release|x64
+  EndGlobalSection
+  GlobalSection(ProjectConfigurationPlatforms) = postSolution
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.ActiveCfg = Debug|Win32
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.Build.0 = Debug|Win32
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.ActiveCfg = Debug|x64
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.Build.0 = Debug|x64
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.ActiveCfg = Release|Win32
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.Build.0 = Release|Win32
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.ActiveCfg = Release|x64
+    {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.Build.0 = Release|x64
+    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|Win32.ActiveCfg = Debug|Win32
+    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|x64.ActiveCfg = Debug|x64
+    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|Win32.ActiveCfg = Release|Win32
+    {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|x64.ActiveCfg = Release|x64
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.ActiveCfg = Debug|Win32
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.Build.0 = Debug|Win32
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.ActiveCfg = Debug|x64
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.Build.0 = Debug|x64
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.ActiveCfg = Release|Win32
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.Build.0 = Release|Win32
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.ActiveCfg = Release|x64
+    {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.Build.0 = Release|x64
+  EndGlobalSection
+  GlobalSection(SolutionProperties) = preSolution
+    HideSolutionNode = FALSE
+  EndGlobalSection
+EndGlobal
kvazaar-1.2.0.tar.gz/build/kvazaar_cli/kvazaar_cli.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_cli/kvazaar_cli.vcxproj
Changed
@@ -22,23 +22,24 @@
     <ProjectGuid>{C755308D-9B3E-4712-99AB-7F6F4E2DA567}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>kvazaar_cli</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
kvazaar-1.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -22,27 +22,28 @@
     <ProjectGuid>{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>kvazaar_lib</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
@@ -78,20 +79,26 @@
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <YASM />
     <Lib>
-      <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories>
-      <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+      <AdditionalDependencies>
+      </AdditionalDependencies>
     </Lib>
     <YASM>
       <Defines>ARCH_X86_64=1;%(Defines)</Defines>
+      <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
     </YASM>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <YASM>
       <Defines>ARCH_X86_64=0;PREFIX</Defines>
+      <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
     </YASM>
     <Lib>
-      <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories>
-      <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+      <AdditionalDependencies>
+      </AdditionalDependencies>
     </Lib>
     <ClCompile>
       <UndefinePreprocessorDefinitions>
@@ -101,10 +108,13 @@
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <YASM>
      <Defines>ARCH_X86_64=0;PREFIX</Defines>
+      <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
     </YASM>
     <Lib>
-      <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories>
-      <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+      <AdditionalDependencies>
+      </AdditionalDependencies>
     </Lib>
     <ClCompile>
       <UndefinePreprocessorDefinitions>
@@ -114,10 +124,13 @@
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <YASM>
       <Defines>ARCH_X86_64=1;%(Defines)</Defines>
+      <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
     </YASM>
     <Lib>
-      <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories>
-      <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+      <AdditionalDependencies>
+      </AdditionalDependencies>
     </Lib>
     <ClCompile>
       <UndefinePreprocessorDefinitions>
@@ -154,6 +167,12 @@
     <ClCompile Include="..\..\src\search.c" />
     <ClCompile Include="..\..\src\search_inter.c" />
     <ClCompile Include="..\..\src\search_intra.c" />
+    <ClCompile Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
     <ClCompile Include="..\..\src\strategies\avx2\intra-avx2.c">
      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
@@ -172,9 +191,11 @@
       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
     </ClCompile>
+    <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c" />
     <ClCompile Include="..\..\src\strategies\generic\intra-generic.c" />
     <ClCompile Include="..\..\src\strategies\generic\quant-generic.c" />
     <ClCompile Include="..\..\src\strategies\generic\sao-generic.c" />
+    <ClCompile Include="..\..\src\strategies\strategies-encode.c" />
     <ClCompile Include="..\..\src\strategies\strategies-intra.c" />
     <ClCompile Include="..\..\src\strategies\strategies-quant.c" />
     <ClInclude Include="..\..\src\checkpoint.h" />
@@ -214,6 +235,18 @@
     <ClCompile Include="..\..\src\strategies\strategies-picture.c" />
     <ClCompile Include="..\..\src\strategies\strategies-sao.c" />
     <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" />
+    <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp">
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs>
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs>
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompileAsCpp</CompileAs>
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompileAsCpp</CompileAs>
+    </ClCompile>
+    <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp">
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs>
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs>
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompileAsCpp</CompileAs>
+      <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompileAsCpp</CompileAs>
+    </ClCompile>
     <ClCompile Include="..\..\src\videoframe.c" />
     <ClInclude Include="..\..\src\encoder_state-bitstream.h" />
     <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h" />
@@ -228,13 +261,19 @@
     <ClInclude Include="..\..\src\kvz_math.h" />
     <ClInclude Include="..\..\src\search_inter.h" />
     <ClInclude Include="..\..\src\search_intra.h" />
+    <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h" />
+    <ClInclude Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.h" />
     <ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" />
+    <ClInclude Include="..\..\src\strategies\avx2\reg_sad_pow2_widths-avx2.h" />
     <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" />
+    <ClInclude
Include="..\..\src\strategies\generic\encode_coding_tree-generic.h" /> <ClInclude Include="..\..\src\strategies\generic\intra-generic.h" /> <ClInclude Include="..\..\src\strategies\generic\sao-generic.h" /> + <ClInclude Include="..\..\src\strategies\sse41\reg_sad_pow2_widths-sse41.h" /> <ClInclude Include="..\..\src\strategies\strategies-common.h" /> <ClInclude Include="..\..\src\strategies\avx2\quant-avx2.h" /> <ClInclude Include="..\..\src\strategies\generic\quant-generic.h" /> + <ClInclude Include="..\..\src\strategies\strategies-encode.h" /> <ClInclude Include="..\..\src\strategies\strategies-intra.h" /> <ClInclude Include="..\..\src\strategies\strategies-quant.h" /> </ItemGroup> @@ -279,6 +318,8 @@ <ClInclude Include="..\..\src\tables.h" /> <ClInclude Include="..\..\src\threadqueue.h" /> <ClInclude Include="..\..\src\threads.h" /> + <ClInclude Include="..\..\src\threadwrapper\include\pthread.h" /> + <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h" /> <ClInclude Include="..\..\src\transform.h" /> <ClInclude Include="..\..\src\videoframe.h" /> </ItemGroup> @@ -296,4 +337,4 @@ <ImportGroup Label="ExtensionTargets"> <Import Project="..\yasm\vsyasm.targets" /> </ImportGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -49,6 +49,9 @@
     <Filter Include="Threading">
       <UniqueIdentifier>{63c21cb2-b379-4d38-bcb8-173786c2466d}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Threadwrapper">
+      <UniqueIdentifier>{f4abece9-e209-4817-a57e-c64ca7c5e05c}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\src\strategies\strategies-nal.c">
@@ -221,6 +224,21 @@
     </ClCompile>
     <ClCompile Include="..\..\src\extras\libmd5.c" />
     <ClCompile Include="..\..\src\extras\crypto.cpp" />
+    <ClCompile Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.c">
+      <Filter>Optimization\strategies\avx2</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c">
+      <Filter>Optimization\strategies\generic</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\strategies\strategies-encode.c">
+      <Filter>Optimization\strategies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp">
+      <Filter>Threadwrapper</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp">
+      <Filter>Threadwrapper</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\bitstream.h">
@@ -411,6 +429,30 @@
     </ClInclude>
     <ClInclude Include="..\..\src\extras\libmd5.h" />
     <ClInclude Include="..\..\src\extras\crypto.h" />
+    <ClInclude Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.h">
+      <Filter>Optimization\strategies\avx2</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\generic\encode_coding_tree-generic.h">
+      <Filter>Optimization\strategies\generic</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h">
+      <Filter>Optimization\strategies\avx2</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\strategies-encode.h">
+      <Filter>Optimization\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\avx2\reg_sad_pow2_widths-avx2.h">
+      <Filter>Optimization\strategies\avx2</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\sse41\reg_sad_pow2_widths-sse41.h">
+      <Filter>Optimization\strategies\sse41</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\threadwrapper\include\pthread.h">
+      <Filter>Threadwrapper</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h">
+      <Filter>Threadwrapper</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\extras\x86inc.asm">
@@ -423,4 +465,4 @@
       <Filter>Optimization\strategies\x86_asm</Filter>
     </YASM>
   </ItemGroup>
-</Project>
+</Project>
\ No newline at end of file
kvazaar-1.2.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj
Changed
@@ -22,23 +22,24 @@
     <ProjectGuid>{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>kvazaar_tests</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
@@ -115,4 +116,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
kvazaar-1.2.0.tar.gz/build/yasm/vsyasm.targets -> kvazaar-1.3.0.tar.gz/build/yasm/vsyasm.targets
Changed
@@ -20,7 +20,7 @@
     AfterTargets="$(YASMAfterTargets)"
     Condition="'@(YASM)' != ''"
     DependsOnTargets="$(YASMDependsOn);ComputeYASMOutput"
-    Outputs="@(YASM->'%(ObjectFile)')"
+    Outputs="@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')"
    Inputs="@(YASM);%(YASM.AdditionalDependencies);$(MSBuildProjectFile)">

    <ItemGroup Condition="'@(SelectedFiles)' != ''">
@@ -32,7 +32,7 @@
      <YASM_tlog Include="%(YASM.ObjectFile)" Condition="'%(YASM.ObjectFile)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'">
-        <Source>@(YASM, '|')</Source>
+        <Source>@(YASM->'%(FullPath)', '|')</Source>
      </YASM_tlog>
    </ItemGroup>
    <Message
@@ -40,8 +40,9 @@
      Text="%(YASM.ExecutionDescription)" />
    <WriteLinesToFile
      Condition="'@(YASM_tlog)' != '' and '%(YASM_tlog.ExcludedFromBuild)' != 'true'"
-      File="$(IntDir)$(ProjectName).write.1.tlog"
-      Lines="^%(YASM_tlog.Source);@(YASM_tlog->'%(Fullpath)')" />
+      File="$(TLogLocation)$(ProjectName).write.1.tlog"
+      Lines="^%(YASM_tlog.Source);@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')"
+      Encoding="Unicode" />
    <YASM
      Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'"
      CommandLineTemplate="%(YASM.CommandLineTemplate)"
kvazaar-1.2.0.tar.gz/configure.ac -> kvazaar-1.3.0.tar.gz/configure.ac
Changed
@@ -23,7 +23,7 @@
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=4
-ver_minor=0
+ver_minor=2
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS
@@ -45,15 +45,20 @@
 LT_INIT([win32-dll])
 
+AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"])
 AX_CHECK_COMPILE_FLAG([-mavx2], [flag_avx2="true"])
 AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"])
 AX_CHECK_COMPILE_FLAG([-msse2], [flag_sse2="true"])
+AX_CHECK_COMPILE_FLAG([-mbmi], [flag_bmi="true"])
+AX_CHECK_COMPILE_FLAG([-mabm], [flag_abm="true"])
+AX_CHECK_COMPILE_FLAG([-mbmi2], [flag_bmi2="true"])
 
-AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"])
+AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"])
+AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true"])
 AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"])
 AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"])
 
-KVZ_CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden"
+KVZ_CFLAGS="-Wall -Wextra -Wvla -Wno-sign-compare -Wno-unused-parameter -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden"
 CFLAGS="$KVZ_CFLAGS $CFLAGS"
 
 AC_SEARCH_LIBS([log], [m c], [], [exit 1])
@@ -68,7 +73,10 @@
      [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])],
      [PKG_CHECK_MODULES([cryptopp], [libcrypto++],
        [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])],
-        [AC_MSG_ERROR([neither cryptopp nor libcrypto++ found with pkg-config])]
+        [PKG_CHECK_MODULES([cryptopp], [libcryptopp],
+          [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])],
+          [AC_MSG_ERROR([neither cryptopp, libcrypto++ nor libcryptopp found with pkg-config])]
+        )]
      )]
    )]
  )
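The configure.ac hunk above makes crypto++ detection fall back through three pkg-config module names, since distributions ship the library under different names. A minimal Python sketch of that selection order (the `installed` set is a stand-in for `pkg-config --exists`; it is not part of kvazaar):

```python
def find_cryptopp(installed):
    # Try the pkg-config names in the same order as the configure.ac
    # fallback chain and keep the first one that is available.
    for name in ("cryptopp", "libcrypto++", "libcryptopp"):
        if name in installed:
            return name
    raise RuntimeError(
        "neither cryptopp, libcrypto++ nor libcryptopp found with pkg-config")

# A system that ships crypto++ under the third name still gets detected.
print(find_cryptopp({"libcryptopp", "zlib"}))  # -> libcryptopp
```

Because the names are probed in a fixed order, a system with several of them installed deterministically picks `cryptopp` first.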
kvazaar-1.2.0.tar.gz/doc/kvazaar.1 -> kvazaar-1.3.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,24 +1,24 @@ -.TH KVAZAAR "1" "November 2017" "kvazaar v1.2.0" "User Commands" +.TH KVAZAAR "1" "July 2019" "kvazaar v1.3.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS \fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output> .SH DESCRIPTION .TP -\fB\-i\fR, \fB\-\-input +\fB\-i\fR, \fB\-\-input <filename> Input file .TP \fB\-\-input\-res <res> Input resolution [auto] -auto: detect from file name -<int>x<int>: width times height + \- auto: Detect from file name. + \- <int>x<int>: width times height .TP -\fB\-o\fR, \fB\-\-output +\fB\-o\fR, \fB\-\-output <filename> Output file .SS "Presets:" .TP -\fB\-\-preset=<preset> +\fB\-\-preset <preset> Set options to a preset [medium] \- ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow @@ -32,241 +32,315 @@ \fB\-\-seek <integer> First frame to code [0] .TP -\fB\-\-input\-fps <num>/<denom> -Framerate of the input video [25.0] +\fB\-\-input\-fps <num>[/<denom>] +Frame rate of the input video [25] .TP \fB\-\-source\-scan\-type <string> -Set source scan type [progressive]. - \- progressive: progressive scan - \- tff: top field first - \- bff: bottom field first +Source scan type [progressive] + \- progressive: Progressive scan + \- tff: Top field first + \- bff: Bottom field first .TP -\fB\-\-input\-format -P420 or P400 +\fB\-\-input\-format <string> +P420 or P400 [P420] .TP -\fB\-\-input\-bitdepth -8\-16 +\fB\-\-input\-bitdepth <int> +8\-16 [8] .TP \fB\-\-loop\-input -Re\-read input file forever +Re\-read input file forever. .SS "Options:" .TP \fB\-\-help -Print this help message and exit +Print this help message and exit. .TP \fB\-\-version -Print version information and exit +Print version information and exit. .TP -\fB\-\-aud -Use access unit delimiters +\fB\-\-(no\-)aud +Use access unit delimiters. [disabled] .TP -\fB\-\-debug <string> -Output encoders reconstruction. +\fB\-\-debug <filename> +Output internal reconstruction. 
.TP -\fB\-\-cpuid <integer> -Disable runtime cpu optimizations with value 0. +\fB\-\-(no\-)cpuid +Enable runtime CPU optimizations. [enabled] .TP -\fB\-\-hash +\fB\-\-hash <string> Decoded picture hash [checksum] \- none: 0 bytes \- checksum: 18 bytes \- md5: 56 bytes .TP -\fB\-\-no\-psnr -Don't calculate PSNR for frames -.TP -\fB\-\-no\-info -Don't add encoder info SEI. +\fB\-\-(no\-)psnr +Calculate PSNR for frames. [enabled] +.TP +\fB\-\-(no\-)info +Add encoder info SEI. [enabled] +.TP +\fB\-\-crypto <string> +Selective encryption. Crypto support must be +enabled at compile\-time. Can be 'on' or 'off' or +a list of features separated with a '+'. [off] + \- on: Enable all encryption features. + \- off: Disable selective encryption. + \- mvs: Motion vector magnitudes. + \- mv_signs: Motion vector signs. + \- trans_coeffs: Coefficient magnitudes. + \- trans_coeff_signs: Coefficient signs. + \- intra_pred_modes: Intra prediction modes. +.TP +\fB\-\-key <string> +Encryption key [16,213,27,56,255,127,242,112, + 97,126,197,204,25,59,38,30] .SS "Video structure:" .TP \fB\-q\fR, \fB\-\-qp <integer> -Quantization Parameter [32] +Quantization parameter [22] .TP \fB\-p\fR, \fB\-\-period <integer> -Period of intra pictures [0] -\- 0: only first picture is intra -\- 1: all pictures are intra -\- 2\-N: every Nth picture is intra +Period of intra pictures [64] + \- 0: Only first picture is intra. + \- 1: All pictures are intra. + \- N: Every Nth picture is intra. .TP \fB\-\-vps\-period <integer> -Specify how often the video parameter set is -re\-sent. [0] - \- 0: only send VPS with the first frame - \- N: send VPS with every Nth intra frame +How often the video parameter set is re\-sent [0] + \- 0: Only send VPS with the first frame. + \- N: Send VPS with every Nth intra frame. 
.TP \fB\-r\fR, \fB\-\-ref <integer> -Reference frames, range 1..15 [3] +Number of reference frames, in range 1..15 [4] .TP \fB\-\-gop <string> -Definition of GOP structure [0] - \- 0: disabled +GOP structure [8] + \- 0: Disabled \- 8: B\-frame pyramid of length 8 - \- lp\-<string>: lp\-gop definition - (e.g. lp\-g8d4t2, see README) + \- lp\-<string>: Low\-delay P\-frame GOP + (e.g. lp\-g8d4t2, see README) +.TP +\fB\-\-(no\-)open\-gop +Use open GOP configuration. [enabled] .TP -\fB\-\-cqmfile <string> -Custom Quantization Matrices from a file +\fB\-\-cqmfile <filename> +Read custom quantization matrices from a file. +.TP +\fB\-\-scaling-list <string> +Set scaling list mode. [off] + \- off: Disable scaling lists. + \- custom: use custom list (with \-\-cqmfile). + \- default: Use default lists. .TP \fB\-\-bitrate <integer> -Target bitrate. [0] - \- 0: disable rate\-control - \- N: target N bits per second -.TP -\fB\-\-lossless -Use lossless coding -.TP -\fB\-\-mv\-constraint -Constrain movement vectors - \- none: no constraint - \- frametile: constrain within the tile - \- frametilemargin: constrain even more -.TP -\fB\-\-roi <string> -Use a delta QP map for region of interest - Read an array of delta QP values from - a file, where the first two values are the - width and height, followed by width*height - delta QP values in raster order. - The delta QP map can be any size or aspect - ratio, and will be mapped to LCU's. +Target bitrate [0]
kvazaar-1.2.0.tar.gz/src/Makefile.am -> kvazaar-1.3.0.tar.gz/src/Makefile.am
Changed
@@ -124,6 +124,8 @@
 	strategies/generic/quant-generic.h \
 	strategies/generic/sao-generic.c \
 	strategies/generic/sao-generic.h \
+	strategies/generic/encode_coding_tree-generic.c \
+	strategies/generic/encode_coding_tree-generic.h \
 	strategies/strategies-common.h \
 	strategies/strategies-dct.c \
 	strategies/strategies-dct.h \
@@ -139,6 +141,8 @@
 	strategies/strategies-quant.h \
 	strategies/strategies-sao.c \
 	strategies/strategies-sao.h \
+	strategies/strategies-encode.c \
+	strategies/strategies-encode.h \
 	strategies/x86_asm/picture-x86-asm.c \
 	strategies/x86_asm/picture-x86-asm.h \
 	strategyselector.c \
@@ -186,7 +190,9 @@
 	strategies/avx2/quant-avx2.c \
 	strategies/avx2/quant-avx2.h \
 	strategies/avx2/sao-avx2.c \
-	strategies/avx2/sao-avx2.h
+	strategies/avx2/sao-avx2.h \
+	strategies/avx2/encode_coding_tree-avx2.c \
+	strategies/avx2/encode_coding_tree-avx2.h
 
 libsse2_la_SOURCES = \
 	strategies/sse2/picture-sse2.c \
@@ -197,13 +203,17 @@
 	strategies/sse41/picture-sse41.h
 
 if HAVE_PPC
+
+if HAVE_ALTIVEC
 libaltivec_la_CFLAGS = -maltivec
 endif
+endif #HAVE_PPC
+
 if HAVE_X86
 
 if HAVE_AVX2
-libavx2_la_CFLAGS = -mavx2
+libavx2_la_CFLAGS = -mavx2 -mbmi -mabm -mbmi2
 endif
 if HAVE_SSE4_1
 libsse41_la_CFLAGS = -msse4.1
kvazaar-1.2.0.tar.gz/src/cfg.c -> kvazaar-1.3.0.tar.gz/src/cfg.c
Changed
@@ -24,6 +24,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <math.h> kvz_config *kvz_config_alloc(void) @@ -36,7 +37,7 @@ cfg->width = 0; cfg->height = 0; cfg->framerate = 25; // deprecated and will be removed. - cfg->framerate_num = 0; + cfg->framerate_num = 25; cfg->framerate_denom = 1; cfg->qp = 22; cfg->intra_period = 64; @@ -78,6 +79,7 @@ cfg->lossless = false; cfg->tmvp_enable = true; cfg->implicit_rdpcm = false; + cfg->fast_residual_cost_limit = 0; cfg->cu_split_termination = KVZ_CU_SPLIT_TERMINATION_ZERO; @@ -85,13 +87,13 @@ cfg->tiles_height_count = 1; cfg->tiles_width_split = NULL; cfg->tiles_height_split = NULL; - + cfg->wpp = 1; cfg->owf = -1; cfg->slice_count = 1; cfg->slice_addresses_in_ts = MALLOC(int32_t, 1); cfg->slice_addresses_in_ts[0] = 0; - + cfg->threads = -1; cfg->cpuid = 1; @@ -108,16 +110,19 @@ cfg->crypto_features = KVZ_CRYPTO_OFF; cfg->me_early_termination = 1; + cfg->intra_rdo_et = 0; cfg->input_format = KVZ_FORMAT_P420; cfg->input_bitdepth = 8; cfg->gop_lp_definition.d = 3; cfg->gop_lp_definition.t = 1; + cfg->open_gop = true; cfg->roi.width = 0; cfg->roi.height = 0; cfg->roi.dqps = NULL; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -125,6 +130,17 @@ cfg->optional_key = NULL; + cfg->level = 62; // default hevc level, 6.2 (the highest) + cfg->force_level = true; // don't care about level limits by-default + cfg->high_tier = false; + + cfg->me_max_steps = (uint32_t)-1; + + cfg->scaling_list = KVZ_SCALING_LIST_OFF; + + cfg->max_merge = 5; + cfg->early_skip = true; + return 1; } @@ -178,14 +194,14 @@ const char* current_arg = NULL; int32_t current_value; int32_t values[MAX_TILES_PER_DIM]; - + int i; - + //Free pointer in any case if (*array) { FREE_POINTER(*array); } - + //If the arg starts with u, we want an uniform split if (arg[0]=='u') { *ntiles = atoi(arg + 1); @@ -196,7 +212,7 @@ //Done with parsing return 1; } - + //We have a comma-separated list of int for the split... 
current_arg = arg; *ntiles = 1; @@ -213,27 +229,27 @@ ++(*ntiles); if (MAX_TILES_PER_DIM <= *ntiles) break; } while (current_arg); - + if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) { fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM); return 0; } - + *array = MALLOC(int32_t, *ntiles - 1); if (!*array) { fprintf(stderr, "Could not allocate array for tiles\n"); return 0; } - + //TODO: memcpy? for (i = 0; i < *ntiles - 1; ++i) { (*array)[i] = values[i]; } - + return 1; } -static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) +static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) { char *tail; int d = strtol(numstr, &tail, 10); @@ -285,14 +301,14 @@ const char* current_arg = NULL; int32_t current_value; int32_t values[MAX_SLICES]; - + int i; - + //Free pointer in any case if (*array) { FREE_POINTER(*array); } - + //If the arg starts with u, we want an uniform split if (arg[0]=='u') { *nslices = atoi(arg+1); @@ -303,7 +319,7 @@ //Done with parsing return 1; } - + //We have a comma-separated list of int for the split... current_arg = arg; //We always have a slice starting at 0 @@ -322,29 +338,29 @@ ++(*nslices); if (MAX_SLICES <= *nslices) break; } while (current_arg); - + if (MAX_SLICES <= *nslices || 0 >= *nslices) { fprintf(stderr, "Invalid number of slices (0 < %d <= %d = MAX_SLICES)!\n", *nslices, MAX_SLICES); return 0; } - + *array = MALLOC(int32_t, *nslices); if (!*array) { fprintf(stderr, "Could not allocate array for slices\n"); return 0; } - + //TODO: memcpy? 
for (i = 0; i < *nslices; ++i) { (*array)[i] = values[i]; } - + return 1; } int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) { - static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", NULL }; + static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", "dia", NULL }; static const char * const source_scan_type_names[] = { "progressive", "tff", "bff", NULL };
kvazaar-1.2.0.tar.gz/src/cfg.h -> kvazaar-1.3.0.tar.gz/src/cfg.h
Changed
@@ -30,7 +30,6 @@
 
 #include "kvazaar.h"
 
-
 /* Function definitions */
 kvz_config *kvz_config_alloc(void);
 int kvz_config_init(kvz_config *cfg);
kvazaar-1.2.0.tar.gz/src/cli.c -> kvazaar-1.3.0.tar.gz/src/cli.c
Changed
@@ -36,9 +36,9 @@ { "input", required_argument, NULL, 'i' }, { "output", required_argument, NULL, 'o' }, { "debug", required_argument, NULL, 'd' }, - { "width", required_argument, NULL, 'w' }, + { "width", required_argument, NULL, 'w' }, // deprecated { "height", required_argument, NULL, 'h' }, // deprecated - { "frames", required_argument, NULL, 'n' }, // deprecated + { "frames", required_argument, NULL, 'n' }, { "qp", required_argument, NULL, 'q' }, { "period", required_argument, NULL, 'p' }, { "ref", required_argument, NULL, 'r' }, @@ -86,7 +86,8 @@ { "owf", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, { "threads", required_argument, NULL, 0 }, - { "cpuid", required_argument, NULL, 0 }, + { "cpuid", optional_argument, NULL, 0 }, + { "no-cpuid", no_argument, NULL, 0 }, { "pu-depth-inter", required_argument, NULL, 0 }, { "pu-depth-intra", required_argument, NULL, 0 }, { "info", no_argument, NULL, 0 }, @@ -109,6 +110,8 @@ { "crypto", required_argument, NULL, 0 }, { "key", required_argument, NULL, 0 }, { "me-early-termination",required_argument, NULL, 0 }, + { "intra-rdo-et", no_argument, NULL, 0 }, + { "no-intra-rdo-et", no_argument, NULL, 0 }, { "lossless", no_argument, NULL, 0 }, { "no-lossless", no_argument, NULL, 0 }, { "tmvp", no_argument, NULL, 0 }, @@ -122,6 +125,18 @@ { "roi", required_argument, NULL, 0 }, { "erp-aqp", no_argument, NULL, 0 }, { "no-erp-aqp", no_argument, NULL, 0 }, + { "level", required_argument, NULL, 0 }, + { "force-level", required_argument, NULL, 0 }, + { "high-tier", no_argument, NULL, 0 }, + { "me-steps", required_argument, NULL, 0 }, + { "fast-residual-cost", required_argument, NULL, 0 }, + { "set-qp-in-cu", no_argument, NULL, 0 }, + { "open-gop", no_argument, NULL, 0 }, + { "no-open-gop", no_argument, NULL, 0 }, + { "scaling-list", required_argument, NULL, 0 }, + { "max-merge", required_argument, NULL, 0 }, + { "early-skip", no_argument, NULL, 0 }, + { "no-early-skip", no_argument, NULL, 0 }, {0, 0, 0, 0} 
}; @@ -316,168 +331,214 @@ "Usage:\n" "kvazaar -i <input> --input-res <width>x<height> -o <output>\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Required:\n" - " -i, --input : Input file\n" + " -i, --input <filename> : Input file\n" " --input-res <res> : Input resolution [auto]\n" - " auto: detect from file name\n" - " <int>x<int>: width times height\n" - " -o, --output : Output file\n" + " - auto: Detect from file name.\n" + " - <int>x<int>: width times height\n" + " -o, --output <filename> : Output file\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Presets:\n" - " --preset=<preset> : Set options to a preset [medium]\n" + " --preset <preset> : Set options to a preset [medium]\n" " - ultrafast, superfast, veryfast, faster,\n" " fast, medium, slow, slower, veryslow\n" " placebo\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Input:\n" " -n, --frames <integer> : Number of frames to code [all]\n" " --seek <integer> : First frame to code [0]\n" - " --input-fps <num>/<denom> : Framerate of the input video [25.0]\n" - " --source-scan-type <string> : Set source scan type [progressive].\n" - " - progressive: progressive scan\n" - " - tff: top field first\n" - " - bff: bottom field first\n" - " --input-format : P420 or P400\n" - " --input-bitdepth : 8-16\n" - " --loop-input : Re-read input file forever\n" + " --input-fps <num>[/<denom>] : Frame rate of the input video [25]\n" + " --source-scan-type <string> : Source scan type [progressive]\n" + " - progressive: Progressive scan\n" + " - tff: Top field first\n" + " - bff: Bottom field first\n" + " --input-format <string> : P420 or 
P400 [P420]\n" + " --input-bitdepth <int> : 8-16 [8]\n" + " --loop-input : Re-read input file forever.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Options:\n" - " --help : Print this help message and exit\n" - " --version : Print version information and exit\n" - " --aud : Use access unit delimiters\n" - " --debug <string> : Output encoders reconstruction.\n" - " --cpuid <integer> : Disable runtime cpu optimizations with value 0.\n" - " --hash : Decoded picture hash [checksum]\n" + " --help : Print this help message and exit.\n" + " --version : Print version information and exit.\n" + " --(no-)aud : Use access unit delimiters. [disabled]\n" + " --debug <filename> : Output internal reconstruction.\n" + " --(no-)cpuid : Enable runtime CPU optimizations. [enabled]\n" + " --hash <string> : Decoded picture hash [checksum]\n" " - none: 0 bytes\n" " - checksum: 18 bytes\n" " - md5: 56 bytes\n" - " --no-psnr : Don't calculate PSNR for frames\n" - " --no-info : Don't add encoder info SEI.\n" + " --(no-)psnr : Calculate PSNR for frames. [enabled]\n" + " --(no-)info : Add encoder info SEI. [enabled]\n" + " --crypto <string> : Selective encryption. Crypto support must be\n" + " enabled at compile-time. Can be 'on' or 'off' or\n" + " a list of features separated with a '+'. 
[off]\n" + " - on: Enable all encryption features.\n" + " - off: Disable selective encryption.\n" + " - mvs: Motion vector magnitudes.\n" + " - mv_signs: Motion vector signs.\n" + " - trans_coeffs: Coefficient magnitudes.\n" + " - trans_coeff_signs: Coefficient signs.\n" + " - intra_pred_modes: Intra prediction modes.\n" + " --key <string> : Encryption key [16,213,27,56,255,127,242,112,\n" + " 97,126,197,204,25,59,38,30]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Video structure:\n" - " -q, --qp <integer> : Quantization Parameter [32]\n" - " -p, --period <integer> : Period of intra pictures [0]\n" - " - 0: only first picture is intra\n" - " - 1: all pictures are intra\n" - " - 2-N: every Nth picture is intra\n" - " --vps-period <integer> : Specify how often the video parameter set is\n" - " re-sent. [0]\n" - " - 0: only send VPS with the first frame\n" - " - N: send VPS with every Nth intra frame\n" - " -r, --ref <integer> : Reference frames, range 1..15 [3]\n" - " --gop <string> : Definition of GOP structure [0]\n" - " - 0: disabled\n" + " -q, --qp <integer> : Quantization parameter [22]\n" + " -p, --period <integer> : Period of intra pictures [64]\n" + " - 0: Only first picture is intra.\n" + " - 1: All pictures are intra.\n" + " - N: Every Nth picture is intra.\n" + " --vps-period <integer> : How often the video parameter set is re-sent [0]\n" + " - 0: Only send VPS with the first frame.\n" + " - N: Send VPS with every Nth intra frame.\n" + " -r, --ref <integer> : Number of reference frames, in range 1..15 [4]\n" + " --gop <string> : GOP structure [8]\n" + " - 0: Disabled\n" " - 8: B-frame pyramid of length 8\n" - " - lp-<string>: lp-gop definition\n" - " (e.g. lp-g8d4t2, see README)\n" - " --cqmfile <string> : Custom Quantization Matrices from a file\n" - " --bitrate <integer> : Target bitrate. 
[0]\n" - " - 0: disable rate-control\n" - " - N: target N bits per second\n" - " --lossless : Use lossless coding\n" - " --mv-constraint : Constrain movement vectors\n" - " - none: no constraint\n" - " - frametile: constrain within the tile\n" - " - frametilemargin: constrain even more\n" - " --roi <string> : Use a delta QP map for region of interest\n" - " Read an array of delta QP values from\n" - " a file, where the first two values are the\n" - " width and height, followed by width*height\n" - " delta QP values in raster order.\n" - " The delta QP map can be any size or aspect\n" - " ratio, and will be mapped to LCU's.\n" - " --(no-)erp-aqp : Use adaptive QP for 360 video with\n" - " equirectangular projection\n" + " - lp-<string>: Low-delay P-frame GOP\n" + " (e.g. lp-g8d4t2, see README)\n" + " --(no-)open-gop : Use open GOP configuration. [enabled]\n" + " --cqmfile <filename> : Read custom quantization matrices from a file.\n" + " --scaling-list <string>: Set scaling list mode. [off]\n" + " - off: Disable scaling lists.\n" + " - custom: use custom list (with --cqmfile).\n" + " - default: Use default lists.\n" + " --bitrate <integer> : Target bitrate [0]\n" + " - 0: Disable rate control.\n" + " - N: Target N bits per second.\n" + " --(no-)lossless : Use lossless coding. [disabled]\n" + " --mv-constraint <string> : Constrain movement vectors. [none]\n" + " - none: No constraint\n" + " - frametile: Constrain within the tile.\n" + " - frametilemargin: Constrain even more.\n" + " --roi <filename> : Use a delta QP map for region of interest.\n" + " Reads an array of delta QP values from a text\n" + " file. The file format is: width and height of\n" + " the QP delta map followed by width*height delta\n"
kvazaar-1.2.0.tar.gz/src/cu.c -> kvazaar-1.3.0.tar.gz/src/cu.c
Changed
@@ -184,9 +184,10 @@
  */
 cu_array_t * kvz_cu_array_copy_ref(cu_array_t* cua)
 {
-  // The caller should have had another reference.
-  assert(cua->refcount > 0);
-  KVZ_ATOMIC_INC(&cua->refcount);
+  int32_t new_refcount = KVZ_ATOMIC_INC(&cua->refcount);
+  // The caller should have had another reference and we added one
+  // reference so refcount should be at least 2.
+  assert(new_refcount >= 2);
   return cua;
 }
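The hunk above replaces a check-then-increment with a single atomic increment whose return value is asserted, which closes the race where the refcount could be read before another thread's increment landed. A minimal sketch of the same pattern in portable C11 (illustrative names, not kvazaar's real types; the assert implies KVZ_ATOMIC_INC returns the post-increment value, whereas `atomic_fetch_add` returns the pre-increment value):

```c
#include <assert.h>
#include <stdatomic.h>

typedef struct {
    atomic_int refcount;
    /* ...payload omitted... */
} refcounted_t;

/* Take a new reference. atomic_fetch_add returns the value *before* the
 * increment, so the post-increment count is the return value plus one. */
static refcounted_t *copy_ref(refcounted_t *obj)
{
    int new_refcount = atomic_fetch_add(&obj->refcount, 1) + 1;
    /* The caller must already hold a reference, so after adding ours the
     * count is at least 2. */
    assert(new_refcount >= 2);
    return obj;
}
```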
kvazaar-1.2.0.tar.gz/src/cu.h -> kvazaar-1.3.0.tar.gz/src/cu.h
Changed
@@ -123,6 +123,7 @@
   uint8_t skipped   : 1; //!< \brief flag to indicate this block is skipped
   uint8_t merged    : 1; //!< \brief flag to indicate this block is merged
   uint8_t merge_idx : 3; //!< \brief merge index
+  uint8_t tr_skip   : 1; //!< \brief transform skip flag

   uint16_t cbf;

@@ -137,7 +138,6 @@
   struct {
     int8_t mode;
     int8_t mode_chroma;
-    int8_t tr_skip; //!< \brief transform skip flag
 #if KVZ_SEL_ENCRYPTION
     int8_t mode_encry;
 #endif
kvazaar-1.2.0.tar.gz/src/encmain.c -> kvazaar-1.3.0.tar.gz/src/encmain.c
Changed
@@ -27,6 +27,9 @@
 /* The following two defines must be located before the inclusion of any system header files. */
 #define WINVER       0x0500
 #define _WIN32_WINNT 0x0500
+
+#include "global.h" // IWYU pragma: keep
+
 #include <fcntl.h> /* _O_BINARY */
 #include <io.h>    /* _setmode() */
 #endif
@@ -41,7 +44,6 @@
 #include "checkpoint.h"
 #include "cli.h"
 #include "encoder.h"
-#include "global.h" // IWYU pragma: keep
 #include "kvazaar.h"
 #include "kvazaar_internal.h"
 #include "threads.h"
@@ -431,6 +433,12 @@
     uint32_t frames_done = 0;
     double psnr_sum[3] = { 0.0, 0.0, 0.0 };

+    // how many bits have been written this second? used for checking if framerate exceeds level's limits
+    uint64_t bits_this_second = 0;
+    // the amount of frames have been encoded in this second of video. can be non-integer value if framerate is non-integer value
+    unsigned frames_this_second = 0;
+    const float framerate = ((float)encoder->cfg.framerate_num) / ((float)encoder->cfg.framerate_denom);
+
     uint8_t padding_x = get_padding(opts->config->width);
     uint8_t padding_y = get_padding(opts->config->height);
@@ -527,6 +535,39 @@
         fflush(output);

         bitstream_length += len_out;
+
+        // the level's bitrate check
+        frames_this_second += 1;
+
+        if ((float)frames_this_second >= framerate) {
+          // if framerate <= 1 then we go here always
+
+          // how much of the bits of the last frame belonged to the next second
+          uint64_t leftover_bits = (uint64_t)((double)len_out * ((double)frames_this_second - framerate));
+
+          // the latest frame is counted for the amount that it contributed to this current second
+          bits_this_second += len_out - leftover_bits;
+
+          if (bits_this_second > encoder->cfg.max_bitrate) {
+            fprintf(stderr, "Level warning: This %s's bitrate (%llu bits/s) reached the maximum bitrate (%u bits/s) of %s tier level %g.",
+                    framerate >= 1.0f ? "second" : "frame",
+                    (unsigned long long) bits_this_second,
+                    encoder->cfg.max_bitrate,
+                    encoder->cfg.high_tier ? "high" : "main",
+                    (float)encoder->cfg.level / 10.0f );
+          }
+
+          if (framerate > 1.0f) {
+            // leftovers for the next second
+            bits_this_second = leftover_bits;
+          } else {
+            // one or more next seconds are from this frame and their bitrate is the same or less as this frame's
+            bits_this_second = 0;
+          }
+          frames_this_second = 0;
+        } else {
+          bits_this_second += len_out;
+        }

         // Compute and print stats.
kvazaar-1.2.0.tar.gz/src/encode_coding_tree.c -> kvazaar-1.3.0.tar.gz/src/encode_coding_tree.c
Changed
@@ -30,6 +30,7 @@
 #include "intra.h"
 #include "kvazaar.h"
 #include "kvz_math.h"
+#include "strategyselector.h"
 #include "tables.h"
 #include "videoframe.h"

@@ -46,10 +47,10 @@
  * This method encodes the X and Y component within a block of the last
  * significant coefficient.
  */
-static void encode_last_significant_xy(cabac_data_t * const cabac,
-                                       uint8_t lastpos_x, uint8_t lastpos_y,
-                                       uint8_t width, uint8_t height,
-                                       uint8_t type, uint8_t scan)
+void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
+                                    uint8_t lastpos_x, uint8_t lastpos_y,
+                                    uint8_t width, uint8_t height,
+                                    uint8_t type, uint8_t scan)
 {
   const int index = kvz_math_floor_log2(width) - 2;
   uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4);
@@ -100,250 +101,6 @@
   }
 }

-void kvz_encode_coeff_nxn(encoder_state_t * const state,
-                          cabac_data_t * const cabac,
-                          const coeff_t *coeff,
-                          uint8_t width,
-                          uint8_t type,
-                          int8_t scan_mode,
-                          int8_t tr_skip)
-{
-  const encoder_control_t * const encoder = state->encoder_control;
-  int c1 = 1;
-  uint8_t last_coeff_x = 0;
-  uint8_t last_coeff_y = 0;
-  int32_t i;
-  uint32_t sig_coeffgroup_flag[8 * 8] = { 0 };
-
-  int8_t be_valid = encoder->cfg.signhide_enable;
-  int32_t scan_pos_sig;
-  uint32_t go_rice_param = 0;
-  uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
-
-  // CONSTANTS
-  const uint32_t num_blk_side    = width >> TR_MIN_LOG2_SIZE;
-  const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
-  const uint32_t *scan           =
-    kvz_g_sig_last_scan[scan_mode][log2_block_size - 1];
-  const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
-
-  // Init base contexts according to block type
-  cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]);
-  cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) :
-                         &(cabac->ctx.cu_sig_model_chroma[0]);
-
-  // Scan all coeff groups to find out which of them have coeffs.
-  // Populate sig_coeffgroup_flag with that info.
-
-  unsigned sig_cg_cnt = 0;
-  for (int cg_y = 0; cg_y < width / 4; ++cg_y) {
-    for (int cg_x = 0; cg_x < width / 4; ++cg_x) {
-      unsigned cg_pos = cg_y * width * 4 + cg_x * 4;
-      for (int coeff_row = 0; coeff_row < 4; ++coeff_row) {
-        // Load four 16-bit coeffs and see if any of them are non-zero.
-        unsigned coeff_pos = cg_pos + coeff_row * width;
-        uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]);
-        if (four_coeffs) {
-          ++sig_cg_cnt;
-          unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE;
-          unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE;
-          sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1;
-          break;
-        }
-      }
-    }
-  }
-
-  // Rest of the code assumes at least one non-zero coeff.
-  assert(sig_cg_cnt > 0);
-
-  // Find the last coeff group by going backwards in scan order.
-  unsigned scan_cg_last = num_blk_side * num_blk_side - 1;
-  while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) {
-    --scan_cg_last;
-  }
-
-  // Find the last coeff by going backwards in scan order.
-  unsigned scan_pos_last = scan_cg_last * 16 + 15;
-  while (!coeff[scan[scan_pos_last]]) {
-    --scan_pos_last;
-  }
-
-  int pos_last = scan[scan_pos_last];
-
-  // transform skip flag
-  if(width == 4 && encoder->cfg.trskip_enable) {
-    cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma);
-    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
-  }
-
-  last_coeff_x = pos_last & (width - 1);
-  last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
-
-  // Code last_coeff_x and last_coeff_y
-  encode_last_significant_xy(cabac,
-                             last_coeff_x,
-                             last_coeff_y,
-                             width,
-                             width,
-                             type,
-                             scan_mode);
-
-  scan_pos_sig = scan_pos_last;
-
-  // significant_coeff_flag
-  for (i = scan_cg_last; i >= 0; i--) {
-    int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE;
-    int32_t abs_coeff[16];
-    int32_t cg_blk_pos = scan_cg[i];
-    int32_t cg_pos_y   = cg_blk_pos / num_blk_side;
-    int32_t cg_pos_x   = cg_blk_pos - (cg_pos_y * num_blk_side);
-
-    uint32_t coeff_signs = 0;
-    int32_t last_nz_pos_in_cg  = -1;
-    int32_t first_nz_pos_in_cg = 16;
-    int32_t num_non_zero       = 0;
-    go_rice_param = 0;
-
-    if (scan_pos_sig == scan_pos_last) {
-      abs_coeff[0] = abs(coeff[pos_last]);
-      coeff_signs  = (coeff[pos_last] < 0);
-      num_non_zero = 1;
-      last_nz_pos_in_cg  = scan_pos_sig;
-      first_nz_pos_in_cg = scan_pos_sig;
-      scan_pos_sig--;
-    }
-
-    if (i == scan_cg_last || i == 0) {
-      sig_coeffgroup_flag[cg_blk_pos] = 1;
-    } else {
-      uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0);
-      uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
-                                                         cg_pos_y, width);
-      cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig];
-      CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag");
-    }
-
-    if (sig_coeffgroup_flag[cg_blk_pos]) {
-      int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
-                                                                 cg_pos_x, cg_pos_y, width);
-
-      for (; scan_pos_sig >= sub_pos; scan_pos_sig--) {
-        blk_pos = scan[scan_pos_sig];
-        pos_y   = blk_pos >> log2_block_size;
-        pos_x   = blk_pos - (pos_y << log2_block_size);
-        sig     = (coeff[blk_pos] != 0) ? 1 : 0;
-
-        if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) {
-          ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
-                                                log2_block_size, type);
-          cabac->cur_ctx = &baseCtx[ctx_sig];
-          CABAC_BIN(cabac, sig, "sig_coeff_flag");
-        }
-
-        if (sig) {
-          abs_coeff[num_non_zero] = abs(coeff[blk_pos]);
-          coeff_signs             = 2 * coeff_signs + (coeff[blk_pos] < 0);
-          num_non_zero++;
-
-          if (last_nz_pos_in_cg == -1) {
-            last_nz_pos_in_cg = scan_pos_sig;
-          }
-
-          first_nz_pos_in_cg = scan_pos_sig;
-        }
-      }
-    } else {
-      scan_pos_sig = sub_pos - 1;
-    }
-
-    if (num_non_zero > 0) {
-      bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */
-                         && !encoder->cfg.lossless;
-      uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0;
-      cabac_ctx_t *base_ctx_mod;
-      int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2;
-
-      if (c1 == 0) {
-        ctx_set++;
-      }
-
-      c1 = 1;
-
-      base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) :
-                     &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]);
-      num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER);
kvazaar-1.2.0.tar.gz/src/encode_coding_tree.h -> kvazaar-1.3.0.tar.gz/src/encode_coding_tree.h
Changed
@@ -34,12 +34,14 @@
                             uint16_t y_ctb,
                             uint8_t depth);

-void kvz_encode_coeff_nxn(encoder_state_t * const state,
-                          cabac_data_t * const cabac,
-                          const coeff_t *coeff,
-                          uint8_t width,
-                          uint8_t type,
-                          int8_t scan_mode,
-                          int8_t tr_skip);
+void kvz_encode_mvd(encoder_state_t * const state,
+                    cabac_data_t *cabac,
+                    int32_t mvd_hor,
+                    int32_t mvd_ver);
+
+void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
+                                    uint8_t lastpos_x, uint8_t lastpos_y,
+                                    uint8_t width, uint8_t height,
+                                    uint8_t type, uint8_t scan);

 #endif // ENCODE_CODING_TREE_H_
kvazaar-1.2.0.tar.gz/src/encoder.c -> kvazaar-1.3.0.tar.gz/src/encoder.c
Changed
@@ -305,7 +305,7 @@
   kvz_scalinglist_init(&encoder->scaling_list);

   // CQM
-  if (cfg->cqmfile) {
+  if (cfg->scaling_list == KVZ_SCALING_LIST_CUSTOM && cfg->cqmfile) {
     FILE* cqmfile = fopen(cfg->cqmfile, "rb");
     if (cqmfile) {
       kvz_scalinglist_parse(&encoder->scaling_list, cqmfile);
@@ -314,7 +314,12 @@
       fprintf(stderr, "Could not open CQM file.\n");
       goto init_failed;
     }
+  } else if (cfg->scaling_list == KVZ_SCALING_LIST_DEFAULT) {
+    // Enable scaling lists if default lists are used
+    encoder->scaling_list.enable = 1;
+    encoder->scaling_list.use_default_list = 1;
   }
+
   kvz_scalinglist_process(&encoder->scaling_list, encoder->bitdepth);

   kvz_encoder_control_input_init(encoder, encoder->cfg.width, encoder->cfg.height);
@@ -347,13 +352,15 @@
   }

-  encoder->lcu_dqp_enabled = cfg->target_bitrate > 0 || encoder->cfg.roi.dqps;
+  // NOTE: When tr_depth_inter is equal to 0, the transform is still split
+  // for SMP and AMP partition units.
+  encoder->tr_depth_inter = 0;

-  // When tr_depth_inter is equal to 0, inter transform split flag defaults
-  // to 1 for SMP and AMP partition units. We want to avoid the extra
-  // transform split so we set tr_depth_inter to 1 when SMP or AMP
-  // partition modes are enabled.
-  encoder->tr_depth_inter = (encoder->cfg.smp_enable || encoder->cfg.amp_enable) ? 1 : 0;
+  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) {
+    encoder->max_qp_delta_depth = 0;
+  } else {
+    encoder->max_qp_delta_depth = -1;
+  }

   //Tiles
   encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
@@ -731,6 +738,7 @@
   switch (num_layers) {
     case 0:
     case 1:
+      encoder->gop_layer_weights[0] = 1;
       break;

     // Use the first layers of the 4-layer weights.
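The hunk above replaces the boolean `lcu_dqp_enabled` with a signed depth: -1 disables per-CU QP signalling entirely, while 0 allows one QP delta per quantization group the size of an LCU. A tiny sketch of that selection (illustrative function name; the config fields mirror the ones used in the hunk):

```c
#include <assert.h>
#include <stdbool.h>

/* Pick the maximum depth at which QP deltas may be coded:
 *   -1 -> cu_qp_delta disabled,
 *    0 -> one QP delta allowed per LCU-sized quantization group. */
static int pick_max_qp_delta_depth(int target_bitrate, bool roi_dqps, bool set_qp_in_cu)
{
    return (target_bitrate > 0 || roi_dqps || set_qp_in_cu) ? 0 : -1;
}
```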
kvazaar-1.2.0.tar.gz/src/encoder.h -> kvazaar-1.3.0.tar.gz/src/encoder.h
Changed
@@ -118,7 +118,7 @@
   //! Picture weights when GOP is used.
   double gop_layer_weights[MAX_GOP_LAYERS];

-  bool lcu_dqp_enabled;
+  int8_t max_qp_delta_depth;

   int tr_depth_inter;
kvazaar-1.2.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-1.3.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -60,7 +60,7 @@
   // PTL
   // Profile Tier
   WRITE_U(stream, 0, 2, "general_profile_space");
-  WRITE_U(stream, 0, 1, "general_tier_flag");
+  WRITE_U(stream, state->encoder_control->cfg.high_tier, 1, "general_tier_flag");
   // Main Profile == 1,  Main 10 profile == 2
   WRITE_U(stream, (state->encoder_control->bitdepth == 8)?1:2, 5, "general_profile_idc");
   /* Compatibility flags should be set at general_profile_idc
@@ -80,8 +80,8 @@

   // end Profile Tier

-  // Level 6.2 (general_level_idc is 30 * 6.2)
-  WRITE_U(stream, 186, 8, "general_level_idc");
+  uint8_t level = state->encoder_control->cfg.level;
+  WRITE_U(stream, level * 3, 8, "general_level_idc");

   WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
   WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
@@ -395,8 +395,11 @@
   // scaling list
   WRITE_U(stream, encoder->scaling_list.enable, 1, "scaling_list_enable_flag");
   if (encoder->scaling_list.enable) {
-    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
-    encoder_state_write_bitstream_scaling_list(stream, state);
+    // Signal scaling list data for custom lists
+    WRITE_U(stream, (encoder->cfg.scaling_list == KVZ_SCALING_LIST_CUSTOM) ? 1 : 0, 1, "sps_scaling_list_data_present_flag");
+    if (encoder->cfg.scaling_list == KVZ_SCALING_LIST_CUSTOM) {
+      encoder_state_write_bitstream_scaling_list(stream, state);
+    }
   }

   WRITE_U(stream, (encoder->cfg.amp_enable ? 1 : 0), 1, "amp_enabled_flag");
@@ -451,16 +454,21 @@
   WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
   WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
-  WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pic_init_qp_minus26");
+
+  // If tiles and slices = tiles is enabled, signal QP in the slice header. Keeping the PPS constant for OMAF etc
+  // Keep QP constant here also if it will be only set at CU level.
+  bool constant_qp_in_pps = ((encoder->cfg.slices & KVZ_SLICES_TILES) && encoder->tiles_enable) || encoder->cfg.set_qp_in_cu;
+  WRITE_SE(stream, constant_qp_in_pps ? 0 : (((int8_t)encoder->cfg.qp) - 26), "pic_init_qp_minus26");
+
   WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
   WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag");

-  if (encoder->lcu_dqp_enabled) {
+  if (encoder->max_qp_delta_depth >= 0) {
     // Use separate QP for each LCU when rate control is enabled.
     WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag");
-    WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
+    WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth");
   } else {
     WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
   }

   //TODO: add QP offsets
@@ -777,12 +785,12 @@
       WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
     }
   }
-
+
   if (state->frame->slicetype != KVZ_SLICE_I) {
     WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
-    WRITE_UE(stream, ref_negative != 0 ? ref_negative - 1 : 0, "num_ref_idx_l0_active_minus1");
+    WRITE_UE(stream, MAX(0, ((int)state->frame->ref_LX_size[0]) - 1), "num_ref_idx_l0_active_minus1");
     if (state->frame->slicetype == KVZ_SLICE_B) {
-      WRITE_UE(stream, ref_positive != 0 ? ref_positive - 1 : 0, "num_ref_idx_l1_active_minus1");
+      WRITE_UE(stream, MAX(0, ((int)state->frame->ref_LX_size[1]) - 1), "num_ref_idx_l1_active_minus1");
       WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
     }

@@ -799,12 +807,16 @@
       WRITE_UE(stream, 0, "collocated_ref_idx");
     }
   }
-
-    WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
+    const uint8_t max_merge_cands = state->encoder_control->cfg.max_merge;
+    WRITE_UE(stream, 5- max_merge_cands, "five_minus_max_num_merge_cand");
   }

   {
-    int slice_qp_delta = state->frame->QP - encoder->cfg.qp;
+    // If tiles are enabled, signal the full QP here (relative to the base value of 26)
+    // If QP is to be set only at CU level, force slice_qp_delta zero
+    bool signal_qp_in_slice_header = (encoder->cfg.slices & KVZ_SLICES_TILES) && encoder->tiles_enable;
+    int slice_qp_delta = state->frame->QP - (signal_qp_in_slice_header ? 26 : encoder->cfg.qp);
+    if(encoder->cfg.set_qp_in_cu) slice_qp_delta = 0;
     WRITE_SE(stream, slice_qp_delta, "slice_qp_delta");
   }
 }
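The level change above writes `level * 3` because the config stores ten times the level number (level 6.2 is stored as 62) while `general_level_idc` is defined as 30 times the level, so the old hard-coded 186 falls out as 62 × 3. A small check of that relationship (sketch; function name is illustrative):

```c
#include <assert.h>
#include <stdint.h>

/* cfg.level holds ten times the HEVC level number (e.g. 62 for level 6.2);
 * general_level_idc is 30 times the level, i.e. 3 times the stored value. */
static uint8_t general_level_idc(uint8_t level_times_ten)
{
    return (uint8_t)(level_times_ten * 3);
}
```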
kvazaar-1.2.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -312,6 +312,7 @@
   child_state->children = MALLOC(encoder_state_t, 1);
   child_state->children[0].encoder_control = NULL;
   child_state->crypto_hdl = NULL;
+  child_state->must_code_qp_delta = false;
   child_state->tqj_bitstream_written = NULL;
   child_state->tqj_recon_done = NULL;
kvazaar-1.2.0.tar.gz/src/encoderstate.c -> kvazaar-1.3.0.tar.gz/src/encoderstate.c
Changed
@@ -37,9 +37,6 @@
 #include "tables.h"
 #include "threadqueue.h"

-#define SAO_BUF_WIDTH   (LCU_WIDTH + SAO_DELAY_PX + 2)
-#define SAO_BUF_WIDTH_C (SAO_BUF_WIDTH / 2)
-
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state)
 {
   int i;
@@ -250,10 +247,18 @@
 {
   videoframe_t *const frame = state->tile->frame;

-  // Temporary buffers for SAO input pixels.
-  kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH   * SAO_BUF_WIDTH];
-  kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C];
-  kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C];
+
+  // Temporary buffers for SAO input pixels. The buffers cover the pixels
+  // inside the LCU (LCU_WIDTH x LCU_WIDTH), SAO_DELAY_PX wide bands to the
+  // left and above the LCU, and one pixel border on the left and top
+  // sides. We add two extra pixels to the buffers because the AVX2 SAO
+  // reconstruction reads up to two extra bytes when using edge SAO in the
+  // horizontal direction.
+#define SAO_BUF_WIDTH   (1 + SAO_DELAY_PX   + LCU_WIDTH)
+#define SAO_BUF_WIDTH_C (1 + SAO_DELAY_PX/2 + LCU_WIDTH_C)
+  kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH   * SAO_BUF_WIDTH   + 2];
+  kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C + 2];
+  kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C + 2];

   // Pointers to the top-left pixel of the LCU in the buffers.
   kvz_pixel *const sao_buf_y = &sao_buf_y_array[(SAO_DELAY_PX + 1) * (SAO_BUF_WIDTH + 1)];
@@ -526,68 +531,81 @@
 /**
  * \brief Sets the QP for each CU in state->tile->frame->cu_array.
  *
- * The QPs are used in deblocking.
+ * The QPs are used in deblocking and QP prediction.
  *
- * The delta QP for an LCU is coded when the first CU with coded block flag
- * set is encountered. Hence, for the purposes of deblocking, all CUs
- * before the first one with cbf set use state->ref_qp and all CUs after
- * that use state->qp.
+ * The QP delta for a quantization group is coded when the first CU with
+ * coded block flag set is encountered. Hence, for the purposes of
+ * deblocking and QP prediction, all CUs in before the first one that has
+ * cbf set use the QP predictor and all CUs after that use (QP predictor
+ * + QP delta).
  *
  * \param state           encoder state
  * \param x               x-coordinate of the left edge of the root CU
  * \param y               y-coordinate of the top edge of the root CU
  * \param depth           depth in the CU quadtree
- * \param coeffs_coded    Used for tracking whether a CU with a residual
- *                        has been encountered. Should be set to false at
- *                        the top level.
- * \return Whether there were any CUs with residual or not.
+ * \param last_qp         QP of the last CU in the last quantization group
+ * \param prev_qp         -1 if QP delta has not been coded in current QG,
+ *                        otherwise the QP of the current QG
  */
-static bool set_cu_qps(encoder_state_t *state, int x, int y, int depth, bool coeffs_coded)
+static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp)
 {
-  if (state->qp == state->ref_qp) {
-    // If the QPs are equal there is no need to care about the residuals.
-    coeffs_coded = true;
-  }
+
+  // Stop recursion if the CU is completely outside the frame.
+  if (x >= state->tile->frame->width || y >= state->tile->frame->height) return;

   cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y);
   const int cu_width = LCU_WIDTH >> depth;

-  coeffs_coded = coeffs_coded || cbf_is_set_any(cu->cbf, cu->depth);
-
-  if (!coeffs_coded && cu->depth > depth) {
+  if (depth <= state->encoder_control->max_qp_delta_depth) {
+    *prev_qp = -1;
+  }
+
+  if (cu->depth > depth) {
     // Recursively process sub-CUs.
     const int d = cu_width >> 1;
-    coeffs_coded = set_cu_qps(state, x,     y,     depth + 1, coeffs_coded);
-    coeffs_coded = set_cu_qps(state, x + d, y,     depth + 1, coeffs_coded);
-    coeffs_coded = set_cu_qps(state, x,     y + d, depth + 1, coeffs_coded);
-    coeffs_coded = set_cu_qps(state, x + d, y + d, depth + 1, coeffs_coded);
+    set_cu_qps(state, x,     y,     depth + 1, last_qp, prev_qp);
+    set_cu_qps(state, x + d, y,     depth + 1, last_qp, prev_qp);
+    set_cu_qps(state, x,     y + d, depth + 1, last_qp, prev_qp);
+    set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp);

   } else {
-    if (!coeffs_coded && cu->tr_depth > depth) {
+    bool cbf_found = *prev_qp >= 0;
+
+    if (cu->tr_depth > depth) {
       // The CU is split into smaller transform units. Check whether coded
       // block flag is set for any of the TUs.
       const int tu_width = LCU_WIDTH >> cu->tr_depth;
-      for (int y_scu = y; y_scu < y + cu_width; y_scu += tu_width) {
-        for (int x_scu = x; x_scu < x + cu_width; x_scu += tu_width) {
+      for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) {
+        for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) {
           cu_info_t *tu = kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu);
           if (cbf_is_set_any(tu->cbf, cu->depth)) {
-            coeffs_coded = true;
+            cbf_found = true;
           }
         }
       }
+    } else if (cbf_is_set_any(cu->cbf, cu->depth)) {
+      cbf_found = true;
+    }
+
+    int8_t qp;
+    if (cbf_found) {
+      *prev_qp = qp = cu->qp;
+    } else {
+      qp = kvz_get_cu_ref_qp(state, x, y, *last_qp);
     }

     // Set the correct QP for all state->tile->frame->cu_array elements in
     // the area covered by the CU.
-    const int8_t qp = coeffs_coded ? state->qp : state->ref_qp;
-
     for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) {
       for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) {
         kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp;
       }
     }
-  }

-  return coeffs_coded;
+    if (is_last_cu_in_qg(state, x, y, depth)) {
+      *last_qp = cu->qp;
+    }
+  }
 }
@@ -608,11 +626,13 @@
   encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);

-  if (encoder->cfg.deblock_enable) {
-    if (encoder->lcu_dqp_enabled) {
-      set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, false);
-    }
+  if (encoder->max_qp_delta_depth >= 0) {
+    int last_qp = state->last_qp;
+    int prev_qp = -1;
+    set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
+  }
+
+  if (encoder->cfg.deblock_enable) {
     kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y);
   }

@@ -635,9 +655,6 @@
     encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
   }

-  // QP delta is not used when rate control is turned off.
-  state->must_code_qp_delta = encoder->lcu_dqp_enabled;
-
   //Encode coding tree
   kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0);

@@ -709,7 +726,8 @@
   const encoder_control_t *ctrl = state->encoder_control;
   const kvz_config *cfg = &ctrl->cfg;

-  state->ref_qp = state->frame->QP;
+  // Signaled slice QP may be different to frame QP with set-qp-in-cu enabled.
+  state->last_qp = ctrl->cfg.set_qp_in_cu ? 26 : state->frame->QP;

   if (cfg->crypto_features) {
     state->crypto_hdl = kvz_crypto_create(cfg);
@@ -784,6 +802,21 @@
       dep_lcu = dep_lcu->right;
     }
     kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]);
+
+    // Very spesific bug that happens when owf length is longer than the
+    // gop length. Takes care of that.
+    if(!state->encoder_control->cfg.gop_lowdelay &&
+       state->encoder_control->cfg.open_gop &&
+       state->encoder_control->cfg.gop_len != 0 &&
+       state->encoder_control->cfg.owf > state->encoder_control->cfg.gop_len &&
+       ref_state->frame->slicetype == KVZ_SLICE_I &&
+       ref_state->frame->num != 0){
+
+      while (ref_state->frame->poc != state->frame->poc - state->encoder_control->cfg.gop_len){
+        ref_state = ref_state->previous_encoder_state;
+      }
+      kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]);
+    }
kvazaar-1.2.0.tar.gz/src/encoderstate.h -> kvazaar-1.3.0.tar.gz/src/encoderstate.h
Changed
@@ -268,10 +268,17 @@
   bool must_code_qp_delta;

   /**
-   * \brief Reference for computing QP delta for the next LCU that is coded
-   * next. Updated whenever a QP delta is coded.
+   * \brief QP value of the last CU in the last coded quantization group.
+   *
+   * A quantization group is a square of width
+   * (LCU_WIDTH >> encoder_control->max_qp_delta_depth). All CUs of in the
+   * same quantization group share the QP predictor value, but may have
+   * different QP values.
+   *
+   * Set to the frame QP at the beginning of a wavefront row or a tile and
+   * updated when the last CU of a quantization group is coded.
    */
-  int8_t ref_qp;
+  int8_t last_qp;

   /**
    * \brief Coeffs for the LCU.
@@ -297,6 +304,8 @@

 lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y);

+int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp);
+
 /**
  * Whether the parameter sets should be written with the current frame.
  */
@@ -309,6 +318,30 @@
          (vps_period >= 0 && frame == 0);
 }

+
+/**
+ * \brief Returns true if the CU is the last CU in its containing
+ * quantization group.
+ *
+ * \param state   encoder state
+ * \param x       x-coordinate of the left edge of the CU
+ * \param y       y-cooradinate of the top edge of the CU
+ * \param depth   depth in the CU tree
+ * \return true, if it's the last CU in its QG, otherwise false
+ */
+static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
+{
+  if (state->encoder_control->max_qp_delta_depth < 0) return false;
+
+  const int cu_width = LCU_WIDTH >> depth;
+  const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth;
+  const int right  = x + cu_width;
+  const int bottom = y + cu_width;
+  return (right  % qg_width == 0 || right  >= state->tile->frame->width) &&
+         (bottom % qg_width == 0 || bottom >= state->tile->frame->height);
+}
+
+
 static const uint8_t g_group_idx[32] = {
   0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
   6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
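The `is_last_cu_in_qg` helper added above can be exercised standalone by replacing the encoder state with plain parameters. A sketch of the same boundary test (LCU_WIDTH is 64 in kvazaar; the parameterized form is illustrative):

```c
#include <assert.h>
#include <stdbool.h>

#define LCU_WIDTH 64

/* A CU is the last one in its quantization group when both its right and
 * bottom edges land on a QG boundary or on the frame border. A negative
 * max_qp_delta_depth means QP deltas are disabled entirely. */
static bool is_last_cu_in_qg(int x, int y, int depth, int max_qp_delta_depth,
                             int frame_width, int frame_height)
{
    if (max_qp_delta_depth < 0) return false;

    const int cu_width = LCU_WIDTH >> depth;
    const int qg_width = LCU_WIDTH >> max_qp_delta_depth;
    const int right  = x + cu_width;
    const int bottom = y + cu_width;
    return (right  % qg_width == 0 || right  >= frame_width) &&
           (bottom % qg_width == 0 || bottom >= frame_height);
}
```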
kvazaar-1.2.0.tar.gz/src/extras/crypto.cpp -> kvazaar-1.3.0.tar.gz/src/extras/crypto.cpp
Changed
@@ -16,10 +16,10 @@

 struct crypto_handle_t {
   cipher_t *cipher;
-  byte key[CryptoPP::AES::DEFAULT_KEYLENGTH];
-  byte iv[CryptoPP::AES::BLOCKSIZE];
-  byte out_stream_counter[CryptoPP::AES::BLOCKSIZE];
-  byte counter[CryptoPP::AES::BLOCKSIZE];
+  unsigned char key[CryptoPP::AES::DEFAULT_KEYLENGTH];
+  unsigned char iv[CryptoPP::AES::BLOCKSIZE];
+  unsigned char out_stream_counter[CryptoPP::AES::BLOCKSIZE];
+  unsigned char counter[CryptoPP::AES::BLOCKSIZE];
   int couter_avail;
   int counter_index;
   int counter_index_pos;
kvazaar-1.2.0.tar.gz/src/filter.c -> kvazaar-1.3.0.tar.gz/src/filter.c
Changed
@@ -262,7 +262,7 @@
 static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
 {
-  if (!state->encoder_control->lcu_dqp_enabled) {
+  if (state->encoder_control->max_qp_delta_depth < 0) {
     return state->qp;
   }

@@ -272,7 +272,8 @@
   } else if (dir == EDGE_VER && x > 0) {
     qp_p = kvz_cu_array_at_const(state->tile->frame->cu_array, x - 1, y)->qp;
   } else {
-    qp_p = state->frame->QP;
+    // TODO: This seems to be dead code. Investigate.
+    qp_p = state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP;
   }

   const int32_t qp_q =
kvazaar-1.2.0.tar.gz/src/global.h -> kvazaar-1.3.0.tar.gz/src/global.h
Changed
@@ -78,6 +78,12 @@
  * Stuff related to multi-threading using pthreads
  */

+// Pthreads-win32 tries to define timespec even if it has already been defined.
+// In Visual Studio 2015 timespec is defined in time.h so we may need to define
+// HAVE_STRUCT_TIMESPEC.
+#if _MSC_VER >= 1900 && !defined(HAVE_STRUCT_TIMESPEC)
+#  define HAVE_STRUCT_TIMESPEC
+#endif

 #if defined(_MSC_VER) && defined(_M_AMD64)
   #define X86_64
@@ -200,7 +206,7 @@
 // NOTE: When making a release, check to see if incrementing libversion in
 // configure.ac is necessary.
 #ifndef KVZ_VERSION
-#define KVZ_VERSION 1.2.0
+#define KVZ_VERSION 1.3.0
 #endif
 #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)

@@ -233,8 +239,10 @@
 #ifdef _MSC_VER
   // Buggy VS2010 throws intellisense warnings if void* is not casted.
   #define MALLOC(type, num) (type *)malloc(sizeof(type) * (num))
+  #define MALLOC_SIMD_PADDED(type, num, padding) (type *)malloc(sizeof(type) * (num) + (padding))
 #else
   #define MALLOC(type, num) malloc(sizeof(type) * (num))
+  #define MALLOC_SIMD_PADDED(type, num, padding) malloc(sizeof(type) * (num) + (padding))
 #endif

 // Use memset through FILL and FILL_ARRAY when appropriate, such as when
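The new `MALLOC_SIMD_PADDED` macro above simply over-allocates by `padding` bytes; the image.c hunk in this revision passes twice the pad width so the buffer is padded at both ends, then offsets the public pointer past the front pad. A standalone sketch of that pattern (illustrative names; the caller must free the base pointer, not the offset pointer):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define MALLOC_SIMD_PADDED(type, num, padding) \
    ((type *)malloc(sizeof(type) * (num) + (padding)))

typedef uint8_t kvz_pixel;

/* Allocate n pixels with a 64-byte pad at each end so SIMD loads that
 * overrun the plane slightly stay inside the allocation. Returns the
 * base pointer (for free()); *data_out points at the usable region. */
static kvz_pixel *alloc_padded(size_t n, kvz_pixel **data_out)
{
    const size_t pad = 64;
    kvz_pixel *buf = MALLOC_SIMD_PADDED(kvz_pixel, n, pad * 2);
    if (!buf) return NULL;
    *data_out = buf + pad / sizeof(kvz_pixel);
    return buf;
}
```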
kvazaar-1.2.0.tar.gz/src/image.c -> kvazaar-1.3.0.tar.gz/src/image.c
Changed
@@ -47,6 +47,8 @@
   assert((width % 2) == 0);
   assert((height % 2) == 0);

+  const size_t simd_padding_width = 64;
+
   kvz_picture *im = MALLOC(kvz_picture, 1);
   if (!im) return NULL;

@@ -56,12 +58,13 @@
   im->chroma_format = chroma_format;

-  //Allocate memory
-  im->fulldata = MALLOC(kvz_pixel, (luma_size + 2 * chroma_size));
-  if (!im->fulldata) {
+  //Allocate memory, pad the full data buffer from both ends
+  im->fulldata_buf = MALLOC_SIMD_PADDED(kvz_pixel, (luma_size + 2 * chroma_size), simd_padding_width * 2);
+  if (!im->fulldata_buf) {
     free(im);
     return NULL;
   }
+  im->fulldata = im->fulldata_buf + simd_padding_width / sizeof(kvz_pixel);

   im->base_image = im;
   im->refcount = 1; //We give a reference to caller
@@ -110,11 +113,12 @@
     // Free our reference to the base image.
     kvz_image_free(im->base_image);
   } else {
-    free(im->fulldata);
+    free(im->fulldata_buf);
   }

   // Make sure freed data won't be used.
   im->base_image = NULL;
+  im->fulldata_buf = NULL;
   im->fulldata = NULL;
   im->y = im->u = im->v = NULL;
   im->data[COLOR_Y] = im->data[COLOR_U] = im->data[COLOR_V] = NULL;
@@ -128,10 +132,10 @@
  */
 kvz_picture *kvz_image_copy_ref(kvz_picture *im)
 {
-  // The caller should have had another reference.
-  assert(im->refcount > 0);
-  KVZ_ATOMIC_INC(&(im->refcount));
-
+  int32_t new_refcount = KVZ_ATOMIC_INC(&im->refcount);
+  // The caller should have had another reference and we added one
+  // reference so refcount should be at least 2.
+  assert(new_refcount >= 2);
   return im;
 }

@@ -223,6 +227,15 @@
   free(yuv);
 }

+static INLINE uint32_t reg_sad_maybe_optimized(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                               const int32_t width, const int32_t height, const uint32_t stride1,
+                                               const uint32_t stride2, optimized_sad_func_ptr_t optimized_sad)
+{
+  if (optimized_sad != NULL)
+    return optimized_sad(data1, data2, height, stride1, stride2);
+  else
+    return kvz_reg_sad(data1, data2, width, height, stride1, stride2);
+}

 /**
  * \brief Diagonally interpolate SAD outside the frame.
@@ -251,58 +264,6 @@
   return sad;
 }

-/**
- * \brief  Vertically interpolate SAD outside the frame.
- *
- * \param data1   Starting point of the first picture.
- * \param data2   Starting point of the second picture.
- * \param width   Width of the region for which SAD is calculated.
- * \param height  Height of the region for which SAD is calculated.
- * \param width   Width of the pixel array.
- *
- * \returns Sum of Absolute Differences
- */
-static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_stride)
-{
-  int x, y;
-  unsigned sad = 0;
-
-  for (y = 0; y < block_height; ++y) {
-    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
-    }
-  }
-
-  return sad;
-}
-
-/**
- * \brief  Horizontally interpolate SAD outside the frame.
- *
- * \param data1   Starting point of the first picture.
- * \param data2   Starting point of the second picture.
- * \param width   Width of the region for which SAD is calculated.
- * \param height  Height of the region for which SAD is calculated.
- * \param width   Width of the pixel array.
- *
- * \returns Sum of Absolute Differences
- */
-static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_stride, unsigned ref_stride)
-{
-  int x, y;
-  unsigned sad = 0;
-
-  for (y = 0; y < block_height; ++y) {
-    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]);
-    }
-  }
-
-  return sad;
-}
-
 /**
  * \brief  Handle special cases of comparing blocks that are not completely
@@ -319,7 +280,8 @@
  */
 static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture *ref,
                                        int pic_x, int pic_y, int ref_x, int ref_y,
-                                       int block_width, int block_height)
+                                       int block_width, int block_height,
+                                       optimized_sad_func_ptr_t optimized_sad)
 {
   kvz_pixel *pic_data, *ref_data;

@@ -356,94 +318,86 @@
   //   that we compare the right part of the block to the ref_data.
   // - Reduce block_width and block_height so that the the size of the area
   //   being compared is correct.
+  //
+  // NOTE: No more correct since hor_sad was modified to be a separate
+  // strategy
   if (top && left) {
     result += cor_sad(pic_data,
                       &ref_data[top * ref->stride + left],
                       left, top, pic->stride);

-    result += ver_sad(&pic_data[left],
+    result += kvz_ver_sad(&pic_data[left],
                       &ref_data[top * ref->stride + left],
                       block_width - left, top, pic->stride);

-    result += hor_sad(&pic_data[top * pic->stride],
-                      &ref_data[top * ref->stride + left],
-                      left, block_height - top, pic->stride, ref->stride);
-
-    result += kvz_reg_sad(&pic_data[top * pic->stride + left],
-                          &ref_data[top * ref->stride + left],
-                          block_width - left, block_height - top, pic->stride, ref->stride);
+
+    result += kvz_hor_sad(pic_data + top * pic->stride,
+                          ref_data + top * ref->stride,
+                          block_width, block_height - top,
+                          pic->stride, ref->stride,
+                          left, right);
+
   } else if (top && right) {
-    result += ver_sad(pic_data,
+    result += kvz_ver_sad(pic_data,
                       &ref_data[top * ref->stride],
                       block_width - right, top, pic->stride);

     result +=
cor_sad(&pic_data[block_width - right], &ref_data[top * ref->stride + (block_width - right - 1)], right, top, pic->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride], - &ref_data[top * ref->stride], - block_width - right, block_height - top, pic->stride, ref->stride); - result += hor_sad(&pic_data[top * pic->stride + (block_width - right)], - &ref_data[top * ref->stride + (block_width - right - 1)], - right, block_height - top, pic->stride, ref->stride); + + result += kvz_hor_sad(pic_data + top * pic->stride, + ref_data + top * ref->stride, + block_width, block_height - top, + pic->stride, ref->stride, + left, right); + } else if (bottom && left) { - result += hor_sad(pic_data, - &ref_data[left], - left, block_height - bottom, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[left], - &ref_data[left], - block_width - left, block_height - bottom, pic->stride, ref->stride); + result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom, + pic->stride, ref->stride, left, right); + result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
kvazaar-1.2.0.tar.gz/src/image.h -> kvazaar-1.3.0.tar.gz/src/image.h
Changed
@@ -29,6 +29,7 @@ #include "global.h" // IWYU pragma: keep #include "kvazaar.h" +#include "strategies/optimized_sad_func_ptr_t.h" typedef struct { @@ -81,7 +82,8 @@ int ref_x, int ref_y, int block_width, - int block_height); + int block_height, + optimized_sad_func_ptr_t optimized_sad); unsigned kvz_image_calc_satd(const kvz_picture *pic,
kvazaar-1.2.0.tar.gz/src/input_frame_buffer.c -> kvazaar-1.3.0.tar.gz/src/input_frame_buffer.c
Changed
@@ -58,6 +58,11 @@ const int gop_buf_size = 3 * cfg->gop_len; + bool is_closed_gop = false; + + // Check for closed gop, we need an extra frame in the buffer in this case + if (!cfg->open_gop && cfg->intra_period > 0 && cfg->gop_len > 0) is_closed_gop = true; + if (cfg->gop_len == 0 || cfg->gop_lowdelay) { // No reordering of output pictures necessary. @@ -94,11 +99,11 @@ buf->pts_buffer[buf_idx] = img_in->pts; buf->num_in++; - if (buf->num_in < cfg->gop_len) { + if (buf->num_in < cfg->gop_len + is_closed_gop ? 1 : 0) { // Not enough frames to start output. return 0; - } else if (buf->num_in == cfg->gop_len) { + } else if (buf->num_in == cfg->gop_len + is_closed_gop ? 1 : 0) { // Now we known the PTSs that are needed to compute the delay. buf->delay = buf->pts_buffer[gop_buf_size - 1] - img_in->pts; } @@ -109,7 +114,7 @@ return NULL; } - if (img_in == NULL && buf->num_in < cfg->gop_len) { + if (img_in == NULL && buf->num_in < cfg->gop_len + is_closed_gop ? 1 : 0) { // End of the sequence but we have less than a single GOP of frames. Use // the difference between the PTSs of the first and the last frame as the // delay. @@ -137,22 +142,35 @@ } else { gop_offset = (buf->num_out - 1) % cfg->gop_len; + + // For closed gop, calculate the gop_offset again + if (!cfg->open_gop && cfg->intra_period > 0) { + // Offset the GOP position for each extra I-frame added to the structure + // in closed gop case + int num_extra_frames = (buf->num_out - 1) / (cfg->intra_period + 1); + gop_offset = (buf->num_out - 1 - num_extra_frames) % cfg->gop_len; + } // Index of the first picture in the GOP that is being output. int gop_start_idx = buf->num_out - 1 - gop_offset; // Skip pictures until we find an available one. gop_offset += buf->gop_skipped; - for (;;) { - assert(gop_offset < cfg->gop_len); - idx_out = gop_start_idx + cfg->gop[gop_offset].poc_offset - 1; - if (idx_out < buf->num_in - 1) { - // An available picture found. 
- break; + // Every closed-gop IRAP handled here + if (is_closed_gop && (!cfg->open_gop && ((buf->num_out - 1) % (cfg->intra_period + 1)) == cfg->intra_period)) { + idx_out = gop_start_idx; + } else { + for (;;) { + assert(gop_offset < cfg->gop_len + is_closed_gop ? 1 : 0); + idx_out = gop_start_idx + cfg->gop[gop_offset].poc_offset - 1; + if (idx_out < buf->num_in - 1) { + // An available picture found. + break; + } + buf->gop_skipped++; + gop_offset++; } - buf->gop_skipped++; - gop_offset++; } if (buf->num_out < cfg->gop_len - 1) {
kvazaar-1.2.0.tar.gz/src/inter.c -> kvazaar-1.3.0.tar.gz/src/inter.c
Changed
@@ -29,6 +29,7 @@ #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "videoframe.h" +#include "strategies/strategies-picture.h" typedef struct { @@ -51,8 +52,6 @@ int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - #define FILTER_SIZE_Y 8 //Luma filter size - // Fractional luma 1/4-pel kvz_extended_block src = {0, 0, 0, 0}; @@ -66,7 +65,7 @@ ref->y, ref->width, ref->height, - FILTER_SIZE_Y, + KVZ_LUMA_FILTER_TAPS, block_width, block_height, &src); @@ -75,7 +74,7 @@ src.stride, block_width, block_height, - lcu->rec.y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), + lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, @@ -96,8 +95,6 @@ int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); -#define FILTER_SIZE_Y 8 //Luma filter size - // Fractional luma 1/4-pel kvz_extended_block src = { 0, 0, 0, 0 }; @@ -111,7 +108,7 @@ ref->y, ref->width, ref->height, - FILTER_SIZE_Y, + KVZ_LUMA_FILTER_TAPS, block_width, block_height, &src); @@ -120,7 +117,7 @@ src.stride, block_width, block_height, - hi_prec_out->y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), + hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, @@ -147,8 +144,6 @@ block_width >>= 1; block_height >>= 1; -#define FILTER_SIZE_C 4 //Chroma filter size - // Fractional chroma 1/8-pel kvz_extended_block src_u = { 0, 0, 0, 0 }; kvz_extended_block src_v = { 0, 0, 0, 0 }; @@ -162,7 +157,7 @@ ref->u, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_u); @@ -178,12 +173,12 @@ ref->v, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_v); kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, - block_height, lcu->rec.v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), 
LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); if (src_u.malloc_used) free(src_u.buffer); if (src_v.malloc_used) free(src_v.buffer); @@ -207,8 +202,6 @@ block_width >>= 1; block_height >>= 1; -#define FILTER_SIZE_C 4 //Chroma filter size - // Fractional chroma 1/8-pel kvz_extended_block src_u = { 0, 0, 0, 0 }; kvz_extended_block src_v = { 0, 0, 0, 0 }; @@ -223,7 +216,7 @@ ref->u, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_u); @@ -232,7 +225,7 @@ src_u.stride, block_width, block_height, - hi_prec_out->u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), + hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, @@ -248,7 +241,7 @@ ref->v, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_v); @@ -257,7 +250,7 @@ src_v.stride, block_width, block_height, - hi_prec_out->v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), + hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, @@ -306,27 +299,27 @@ /** - * \brief Reconstruct inter block + * \brief Reconstruct an inter PU using uniprediction. 
* * \param state encoder state * \param ref picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vector * \param lcu destination lcu - * \param hi_prec_out destination of high precision output (null if not needed) + * \param hi_prec_out destination of high precision output, or NULL if not needed */ -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) +static void inter_recon_unipred(const encoder_state_t * const state, + const kvz_picture * const ref, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + const int16_t mv_param[2], + lcu_t *lcu, + hi_prec_buf_t *hi_prec_out) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -426,36 +419,32 @@ } } } - /** - * \brief Reconstruct bi-pred inter block + * \brief Reconstruct bi-pred inter PU * * \param state encoder state * \param ref1 reference picture to copy the data from * \param ref2 other reference picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position
kvazaar-1.2.0.tar.gz/src/inter.h -> kvazaar-1.3.0.tar.gz/src/inter.h
Changed
@@ -40,26 +40,22 @@ } inter_merge_cand_t; +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width); -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t* lcu, - hi_prec_buf_t *hi_prec_out); +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu); -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu); void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t x,
kvazaar-1.2.0.tar.gz/src/kvazaar.c -> kvazaar-1.3.0.tar.gz/src/kvazaar.c
Changed
@@ -142,8 +142,8 @@ info->nal_unit_type = state->frame->pictype; info->slice_type = state->frame->slicetype; - memset(info->ref_list[0], 0, 16); - memset(info->ref_list[1], 0, 16); + memset(info->ref_list[0], 0, 16 * sizeof(int)); + memset(info->ref_list[1], 0, 16 * sizeof(int)); for (size_t i = 0; i < state->frame->ref_LX_size[0]; i++) { info->ref_list[0][i] = state->frame->ref->pocs[state->frame->ref_LX[0][i]];
kvazaar-1.2.0.tar.gz/src/kvazaar.h -> kvazaar-1.3.0.tar.gz/src/kvazaar.h
Changed
@@ -92,6 +92,7 @@ KVZ_IME_FULL16 = 4, //! \since 3.6.0 KVZ_IME_FULL32 = 5, //! \since 3.6.0 KVZ_IME_FULL64 = 6, //! \since 3.6.0 + KVZ_IME_DIA = 7, // Experimental. TODO: change into a proper doc comment }; /** @@ -206,6 +207,12 @@ KVZ_SAO_FULL = 3 }; +enum kvz_scalinglist { + KVZ_SCALING_LIST_OFF = 0, + KVZ_SCALING_LIST_CUSTOM = 1, + KVZ_SCALING_LIST_DEFAULT = 2, +}; + // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -322,6 +329,7 @@ uint8_t *optional_key; enum kvz_me_early_termination me_early_termination; /*!< \since 3.8.0 \brief Mode of me early termination. */ + int32_t intra_rdo_et; /*!< \since 4.1.0 \brief Use early termination in intra rdo. */ int32_t lossless; /*!< \brief Use lossless coding. */ @@ -351,6 +359,37 @@ * \brief Use adaptive QP for 360 video with equirectangular projection. */ int32_t erp_aqp; + + /** \brief The HEVC level */ + uint8_t level; + /** \brief Whether we ignore and just warn from all of the errors about the output not conforming to the level's requirements. */ + uint8_t force_level; + /** \brief Whether we use the high tier bitrates. Requires the level to be 4 or higher. */ + uint8_t high_tier; + /** \brief The maximum allowed bitrate for this level and tier. */ + uint32_t max_bitrate; + + /** \brief Maximum steps that hexagonal and diagonal motion estimation can use. -1 to disable */ + uint32_t me_max_steps; + + /** \brief Minimum QP that uses CABAC for residual cost instead of a fast estimate. 
*/ + int8_t fast_residual_cost_limit; + + /** \brief Set QP at CU level keeping pic_init_qp_minus26 in PPS zero */ + int8_t set_qp_in_cu; + + /** \brief Flag to enable/disable open GOP configuration */ + int8_t open_gop; + + /** \brief Type of scaling lists to use */ + int8_t scaling_list; + + /** \brief Maximum number of merge cadidates */ + uint8_t max_merge; + + /** \brief Enable Early Skip Mode Decision */ + uint8_t early_skip; + } kvz_config; /** @@ -359,7 +398,8 @@ * Function picture_alloc in kvz_api must be used for allocation. */ typedef struct kvz_picture { - kvz_pixel *fulldata; //!< \brief Allocated buffer (only used in the base_image) + kvz_pixel *fulldata_buf; //!< \brief Allocated buffer with padding (only used in the base_image) + kvz_pixel *fulldata; //!< \brief Allocated buffer portion that's actually used kvz_pixel *y; //!< \brief Pointer to luma pixel array. kvz_pixel *u; //!< \brief Pointer to chroma U pixel array.
kvazaar-1.2.0.tar.gz/src/rate_control.c -> kvazaar-1.3.0.tar.gz/src/rate_control.c
Changed
@@ -79,8 +79,8 @@ int pictures_coded = MAX(0, state->frame->num - encoder->cfg.owf); int gop_offset = (state->frame->gop_offset - encoder->cfg.owf) % MAX(1, encoder->cfg.gop_len); - // Only take fully coded GOPs into account. - if (encoder->cfg.gop_len > 0 && gop_offset != encoder->cfg.gop_len - 1) { + + if (encoder->cfg.gop_len > 0 && gop_offset != encoder->cfg.gop_len - 1 && encoder->cfg.gop_lp_definition.d == 0) { // Subtract number of bits in the partially coded GOP. bits_coded -= state->frame->cur_gop_bits_coded; // Subtract number of pictures in the partially coded GOP. @@ -293,7 +293,7 @@ int dqp = ctrl->cfg.roi.dqps[roi_index]; state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lamba(state, state->qp); - state->lambda_sqrt = sqrt(state->frame->lambda); + state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y);
kvazaar-1.2.0.tar.gz/src/rdo.c -> kvazaar-1.3.0.tar.gz/src/rdo.c
Changed
@@ -30,6 +30,7 @@ #include "imagelist.h" #include "inter.h" #include "scalinglist.h" +#include "strategyselector.h" #include "tables.h" #include "transform.h" @@ -41,8 +42,6 @@ #define LOG2_SCAN_SET_SIZE 4 #define SBH_THRESHOLD 4 -static const double COEFF_SUM_MULTIPLIER = 1.9; - const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; @@ -195,7 +194,6 @@ return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); } - /** * \brief Estimate bitcost for coding coefficients. * @@ -211,15 +209,17 @@ int32_t type, int8_t scan_mode) { - if (state->encoder_control->cfg.rdo > 0) { + if (state->qp >= state->encoder_control->cfg.fast_residual_cost_limit) { return get_coeff_cabac_cost(state, coeff, width, type, scan_mode); } else { - return COEFF_SUM_MULTIPLIER * kvz_coeff_abs_sum(coeff, width * width) + 0.5; + // Estimate coeff coding cost based on QP and sum of absolute coeffs. + // const uint32_t sum = kvz_coeff_abs_sum(coeff, width * width); + // return (uint32_t)(sum * (state->qp * COEFF_COST_QP_FACTOR + COEFF_COST_BIAS) + 0.5); + return kvz_fast_coeff_cost(coeff, width, state->qp); } } - #define COEF_REMAIN_BIN_REDUCTION 3 /** Calculates the cost for specific absolute transform level * \param abs_level scaled quantized level @@ -879,52 +879,23 @@ } } -/** MVD cost calculation with CABAC -* \returns int -* Calculates cost of actual motion vectors using CABAC coding -*/ +/** + * Calculate cost of actual motion vectors using CABAC coding + */ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - vector2d_t *mvd, - const cabac_data_t* real_cabac) + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - uint32_t bitcost = 0; - const int32_t mvd_hor = mvd->x; - const int32_t mvd_ver = mvd->y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs 
= abs(mvd_ver); + cabac_data_t cabac_copy = *cabac; + cabac_copy.only_count = 1; - cabac_data_t cabac_copy; - memcpy(&cabac_copy, real_cabac, sizeof(cabac_data_t)); - cabac_data_t *cabac = &cabac_copy; - cabac->only_count = 1; - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs > 1), "abs_mvd_greater1_flag_hor"); - } - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs > 1), "abs_mvd_greater1_flag_ver"); - } - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - // It is safe to drop const here because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1); - } - CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - // It is safe to drop const here because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1); - } - CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver"); - } - bitcost = ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)) - ((23 - real_cabac->bits_left) + (real_cabac->num_buffered_bytes << 3)); + // It is safe to drop const here because cabac->only_count is set. 
+ kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + + uint32_t bitcost = + ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - + ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); return bitcost; } @@ -946,8 +917,7 @@ cabac_data_t state_cabac_copy; cabac_data_t* cabac; uint32_t merge_idx; - int cand1_cost, cand2_cost; - vector2d_t mvd_temp1, mvd_temp2, mvd = { 0, 0 }; + vector2d_t mvd = { 0, 0 }; int8_t merged = 0; int8_t cur_mv_cand = 0; @@ -979,27 +949,30 @@ cabac = &state_cabac_copy; if (!merged) { - mvd_temp1.x = x - mv_cand[0][0]; - mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp1, cabac); - - mvd_temp2.x = x - mv_cand[1][0]; - mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp2, cabac); + vector2d_t mvd1 = { + x - mv_cand[0][0], + y - mv_cand[0][1], + }; + vector2d_t mvd2 = { + x - mv_cand[1][0], + y - mv_cand[1][1], + }; + uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { cur_mv_cand = 1; - mvd = mvd_temp2; + mvd = mvd2; } else { - mvd = mvd_temp1; + mvd = mvd1; } } cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); CABAC_BIN(cabac, merged, "MergeFlag"); - num_cand = MRG_MAX_NUM_CANDS; + num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { int32_t ui; @@ -1058,51 +1031,18 @@ // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { - const int32_t mvd_hor = mvd.x; - const int32_t mvd_ver = mvd.y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - 
CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs > 1), "abs_mvd_greater1_flag_hor"); - } - - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs > 1), "abs_mvd_greater1_flag_ver"); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - // It is safe to drop const because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); - } - - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) {
kvazaar-1.2.0.tar.gz/src/rdo.h -> kvazaar-1.3.0.tar.gz/src/rdo.h
Changed
@@ -39,7 +39,7 @@ void kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width, int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth); -uint32_t kvz_get_coeff_cost(const encoder_state_t *state, +uint32_t kvz_get_coeff_cost(const encoder_state_t * const state, const coeff_t *coeff, int32_t width, int32_t type, @@ -57,8 +57,9 @@ kvz_mvd_cost_func kvz_calc_mvd_cost_cabac; uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - vector2d_t *mvd, - const cabac_data_t* cabac); + const cabac_data_t* cabac, + int32_t mvd_hor, + int32_t mvd_ver); // Number of fixed point fractional bits used in the fractional bit table. #define CTX_FRAC_BITS 15
kvazaar-1.2.0.tar.gz/src/scalinglist.c -> kvazaar-1.3.0.tar.gz/src/scalinglist.c
Changed
@@ -102,6 +102,7 @@ } scaling_list->enable = 0; + scaling_list->use_default_list = 0; } /** @@ -397,9 +398,9 @@ for (size = 0; size < SCALING_LIST_SIZE_NUM; size++) { for (list = 0; list < kvz_g_scaling_list_num[size]; list++) { - const int32_t * const list_ptr = scaling_list->enable ? - scaling_list->scaling_list_coeff[size][list] : - kvz_scalinglist_get_default(size, list); + const int32_t * const list_ptr = scaling_list->use_default_list ? + kvz_scalinglist_get_default(size, list) : + scaling_list->scaling_list_coeff[size][list]; for (qp = 0; qp < SCALING_LIST_REM_NUM; qp++) { kvz_scalinglist_set(scaling_list, list_ptr, list, size, qp);
kvazaar-1.2.0.tar.gz/src/scalinglist.h -> kvazaar-1.3.0.tar.gz/src/scalinglist.h
Changed
@@ -33,6 +33,7 @@ typedef struct { int8_t enable; + int8_t use_default_list; int32_t scaling_list_dc [SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; const int32_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; const int32_t *quant_coeff[4][6][6];
kvazaar-1.2.0.tar.gz/src/search.c -> kvazaar-1.3.0.tar.gz/src/search.c
Changed
@@ -116,7 +116,7 @@ } } -void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) +void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) { const int x_local = SUB_SCU(x_px); const int y_local = SUB_SCU(y_px); @@ -138,6 +138,7 @@ to->type = cu->type; to->depth = cu->depth; to->part_size = cu->part_size; + to->qp = cu->qp; if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; @@ -152,7 +153,7 @@ } } -static void lcu_set_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) +static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) { const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; const int num_pu = kvz_part_mode_num_parts[part_mode]; @@ -169,7 +170,7 @@ } } -static void lcu_set_coeff(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu) +static void lcu_fill_cbf(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu) { const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth; const uint32_t mask = ~((width >> tr_split)-1); @@ -189,6 +190,40 @@ } +//Calculates cost for all zero coeffs +static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y, + const int depth) +{ + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + int cu_width = LCU_WIDTH >> depth; + lcu_t *const lcu = &work_tree[depth]; + + const int luma_index = y_local * LCU_WIDTH + x_local; + const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); + + double ssd = 0.0; + ssd += LUMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], + LCU_WIDTH, LCU_WIDTH, cu_width + ); + if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + 
&lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + } + // Save the pixels at a lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]); + + return ssd; +} + + /** * Calculate RD cost for a Coding Unit. * \return Cost of block @@ -368,6 +403,30 @@ } +/** + * \brief Sort modes and costs to ascending order according to costs. + */ +void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) +{ + // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about + // 60% of the time, so there should be no need for anything more complex + // than insertion sort. + // Length for merge is 5 or less. + for (uint8_t i = 1; i < length; ++i) { + const double cur_cost = costs[i]; + const int8_t cur_mode = modes[i]; + uint8_t j = i; + while (j > 0 && cur_cost < costs[j - 1]) { + costs[j] = costs[j - 1]; + modes[j] = modes[j - 1]; + --j; + } + costs[j] = cur_cost; + modes[j] = cur_mode; + } +} + + static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; @@ -392,6 +451,7 @@ const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; + double inter_zero_coeff_cost = MAX_INT; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; @@ -412,6 +472,7 @@ cur_cu->tr_depth = depth > 0 ? depth : 1; cur_cu->type = CU_NOTSET; cur_cu->part_size = SIZE_2Nx2N; + cur_cu->qp = state->qp; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
@@ -419,14 +480,17 @@ y + cu_width <= frame->height) { int cu_width_inter_min = LCU_WIDTH >> ctrl->cfg.pu_depth_inter.max; - bool can_use_inter = state->frame->slicetype != KVZ_SLICE_I && ( - WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || - // When the split was forced because the CTU is partially outside the - // frame, we permit inter coding even if pu_depth_inter would - // otherwise forbid it. - (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width || - (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height - ); + bool can_use_inter = + state->frame->slicetype != KVZ_SLICE_I && + depth <= MAX_DEPTH && + ( + WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || + // When the split was forced because the CTU is partially outside the + // frame, we permit inter coding even if pu_depth_inter would + // otherwise forbid it. + (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width || + (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height + ); if (can_use_inter) { double mode_cost; @@ -442,30 +506,31 @@ cur_cu->type = CU_INTER; } - // Try SMP and AMP partitioning. - static const part_mode_t mp_modes[] = { - // SMP - SIZE_2NxN, SIZE_Nx2N, - // AMP - SIZE_2NxnU, SIZE_2NxnD, - SIZE_nLx2N, SIZE_nRx2N, - }; - - const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; - const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; - for (int i = first_mode; i <= last_mode; ++i) { - kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1], - &mode_cost, &mode_bitcost); - // TODO: take cost of coding part mode into account - if (mode_cost < cost) { - cost = mode_cost; - inter_bitcost = mode_bitcost; - // TODO: only copy inter prediction info, not pixels - work_tree_copy_up(x_local, y_local, depth, work_tree); + if (!(ctrl->cfg.early_skip && cur_cu->skipped)) { + // Try SMP and AMP partitioning. 
+ static const part_mode_t mp_modes[] = { + // SMP + SIZE_2NxN, SIZE_Nx2N, + // AMP + SIZE_2NxnU, SIZE_2NxnD, + SIZE_nLx2N, SIZE_nRx2N, + }; + + const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; + const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; + for (int i = first_mode; i <= last_mode; ++i) { + kvz_search_cu_smp(state, + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); + if (mode_cost < cost) { + cost = mode_cost; + inter_bitcost = mode_bitcost; + // Copy inter prediction info to current level.
kvazaar-1.2.0.tar.gz/src/search.h -> kvazaar-1.3.0.tar.gz/src/search.h
Changed
@@ -31,6 +31,7 @@ #include "global.h" // IWYU pragma: keep #include "image.h" +void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); @@ -42,7 +43,7 @@ const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, lcu_t *const lcu); -void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); +void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); void kvz_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
kvazaar-1.2.0.tar.gz/src/search_inter.c -> kvazaar-1.3.0.tar.gz/src/search_inter.c
Changed
@@ -30,11 +30,12 @@
 #include "inter.h"
 #include "kvazaar.h"
 #include "rdo.h"
+#include "search.h"
 #include "strategies/strategies-ipol.h"
 #include "strategies/strategies-picture.h"
+#include "transform.h"
 #include "videoframe.h"
-
 typedef struct {
   encoder_state_t *state;
@@ -77,6 +78,13 @@
    * \brief Bit cost of best_mv
    */
   uint32_t best_bitcost;
+
+  /**
+   * \brief Possible optimized SAD implementation for the width, leave as
+   * NULL for arbitrary-width blocks
+   */
+  optimized_sad_func_ptr_t optimized_sad;
+
 } inter_search_info_t;
@@ -204,7 +212,8 @@
     info->state->tile->offset_x + info->origin.x + x,
     info->state->tile->offset_y + info->origin.y + y,
     info->width,
-    info->height
+    info->height,
+    info->optimized_sad
   );
   if (cost >= info->best_cost) return false;
@@ -261,8 +270,8 @@
   for (int i = 0; i < info->num_merge_cand; ++i) {
     if (info->merge_cand[i].dir == 3) continue;
     const vector2d_t merge_mv = {
-      info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2,
-      info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2
+      (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + 2) >> 2,
+      (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + 2) >> 2
     };
     if (merge_mv.x == mv.x && merge_mv.y == mv.y) {
       return true;
@@ -296,8 +305,8 @@
   for (unsigned i = 0; i < info->num_merge_cand; ++i) {
     if (info->merge_cand[i].dir == 3) continue;
-    int x = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2;
-    int y = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2;
+    int x = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + 2) >> 2;
+    int y = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + 2) >> 2;
     if (x == 0 && y == 0) continue;
@@ -307,32 +316,65 @@
 static uint32_t get_mvd_coding_cost(const encoder_state_t *state,
-                                    vector2d_t *mvd,
-                                    const cabac_data_t* cabac)
+                                    const cabac_data_t* cabac,
+                                    const int32_t mvd_hor,
+                                    const int32_t mvd_ver)
 {
   unsigned bitcost = 0;
-  const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) };
+  const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) };
-  bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.x > 0);
-  if (abs_mvd.x > 0) {
-    bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.x > 1);
-    if (abs_mvd.x > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x - 2) << CTX_FRAC_BITS;
-    }
-    bitcost += CTX_FRAC_ONE_BIT; // sign
+  bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS;
+  bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS;
+
+  // Round and shift back to integer bits.
+  return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS;
+}
+
+
+static int select_mv_cand(const encoder_state_t *state,
+                          int16_t mv_cand[2][2],
+                          int32_t mv_x,
+                          int32_t mv_y,
+                          uint32_t *cost_out)
+{
+  const bool same_cand =
+    (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]);
+
+  if (same_cand && !cost_out) {
+    // Pick the first one if both candidates are the same.
+    return 0;
   }
-  bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.y > 0);
-  if (abs_mvd.y > 0) {
-    bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.y > 1);
-    if (abs_mvd.y > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y - 2) << CTX_FRAC_BITS;
-    }
-    bitcost += CTX_FRAC_ONE_BIT; // sign
+  uint32_t (*mvd_coding_cost)(const encoder_state_t * const state,
+                              const cabac_data_t*,
+                              int32_t, int32_t);
+  if (state->encoder_control->cfg.mv_rdo) {
+    mvd_coding_cost = kvz_get_mvd_coding_cost_cabac;
+  } else {
+    mvd_coding_cost = get_mvd_coding_cost;
   }
-  // Round and shift back to integer bits.
-  return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS;
+  uint32_t cand1_cost = mvd_coding_cost(
+      state, &state->cabac,
+      mv_x - mv_cand[0][0],
+      mv_y - mv_cand[0][1]);
+
+  uint32_t cand2_cost;
+  if (same_cand) {
+    cand2_cost = cand1_cost;
+  } else {
+    cand2_cost = mvd_coding_cost(
+        state, &state->cabac,
+        mv_x - mv_cand[1][0],
+        mv_y - mv_cand[1][1]);
+  }
+
+  if (cost_out) {
+    *cost_out = MIN(cand1_cost, cand2_cost);
+  }
+
+  // Pick the second candidate if it has lower cost.
+  return cand2_cost < cand1_cost ? 1 : 0;
 }
@@ -348,10 +390,7 @@
 {
   uint32_t temp_bitcost = 0;
   uint32_t merge_idx;
-  int cand1_cost,cand2_cost;
-  vector2d_t mvd_temp1, mvd_temp2;
   int8_t merged = 0;
-  int8_t cur_mv_cand = 0;
   x *= 1 << mv_shift;
   y *= 1 << mv_shift;
@@ -371,20 +410,10 @@
   }
   // Check mvd cost only if mv is not merged
-  if(!merged) {
-    mvd_temp1.x = x - mv_cand[0][0];
-    mvd_temp1.y = y - mv_cand[0][1];
-    cand1_cost = get_mvd_coding_cost(state, &mvd_temp1, &state->cabac);
-
-    mvd_temp2.x = x - mv_cand[1][0];
-    mvd_temp2.y = y - mv_cand[1][1];
-    cand2_cost = get_mvd_coding_cost(state, &mvd_temp2, &state->cabac);
-
-    // Select candidate 1 if it has lower cost
-    if (cand2_cost < cand1_cost) {
-      cur_mv_cand = 1;
-    }
-    temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost;
+  if (!merged) {
+    uint32_t mvd_cost = 0;
+    select_mv_cand(state, mv_cand, x, y, &mvd_cost);
+    temp_bitcost += mvd_cost;
   }
   *bitcost = temp_bitcost;
   return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5);
@@ -442,6 +471,7 @@
 void kvz_tz_pattern_search(inter_search_info_t *info,
                            unsigned pattern_type,
                            const int iDist,
+                           vector2d_t mv,
                            int *best_dist)
 {
   assert(pattern_type < 4);
@@ -537,8 +567,6 @@
     };
   }
-  const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 };
-
   // Compute SAD values for all chosen points.
   int best_index = -1;
   for (int i = 0; i < n_points; i++) {
@@ -579,8 +607,9 @@
   const int iRaster = 5; // search distance limit and downsampling factor for step 3
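The refactor above folds the duplicated candidate-cost comparison into the new select_mv_cand helper. A minimal standalone sketch of the selection logic is below; the mvd_cost function here is a hypothetical stand-in for Kvazaar's get_mvd_coding_cost (the real one measures CABAC/Exp-Golomb bits), not the actual implementation:

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in cost model: cost grows with the magnitude of the
 * motion vector difference components. */
static uint32_t mvd_cost(int32_t mvd_hor, int32_t mvd_ver)
{
  return (uint32_t)(abs(mvd_hor) + abs(mvd_ver));
}

/* Pick the AMVP candidate whose MVD is cheaper to code; mirrors the shape
 * of select_mv_cand in the diff above. Returns the candidate index and
 * optionally the winning cost. */
static int select_mv_cand(const int16_t mv_cand[2][2],
                          int32_t mv_x, int32_t mv_y,
                          uint32_t *cost_out)
{
  uint32_t cand1_cost = mvd_cost(mv_x - mv_cand[0][0], mv_y - mv_cand[0][1]);
  uint32_t cand2_cost = mvd_cost(mv_x - mv_cand[1][0], mv_y - mv_cand[1][1]);
  if (cost_out) {
    *cost_out = cand1_cost < cand2_cost ? cand1_cost : cand2_cost;
  }
  // Pick the second candidate only if it is strictly cheaper.
  return cand2_cost < cand1_cost ? 1 : 0;
}
```

The strict "<" keeps candidate 0 on ties, matching the "Pick the second candidate if it has lower cost" comment in the patch.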
kvazaar-1.2.0.tar.gz/src/search_inter.h -> kvazaar-1.3.0.tar.gz/src/search_inter.h
Changed
@@ -32,17 +32,19 @@
 #include "inter.h"
 #include "kvazaar.h"
-#define FILTER_SIZE 8
-#define HALF_FILTER (FILTER_SIZE>>1)
+#define KVZ_LUMA_FILTER_TAPS 8
+#define KVZ_LUMA_FILTER_OFFSET 3
+#define KVZ_CHROMA_FILTER_TAPS 4
+#define KVZ_CHROMA_FILTER_OFFSET 1
-// Maximum extra width a block needs to filter
-// a fractional pixel with positive fractional mv.x and mv.y
-#define KVZ_EXT_PADDING (FILTER_SIZE - 1)
+ // Maximum extra width a block needs to filter
+ // a fractional pixel with positive fractional mv.x and mv.y
+#define KVZ_EXT_PADDING_LUMA (KVZ_LUMA_FILTER_TAPS - 1)
+#define KVZ_EXT_PADDING_CHROMA (KVZ_CHROMA_FILTER_TAPS - 1)
-// Maximum block width for extended block
-#define KVZ_EXT_BLOCK_W (LCU_WIDTH + KVZ_EXT_PADDING)
-
-typedef kvz_pixel frac_search_block[(LCU_WIDTH + 1) * (LCU_WIDTH + 1)];
+ // Maximum block width for extended block
+#define KVZ_EXT_BLOCK_W_LUMA (LCU_WIDTH + KVZ_EXT_PADDING_LUMA)
+#define KVZ_EXT_BLOCK_W_CHROMA (LCU_WIDTH_C + KVZ_EXT_PADDING_CHROMA)
 enum hpel_position {
   HPEL_POS_HOR = 0,
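The renamed macros above separate the 8-tap luma filter from the 4-tap chroma filter: an N-tap filter needs N - 1 extra pixels of padding around a block to produce fractional-pel samples. A small sketch of the same arithmetic, assuming the 64-pixel LCU that kvazaar uses (the macro names here are local stand-ins, not the kvazaar ones):

```c
#include <assert.h>

/* An N-tap FIR filter needs N - 1 extra pixels around a block. */
#define LUMA_TAPS    8
#define CHROMA_TAPS  4
#define LCU_WIDTH    64                 // assumed LCU size
#define LCU_WIDTH_C  (LCU_WIDTH / 2)    // chroma LCU width for 4:2:0

#define EXT_PADDING_LUMA    (LUMA_TAPS - 1)
#define EXT_PADDING_CHROMA  (CHROMA_TAPS - 1)
#define EXT_BLOCK_W_LUMA    (LCU_WIDTH + EXT_PADDING_LUMA)
#define EXT_BLOCK_W_CHROMA  (LCU_WIDTH_C + EXT_PADDING_CHROMA)
```

With these assumptions, the extended luma block is 71 pixels wide and the extended chroma block 35 pixels wide.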
kvazaar-1.2.0.tar.gz/src/search_intra.c -> kvazaar-1.3.0.tar.gz/src/search_intra.c
Changed
@@ -42,29 +42,6 @@
 /**
- * \brief Sort modes and costs to ascending order according to costs.
- */
-static INLINE void sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length)
-{
-  // Length is always between 5 and 23, and is either 21, 17, 9 or 8 about
-  // 60% of the time, so there should be no need for anything more complex
-  // than insertion sort.
-  for (uint8_t i = 1; i < length; ++i) {
-    const double cur_cost = costs[i];
-    const int8_t cur_mode = modes[i];
-    uint8_t j = i;
-    while (j > 0 && cur_cost < costs[j - 1]) {
-      costs[j] = costs[j - 1];
-      modes[j] = modes[j - 1];
-      --j;
-    }
-    costs[j] = cur_cost;
-    modes[j] = cur_mode;
-  }
-}
-
-
-/**
  * \brief Select mode with the smallest cost.
  */
 static INLINE uint8_t select_best_mode_index(const int8_t *modes, const double *costs, uint8_t length)
@@ -309,7 +286,7 @@
   if (depth == 0 || split_cost < nosplit_cost) {
     return split_cost;
   } else {
-    kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
+    kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
     pred_cu->cbf = nosplit_cbf;
@@ -367,7 +344,7 @@
     costs[i] += satd_func(pred, orig_block);
   }
-  sort_modes(modes, costs, 5);
+  kvz_sort_modes(modes, costs, 5);
 }
@@ -617,12 +594,21 @@
     FILL(pred_cu.cbf, 0);
     // Reset transform split data in lcu.cu for this area.
-    kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
+    kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
     double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
     costs[rdo_mode] += mode_cost;
+
+    // Early termination if no coefficients has to be coded
+    if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
+      modes_to_check = rdo_mode + 1;
+      break;
+    }
   }
+  // Update order according to new costs
+  kvz_sort_modes(modes, costs, modes_to_check);
+
   // The best transform split hierarchy is not saved anywhere, so to get the
   // transform split hierarchy the search has to be performed again with the
   // best mode.
@@ -844,8 +830,7 @@
   }
   // Set transform depth to current depth, meaning no transform splits.
-  kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
-  double best_rough_cost = costs[select_best_mode_index(modes, costs, number_of_modes)];
+  kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
   // Refine results with slower search or get some results if rough search was skipped.
   const int32_t rdo_level = state->encoder_control->cfg.rdo;
   if (rdo_level >= 2 || skip_rough_search) {
@@ -860,7 +845,7 @@
   }
   int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search);
-  sort_modes(modes, costs, number_of_modes);
+  kvz_sort_modes(modes, costs, number_of_modes);
   number_of_modes = search_intra_rdo(state,
                                      x_px, y_px, depth,
                                      ref_pixels, LCU_WIDTH,
@@ -872,5 +857,5 @@
   uint8_t best_mode_i = select_best_mode_index(modes, costs, number_of_modes);
   *mode_out = modes[best_mode_i];
-  *cost_out = skip_rough_search ? costs[best_mode_i]:best_rough_cost;
+  *cost_out = costs[best_mode_i];
 }
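The static sort_modes removed above survives as the shared kvz_sort_modes used by the --intra-rdo-et early-termination path. A self-contained copy of that insertion sort (the comment in the removed code explains why insertion sort suffices for these short mode lists):

```c
#include <assert.h>
#include <stdint.h>

/* Insertion sort of (mode, cost) pairs into ascending cost order, following
 * the sort_modes implementation removed in the diff above. */
static void sort_modes(int8_t *modes, double *costs, uint8_t length)
{
  for (uint8_t i = 1; i < length; ++i) {
    const double cur_cost = costs[i];
    const int8_t cur_mode = modes[i];
    uint8_t j = i;
    // Shift worse entries right until the slot for the current pair is found.
    while (j > 0 && cur_cost < costs[j - 1]) {
      costs[j] = costs[j - 1];
      modes[j] = modes[j - 1];
      --j;
    }
    costs[j] = cur_cost;
    modes[j] = cur_mode;
  }
}
```

Keeping the two arrays in lockstep is the point: after sorting, modes[0] is always the cheapest mode, which is what the re-sort after the RDO loop relies on.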
kvazaar-1.3.0.tar.gz/src/strategies/avx2/avx2_common_functions.h
Added
@@ -0,0 +1,114 @@
+#ifndef AVX2_COMMON_FUNCTIONS_H
+#define AVX2_COMMON_FUNCTIONS_H
+
+#include <immintrin.h>
+
+/*
+ * Reorder coefficients from raster to scan order
+ * Fun fact: Once upon a time, doing this in a loop looked like this:
+ * for (int32_t n = 0; n < width * height; n++) {
+ *   coef_reord[n] = coef[scan[n]];
+ *   q_coef_reord[n] = q_coef[scan[n]];
+ * }
+ */
+static INLINE void scanord_read_vector(const int16_t **__restrict coeffs, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs)
+{
+  // For vectorized reordering of coef and q_coef
+  const __m128i low128_shuffle_masks[3] = {
+    _mm_setr_epi8(10,11, 4, 5, 12,13, 0, 1, 6, 7, 14,15, 8, 9, 2, 3),
+    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5, 6, 7, 0, 1, 2, 3, 12,13, 14,15, 8, 9, 10,11),
+  };
+
+  const __m128i blend_masks[3] = {
+    _mm_setr_epi16( 0, 0, 0, -1, 0, 0, -1, -1),
+    _mm_setr_epi16( 0, 0, 0, 0, 0, 0, 0, 0),
+    _mm_setr_epi16( 0, 0, -1, -1, 0, 0, -1, -1),
+  };
+
+  const __m128i invec_rearr_masks_upper[3] = {
+    _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 6, 7, 10,11, 4, 5, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1, 8, 9, 4, 5, 12,13, 2, 3, 10,11, 6, 7, 14,15),
+  };
+
+  const __m128i invec_rearr_masks_lower[3] = {
+    _mm_setr_epi8(12,13, 6, 7, 0, 1, 2, 3, 14,15, 4, 5, 8, 9, 10,11),
+    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5, 12,13, 0, 1, 8, 9, 6, 7, 14,15, 2, 3, 10,11),
+  };
+
+  const size_t row_offsets[4] = {
+    scan[subpos] + width * 0,
+    scan[subpos] + width * 1,
+    scan[subpos] + width * 2,
+    scan[subpos] + width * 3,
+  };
+
+  for (int i = 0; i < n_bufs; i++) {
+    const int16_t *__restrict coeff = coeffs[i];
+
+    // NOTE: Upper means "higher in pixel order inside block", which implies
+    // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
+    // so upper 128b vector actually becomes the lower part of a 256-bit coeff
+    // vector and lower vector the higher part!
+    __m128d coeffs_d_upper;
+    __m128d coeffs_d_lower;
+
+    __m128i coeffs_upper;
+    __m128i coeffs_lower;
+
+    __m128i coeffs_rearr1_upper;
+    __m128i coeffs_rearr1_lower;
+
+    __m128i coeffs_rearr2_upper;
+    __m128i coeffs_rearr2_lower;
+
+    // Zeroing these is actually unnecessary, but the compiler will whine
+    // about uninitialized values otherwise
+    coeffs_d_upper = _mm_setzero_pd();
+    coeffs_d_lower = _mm_setzero_pd();
+
+    coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0]));
+    coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1]));
+
+    coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2]));
+    coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3]));
+
+    coeffs_upper = _mm_castpd_si128(coeffs_d_upper);
+    coeffs_lower = _mm_castpd_si128(coeffs_d_lower);
+
+    coeffs_lower = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);
+
+    coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);
+    coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);
+
+    coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);
+    coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);
+
+    // The Intel Intrinsics Guide talks about _mm256_setr_m128i but my headers
+    // lack such an instruction. What it does is essentially this anyway.
+    result_vecs[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),
+                                             coeffs_rearr2_lower,
+                                             1);
+  }
+}
+
+// If ints is completely zero, returns 16 in *first and -1 in *last
+static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
+{
+  // Note that nonzero_bytes will always have both bytes set for a set word
+  // even if said word only had one of its bytes set, because we're doing 16
+  // bit wide comparisons. No big deal, just shift results to the right by one
+  // bit to have the results represent indexes of first set words, not bytes.
+  // Another note, it has to use right shift instead of division to preserve
+  // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
+  const __m256i zero = _mm256_setzero_si256();
+
+  __m256i zeros = _mm256_cmpeq_epi16(ints, zero);
+  uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
+  *first = (     (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
+  *last  = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
+}
+
+#endif
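A scalar model of what the vectorized get_first_last_nz_int16 computes may help when reading the intrinsics: find the indexes of the first and last nonzero elements among 16 int16 values, with the all-zero sentinel values the AVX2 comment describes. This is a portable reference sketch, not kvazaar code:

```c
#include <assert.h>
#include <stdint.h>

/* Scalar reference for get_first_last_nz_int16: indexes of the first and
 * last nonzero elements of a 16-element int16 vector. An all-zero input
 * yields first = 16 and last = -1, matching the AVX2 convention. */
static void first_last_nz(const int16_t v[16], int32_t *first, int32_t *last)
{
  *first = 16;
  *last  = -1;
  for (int32_t i = 0; i < 16; ++i) {
    if (v[i] != 0) {
      if (*first == 16) *first = i;
      *last = i;
    }
  }
}
```

The AVX2 version gets the same answer from a single compare, movemask, and the tzcnt/lzcnt pair, which is why the right-shift trick for the all-zero case matters there.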
kvazaar-1.3.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.c
Added
@@ -0,0 +1,605 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "strategyselector.h"
+
+#if COMPILE_INTEL_AVX2
+#include "avx2_common_functions.h"
+#include "cabac.h"
+#include "context.h"
+#include "encode_coding_tree-avx2.h"
+#include "encode_coding_tree.h"
+#include "strategies/missing-intel-intrinsics.h"
+#include <immintrin.h>
+
+/*
+ * NOTE: Unlike SSE/AVX comparisons that would return 11 or 00 for gt/lte,
+ * this'll use 1x and 0x as bit patterns (x: garbage). A couple extra
+ * instructions will get you 11 and 00 if you need to use this as a mask
+ * somewhere at some point, but we don't need this right now.
+ *
+ * I'd love to draw a logic circuit here to describe this, but I can't. Two
+ * 2-bit uints can be compared for greaterness by first comparing their high
+ * bits using AND-NOT; (x AND (NOT y)) == 1 if x > y. If A_hi > B_hi, A > B.
+ * If A_hi == B_hi AND A_lo > B_lo, A > B. Otherwise, A <= B. It's really
+ * simple when drawn on paper, but quite messy on a general-purpose ALU. But
+ * look, just five instructions!
+ */
+static INLINE uint32_t u32vec_cmpgt_epu2(uint32_t a, uint32_t b)
+{
+  uint32_t a_gt_b = _andn_u32(b, a);
+  uint32_t a_ne_b = a ^ b;
+  uint32_t a_gt_b_sh = a_gt_b << 1;
+  uint32_t lobit_tiebrk_hi = _andn_u32(a_ne_b, a_gt_b_sh);
+  uint32_t res = a_gt_b | lobit_tiebrk_hi;
+  return res;
+}
+
+static INLINE uint32_t pack_16x16b_to_16x2b(__m256i src)
+{
+  /*
+   * For each 16-bit element in src:
+   * ABCD EFGH IJKL MNOP  Original elements
+   * 0000 0000 0000 00XY  Element clipped to [0, 3] using _mm256_min_epu16
+   * 0000 000X Y000 0000  Shift word to align LSBs across byte boundary
+   * 0000 0001 1000 0000  Comparison mask to be compared against
+   * XXXX XXXX YYYY YYYY  Comparison result, for movemask
+   */
+  const __m256i threes  = _mm256_set1_epi16 (3);
+  const __m256i cmpmask = _mm256_slli_epi16 (threes, 7); // 0x0180 (avoid set1)
+
+  __m256i  clipped = _mm256_min_epu16 (src, threes);
+  __m256i  shifted = _mm256_slli_epi16 (clipped, 7);
+  __m256i  cmpres  = _mm256_cmpeq_epi8 (shifted, cmpmask);
+  uint32_t result  = _mm256_movemask_epi8(cmpres);
+
+  return result;
+}
+
+/**
+ * \brief Context derivation process of coeff_abs_significant_flag,
+ *        parallelized to handle 16 coeffs at once
+ * \param pattern_sig_ctx pattern for current coefficient group
+ * \param scan_idx        pixel scan type in use
+ * \param pos_xs          column addresses of current scan positions
+ * \param pos_ys          row addresses of current scan positions
+ * \param block_type      log2 value of block size if square block, or 4 otherwise
+ * \param width           width of the block
+ * \param texture_type    texture type (TEXT_LUMA...)
+ * \returns               ctx_inc for current scan position
+ */
+static INLINE __m256i kvz_context_get_sig_ctx_inc_16x16b(int32_t pattern_sig_ctx, uint32_t scan_idx, __m256i pos_xs,
+                                                         __m256i pos_ys, int32_t block_type, int8_t texture_type)
+{
+  const __m256i zero = _mm256_set1_epi8(0);
+  const __m256i ff   = _mm256_set1_epi8(0xff);
+
+  const __m256i ones   = _mm256_set1_epi16(1);
+  const __m256i twos   = _mm256_set1_epi16(2);
+  const __m256i threes = _mm256_set1_epi16(3);
+
+  const __m256i ctx_ind_map[3] = {
+    _mm256_setr_epi16(
+      0, 2, 1, 6,
+      3, 4, 7, 6,
+      4, 5, 7, 8,
+      5, 8, 8, 8
+    ),
+    _mm256_setr_epi16(
+      0, 1, 4, 5,
+      2, 3, 4, 5,
+      6, 6, 8, 8,
+      7, 7, 8, 8
+    ),
+    _mm256_setr_epi16(
+      0, 2, 6, 7,
+      1, 3, 6, 7,
+      4, 4, 8, 8,
+      5, 5, 8, 8
+    ),
+  };
+
+  int16_t offset;
+  if (block_type == 3)
+    if (scan_idx == SCAN_DIAG)
+      offset = 9;
+    else
+      offset = 15;
+  else
+    if (texture_type == 0)
+      offset = 21;
+    else
+      offset = 12;
+
+  __m256i offsets = _mm256_set1_epi16(offset);
+
+  // This will only ever be compared to 0, 1 and 2, so it's fine to cast down
+  // to 16b (and it should never be above 3 anyways)
+  __m256i pattern_sig_ctxs = _mm256_set1_epi16((int16_t)(MIN(0xffff, pattern_sig_ctx)));
+  __m256i pattern_sig_ctxs_eq_zero = _mm256_cmpeq_epi16(pattern_sig_ctxs, zero);
+  __m256i pattern_sig_ctxs_eq_one  = _mm256_cmpeq_epi16(pattern_sig_ctxs, ones);
+  __m256i pattern_sig_ctxs_eq_two  = _mm256_cmpeq_epi16(pattern_sig_ctxs, twos);
+
+  __m256i pattern_sig_ctxs_eq_1or2 = _mm256_or_si256 (pattern_sig_ctxs_eq_one,
+                                                      pattern_sig_ctxs_eq_two);
+  __m256i pattern_sig_ctxs_lt3     = _mm256_or_si256 (pattern_sig_ctxs_eq_1or2,
+                                                      pattern_sig_ctxs_eq_zero);
+  __m256i pattern_sig_ctxs_other   = _mm256_xor_si256(pattern_sig_ctxs_lt3,
+                                                      ff);
+  __m256i x_plus_y      = _mm256_add_epi16 (pos_xs, pos_ys);
+  __m256i x_plus_y_zero = _mm256_cmpeq_epi16(x_plus_y, zero); // All these should be 0, preempts block_type_two rule
+
+  __m256i texture_types = _mm256_set1_epi16((int16_t)texture_type);
+
+  __m256i block_types     = _mm256_set1_epi16((int16_t)block_type);
+  __m256i block_type_two  = _mm256_cmpeq_epi16(block_types, twos); // All these should be ctx_ind_map[4 * pos_y + pos_x];
+  __m256i bt2_vals        = ctx_ind_map[scan_idx];
+  __m256i bt2_vals_masked = _mm256_and_si256(bt2_vals, block_type_two);
+
+  __m256i pos_xs_in_subset = _mm256_and_si256(pos_xs, threes);
+  __m256i pos_ys_in_subset = _mm256_and_si256(pos_ys, threes);
+
+  __m256i cg_pos_xs     = _mm256_srli_epi16(pos_xs, 2);
+  __m256i cg_pos_ys     = _mm256_srli_epi16(pos_ys, 2);
+  __m256i cg_pos_xysums = _mm256_add_epi16 (cg_pos_xs, cg_pos_ys);
+
+  __m256i pos_xy_sums_in_subset = _mm256_add_epi16(pos_xs_in_subset, pos_ys_in_subset);
+
+  /*
+   * if (pattern_sig_ctx == 0) {
+   *   switch (pos_x_in_subset + pos_y_in_subset) {
+   *     case 0:
+   *       cnt = 2;
+   *       break;
+   *     case 1:
+   *     case 2:
+   *       cnt = 1;
+   *       break;
+   *     default:
+   *       cnt = 0;
+   *   }
+   * }
+   *
+   * Equivalent to:
+   *
+   * if (pattern_sig_ctx == 0) {
+   *   subamt = cnt <= 1 ? 1 : 0;
+   *   pxyis_max3 = min(3, pos_x_in_subset + pos_y_in_subset);
+   *   cnt = (3 - pxyis_max3) - subamt;
+   * }
+   */
+  __m256i pxyis_lte_1     = _mm256_cmpgt_epi16(twos, pos_xy_sums_in_subset);
+  __m256i subamts         = _mm256_and_si256 (pxyis_lte_1, ones);
+  __m256i pxyis_max3      = _mm256_min_epu16 (pos_xy_sums_in_subset, threes);
+  __m256i cnts_tmp        = _mm256_sub_epi16 (threes, pxyis_max3);
+  __m256i cnts_sig_ctx_0  = _mm256_sub_epi16 (cnts_tmp, subamts);
+  __m256i cnts_sc0_masked = _mm256_and_si256 (cnts_sig_ctx_0, pattern_sig_ctxs_eq_zero);
+
+  /*
+   * if (pattern_sig_ctx == 1 || pattern_sig_ctx == 2) {
+   *   if (pattern_sig_ctx == 1)
+   *     subtrahend = pos_y_in_subset;
+   *   else
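The five-instruction 2-bit compare described in the comment block of this file can be checked exhaustively with a plain-C version. This sketch re-expresses the intrinsics (`_andn_u32(x, y)` is `y & ~x`) and verifies that, for each 2-bit lane, the high result bit is set exactly when a's lane is greater than b's lane; the low result bit is garbage, as the source comment notes:

```c
#include <assert.h>
#include <stdint.h>

/* Plain-C version of u32vec_cmpgt_epu2: 16 parallel unsigned compares of
 * 2-bit lanes packed in a 32-bit word. High bit of each lane in the result
 * is 1 iff a's lane > b's lane; the low bit is undefined. */
static uint32_t u32vec_cmpgt_epu2(uint32_t a, uint32_t b)
{
  uint32_t a_gt_b       = a & ~b;              // per-bit: a AND NOT b
  uint32_t a_ne_b       = a ^ b;
  uint32_t a_gt_b_sh    = a_gt_b << 1;         // low-bit result moved to the high position
  uint32_t lobit_tiebrk = a_gt_b_sh & ~a_ne_b; // low bits decide only when high bits are equal
  return a_gt_b | lobit_tiebrk;
}
```

The shift can leak a lane's high compare bit into the next lane's low bit, but since only the high bit of each lane is defined output, that is harmless.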
kvazaar-1.3.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.h
Added
@@ -0,0 +1,42 @@
+#ifndef ENCODE_CODING_TREE_AVX2_H_
+#define ENCODE_CODING_TREE_AVX2_H_
+
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \file
+ * Functions for writing the coding quadtree and related syntax.
+ */
+
+#include "encoderstate.h"
+#include "global.h"
+
+void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state,
+                               cabac_data_t * const cabac,
+                               const coeff_t *coeff,
+                               uint8_t width,
+                               uint8_t type,
+                               int8_t scan_mode,
+                               int8_t tr_skip);
+
+int kvz_strategy_register_encode_avx2(void* opaque, uint8_t bitdepth);
+
+#endif // ENCODE_CODING_TREE_AVX2_H_
kvazaar-1.2.0.tar.gz/src/strategies/avx2/ipol-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/ipol-avx2.c
Changed
@@ -31,1338 +31,1422 @@
 #include "encoder.h"
 #include "kvazaar.h"
+#include "search_inter.h"
 #include "strategies/generic/picture-generic.h"
 #include "strategies/strategies-ipol.h"
 #include "strategyselector.h"
 #include "strategies/generic/ipol-generic.h"
-#define FILTER_OFFSET 3
-#define FILTER_SIZE 8
-
-#define MAX_HEIGHT (4 * (LCU_WIDTH + 1) + FILTER_SIZE)
-#define MAX_WIDTH ((LCU_WIDTH + 1) + FILTER_SIZE)
-
 extern int8_t kvz_g_luma_filter[4][8];
 extern int8_t kvz_g_chroma_filter[8][4];
-void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23, __m128i *data45, __m128i *data67, __m128i *filter, __m128i *dst)
+static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data)
 {
-  __m128i a, b, c, d;
-  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter));
-
-  a = _mm_maddubs_epi16(*data01, fir);
-  b = _mm_maddubs_epi16(*data23, fir);
-  a = _mm_hadd_epi16(a, b);
-
-  c = _mm_maddubs_epi16(*data45, fir);
-  d = _mm_maddubs_epi16(*data67, fir);
-  c = _mm_hadd_epi16(c, d);
-
-  a = _mm_hadd_epi16(a, c);
+  __m128i fir = _mm_loadl_epi64((__m128i*)filter);
+  __m128i row = _mm_loadl_epi64((__m128i*)data);
+  __m128i acc;
+  acc = _mm_maddubs_epi16(row, fir);
+  __m128i temp = _mm_srli_si128(acc, 4);
+  acc = _mm_add_epi16(acc, temp);
+  temp = _mm_srli_si128(acc, 2);
+  acc = _mm_add_epi16(acc, temp);
+  int32_t filtered = _mm_cvtsi128_si32(acc);
+
+  return filtered;
+}
-  _mm_storeu_si128(dst, a);
+static void kvz_init_shuffle_masks(__m256i *shuf_01_23, __m256i *shuf_45_67) {
+  // Shuffle pairs
+  *shuf_01_23 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+                                 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+  *shuf_45_67 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+                                 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
 }
-static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *filter, int32_t offset23, int32_t shift23)
-{
-  __m128i temp[8];
-  __m128i temp_lo;
-  __m128i temp_hi;
-  __m128i fir = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter));
-
-  temp[0] = _mm_madd_epi16(row[0], fir);
-  temp[1] = _mm_madd_epi16(row[1], fir);
-  temp_lo = _mm_unpacklo_epi32(temp[0], temp[1]);
-  temp_hi = _mm_unpackhi_epi32(temp[0], temp[1]);
-  temp[0] = _mm_add_epi32(temp_lo, temp_hi);
-
-  temp[2] = _mm_madd_epi16(row[2], fir);
-  temp[3] = _mm_madd_epi16(row[3], fir);
-  temp_lo = _mm_unpacklo_epi32(temp[2], temp[3]);
-  temp_hi = _mm_unpackhi_epi32(temp[2], temp[3]);
-  temp[2] = _mm_add_epi32(temp_lo, temp_hi);
-
-  temp[4] = _mm_madd_epi16(row[4], fir);
-  temp[5] = _mm_madd_epi16(row[5], fir);
-  temp_lo = _mm_unpacklo_epi32(temp[4], temp[5]);
-  temp_hi = _mm_unpackhi_epi32(temp[4], temp[5]);
-  temp[4] = _mm_add_epi32(temp_lo, temp_hi);
-
-  temp[6] = _mm_madd_epi16(row[6], fir);
-  temp[7] = _mm_madd_epi16(row[7], fir);
-  temp_lo = _mm_unpacklo_epi32(temp[6], temp[7]);
-  temp_hi = _mm_unpackhi_epi32(temp[6], temp[7]);
-  temp[6] = _mm_add_epi32(temp_lo, temp_hi);
-
-  temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]);
-  temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]);
-  temp[0] = _mm_add_epi32(temp_lo, temp_hi);
-  temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0));
-
-  temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]);
-  temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]);
-  temp[4] = _mm_add_epi32(temp_lo, temp_hi);
-  temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0));
-
-  __m128i add = _mm_set1_epi32(offset23);
-  temp[0] = _mm_add_epi32(temp[0], add);
-  temp[4] = _mm_add_epi32(temp[4], add);
-  temp[0] = _mm_srai_epi32(temp[0], shift23);
-  temp[4] = _mm_srai_epi32(temp[4], shift23);
-
-  temp[0] = _mm_packus_epi32(temp[0], temp[4]);
-  temp[0] = _mm_packus_epi16(temp[0], temp[0]);
-
-  return temp[0];
+static void kvz_init_shuffle_masks_chroma(__m256i *shuf_01, __m256i *shuf_23) {
+  // Shuffle pairs
+  *shuf_01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12,
+                              0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
+  *shuf_23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14,
+                              2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
 }
-static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t *filter[2], int32_t offset23, int32_t shift23)
-{
-  __m256i temp[8];
-  __m256i temp_lo;
-  __m256i temp_hi;
-  __m256i fir = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)filter[0]), _mm_loadl_epi64((__m128i*)filter[1])));
-
-  temp[0] = _mm256_madd_epi16(row[0], fir);
-  temp[1] = _mm256_madd_epi16(row[1], fir);
-  temp_lo = _mm256_unpacklo_epi32(temp[0], temp[1]);
-  temp_hi = _mm256_unpackhi_epi32(temp[0], temp[1]);
-  temp[0] = _mm256_add_epi32(temp_lo, temp_hi);
-
-  temp[2] = _mm256_madd_epi16(row[2], fir);
-  temp[3] = _mm256_madd_epi16(row[3], fir);
-  temp_lo = _mm256_unpacklo_epi32(temp[2], temp[3]);
-  temp_hi = _mm256_unpackhi_epi32(temp[2], temp[3]);
-  temp[2] = _mm256_add_epi32(temp_lo, temp_hi);
-
-  temp[4] = _mm256_madd_epi16(row[4], fir);
-  temp[5] = _mm256_madd_epi16(row[5], fir);
-  temp_lo = _mm256_unpacklo_epi32(temp[4], temp[5]);
-  temp_hi = _mm256_unpackhi_epi32(temp[4], temp[5]);
-  temp[4] = _mm256_add_epi32(temp_lo, temp_hi);
-
-  temp[6] = _mm256_madd_epi16(row[6], fir);
-  temp[7] = _mm256_madd_epi16(row[7], fir);
-  temp_lo = _mm256_unpacklo_epi32(temp[6], temp[7]);
-  temp_hi = _mm256_unpackhi_epi32(temp[6], temp[7]);
-  temp[6] = _mm256_add_epi32(temp_lo, temp_hi);
-
-  temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]);
-  temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]);
-  temp[0] = _mm256_add_epi32(temp_lo, temp_hi);
-  temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0));
-
-  temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]);
-  temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]);
-  temp[4] = _mm256_add_epi32(temp_lo, temp_hi);
-  temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0));
-
-  __m256i add = _mm256_set1_epi32(offset23);
-  temp[0] = _mm256_add_epi32(temp[0], add);
-  temp[4] = _mm256_add_epi32(temp[4], add);
-  temp[0] = _mm256_srai_epi32(temp[0], shift23);
-  temp[4] = _mm256_srai_epi32(temp[4], shift23);
-
-  temp[0] = _mm256_packus_epi32(temp[0], temp[4]);
-  temp[0] = _mm256_packus_epi16(temp[0], temp[0]);
-
-  return temp[0];
+static void kvz_init_filter_taps(int8_t *filter,
+                                 __m256i *taps_01_23, __m256i *taps_45_67) {
+  // Filter weights
+  __m256i all_taps = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter));
+  __m256i perm_01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
+  __m256i perm_23 = _mm256_setr_epi32(2, 2, 2, 2, 3, 3, 3, 3);
+  all_taps = _mm256_unpacklo_epi16(all_taps, all_taps);
+  *taps_01_23 = _mm256_permutevar8x32_epi32(all_taps, perm_01);
+  *taps_45_67 = _mm256_permutevar8x32_epi32(all_taps, perm_23);
 }
-/*
-static __m128i kvz_eight_tap_filter_flip_x8_avx2(__m128i *row, int8_t *filter, int32_t shift1)
-{
-  __m128i temp[4];
-  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter));
-
-  temp[0] = _mm_unpacklo_epi64(row[0], row[1]);
-  temp[0] = _mm_maddubs_epi16(temp[0], fir);
+static void kvz_init_filter_taps_chroma(int8_t *filter,
+                                        __m256i *taps_01, __m256i *taps_23) {
+  // Filter weights
+  __m256i all_taps = _mm256_set1_epi32(*(int32_t*)filter);
+  all_taps = _mm256_unpacklo_epi16(all_taps, all_taps);
+  *taps_01 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0));
+  *taps_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1));
+}
-  temp[1] = _mm_unpacklo_epi64(row[2], row[3]);
-  temp[1] = _mm_maddubs_epi16(temp[1], fir);
+static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) {
+  for (int i = 0; i < 4; ++i) filters[i] = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&filter[2 * i]));
+  filters[0] = _mm256_inserti128_si256(filters[0], _mm256_castsi256_si128(filters[3]), 1); // Pairs 01 67
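The new kvz_eight_tap_filter_hor_avx2 above is just a dot product of eight consecutive pixels with eight signed filter taps, computed with maddubs plus two horizontal-add shifts. A scalar reference of that dot product (portable sketch, not the kvazaar implementation) makes the intrinsic sequence easy to check against:

```c
#include <assert.h>
#include <stdint.h>

/* Scalar reference for the horizontal 8-tap filter: dot product of eight
 * consecutive (unsigned) pixels with eight signed taps. */
static int32_t eight_tap_filter_hor(const int8_t filter[8], const uint8_t data[8])
{
  int32_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += (int32_t)filter[i] * (int32_t)data[i];
  }
  return sum;
}
```

With an identity-style tap set such as {0, 0, 0, 64, 0, 0, 0, 0} (the shape HEVC uses for the full-pel position), the result is simply 64 times the fourth input pixel.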
kvazaar-1.2.0.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/picture-avx2.c
Changed
@@ -21,17 +21,59 @@ /* * \file */ -#include "strategies/avx2/picture-avx2.h" + +#include "global.h" #if COMPILE_INTEL_AVX2 +#include "strategies/avx2/picture-avx2.h" +#include "strategies/avx2/reg_sad_pow2_widths-avx2.h" + #include <immintrin.h> +#include <emmintrin.h> +#include <mmintrin.h> +#include <xmmintrin.h> #include <string.h> - #include "kvazaar.h" #include "strategies/strategies-picture.h" #include "strategyselector.h" #include "strategies/generic/picture-generic.h" +/** + * \brief Calculate Sum of Absolute Differences (SAD) + * + * Calculate Sum of Absolute Differences (SAD) between two rectangular regions + * located in arbitrary points in the picture. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. + * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param stride Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int width, const int height, const unsigned stride1, const unsigned stride2) +{ + if (width == 0) + return 0; + if (width == 4) + return reg_sad_w4(data1, data2, height, stride1, stride2); + if (width == 8) + return reg_sad_w8(data1, data2, height, stride1, stride2); + if (width == 12) + return reg_sad_w12(data1, data2, height, stride1, stride2); + if (width == 16) + return reg_sad_w16(data1, data2, height, stride1, stride2); + if (width == 24) + return reg_sad_w24(data1, data2, height, stride1, stride2); + if (width == 32) + return reg_sad_w32(data1, data2, height, stride1, stride2); + if (width == 64) + return reg_sad_w64(data1, data2, height, stride1, stride2); + else + return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2); +} /** * \brief Calculate SAD for 8x8 bytes in continuous memory. 
@@ -484,13 +526,13 @@ } static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4], - const int strides[4], + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned costs[4]) { // TODO: AVX2 implementation - kvz_satd_4x4_subblock_quad_generic(preds, strides, orig, orig_stride, costs); + kvz_satd_4x4_subblock_quad_generic(preds, stride, orig, orig_stride, costs); } static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2) @@ -508,13 +550,13 @@ } static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds, - const int *strides, + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned *costs) { - kvz_satd_8bit_8x8_general_dual_avx2(preds[0], strides[0], preds[1], strides[1], orig, orig_stride, &costs[0], &costs[1]); - kvz_satd_8bit_8x8_general_dual_avx2(preds[2], strides[2], preds[3], strides[3], orig, orig_stride, &costs[2], &costs[3]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[0], stride, preds[1], stride, orig, orig_stride, &costs[0], &costs[1]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[2], stride, preds[3], stride, orig, orig_stride, &costs[2], &costs[3]); } SATD_NxN(8bit_avx2, 8) @@ -577,7 +619,7 @@ static void satd_any_size_ ## suffix ( \ int width, int height, \ const kvz_pixel **preds, \ - const int *strides, \ + const int stride, \ const kvz_pixel *orig, \ const int orig_stride, \ unsigned num_modes, \ @@ -591,7 +633,7 @@ if (width % 8 != 0) { \ /* Process the first column using 4x4 blocks. */ \ for (int y = 0; y < height; y += 4) { \ - kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \ } \ orig_ptr += 4; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ @@ -602,23 +644,23 @@ if (height % 8 != 0) { \ /* Process the first row using 4x4 blocks. 
*/ \ for (int x = 0; x < width; x += 4 ) { \ - kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ } \ orig_ptr += 4 * orig_stride; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ - pred_ptrs[blk] += 4 * strides[blk]; \ + pred_ptrs[blk] += 4 * stride; \ }\ height -= 4; \ } \ /* The rest can now be processed with 8x8 blocks. */ \ for (int y = 0; y < height; y += 8) { \ orig_ptr = &orig[y * orig_stride]; \ - pred_ptrs[0] = &preds[0][y * strides[0]]; \ - pred_ptrs[1] = &preds[1][y * strides[1]]; \ - pred_ptrs[2] = &preds[2][y * strides[2]]; \ - pred_ptrs[3] = &preds[3][y * strides[3]]; \ + pred_ptrs[0] = &preds[0][y * stride]; \ + pred_ptrs[1] = &preds[1][y * stride]; \ + pred_ptrs[2] = &preds[2][y * stride]; \ + pred_ptrs[3] = &preds[3][y * stride]; \ for (int x = 0; x < width; x += 8) { \ - satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ orig_ptr += 8; \ pred_ptrs[0] += 8; \ pred_ptrs[1] += 8; \ @@ -714,8 +756,570 @@ } } -#endif //COMPILE_INTEL_AVX2 +static void inter_recon_bipred_no_mov_avx2( + const int height, + const int width, + const int ypos, + const int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) { + + // This function is used only when kvazaar can't find any movement from the current block + int y_in_lcu, x_in_lcu; + __m256i sample0_epi8, sample1_epi8, temp_y_epi8; + int32_t * pointer = 0; + + for (int temp_y = 0; temp_y < height; temp_y += 1) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + + for (int temp_x = 0; temp_x < width; temp_x += 32) { + + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + switch (width) + { + + case 4: + + sample0_epi8 = 
_mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 8: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + break; +
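Reviewer note: the new `kvz_reg_sad_avx2` dispatcher routes each block width to a width-specialized kernel so every kernel can use full-width vector loads. All of those kernels compute the same quantity, sketched here in plain scalar C (a reference model, not kvazaar code; `reg_sad_ref` is a hypothetical name):

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the SAD the width-specialized AVX2 kernels compute:
 * the sum of |data1 - data2| over a width x height region, where each
 * buffer has its own row stride. */
static uint32_t reg_sad_ref(const uint8_t *data1, const uint8_t *data2,
                            int width, int height,
                            unsigned stride1, unsigned stride2)
{
  uint32_t sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      int d = (int)data1[y * stride1 + x] - (int)data2[y * stride2 + x];
      sad += (uint32_t)abs(d);
    }
  }
  return sad;
}
```

The AVX2 version only adds a width switch on top of this so that, e.g., a 32-wide block maps to one `_mm256_sad_epu8` per row pair.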
kvazaar-1.2.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -28,6 +28,7 @@ #include <immintrin.h> #include <stdlib.h> +#include "avx2_common_functions.h" #include "cu.h" #include "encoder.h" #include "encoderstate.h" @@ -40,17 +41,316 @@ #include "tables.h" #include "transform.h" +static INLINE int32_t hsum32_8x32i(__m256i src) +{ + __m128i a = _mm256_extracti128_si256(src, 0); + __m128i b = _mm256_extracti128_si256(src, 1); + + a = _mm_add_epi32(a, b); + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + + a = _mm_add_epi32(a, b); + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)); + + a = _mm_add_epi32(a, b); + return _mm_cvtsi128_si32(a); +} + +static INLINE int32_t hsum32_16x16i(__m256i src) +{ + __m128i a = _mm256_extracti128_si256(src, 0); + __m128i b = _mm256_extracti128_si256(src, 1); + __m256i c = _mm256_cvtepi16_epi32(a); + __m256i d = _mm256_cvtepi16_epi32(b); + + c = _mm256_add_epi32(c, d); + return hsum32_8x32i(c); +} + +// Rearranges a 16x32b double vector into a format suitable for a stable SIMD +// max algorithm: +// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp) +static INLINE void rearrange_512(__m256i *hi, __m256i *lo) +{ + const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + __m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask); + __m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask); + + *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31); + *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20); +} + +static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo, + __m256i ns, __m256i changes, + int16_t *final_change, int32_t *min_pos) +{ + // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs, + // to have the same data layout as in costs. Zero extend to 32b width, shift + // changes 16 bits to the left, and store them into the same vectors. 
+ __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes); + __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes); + + __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31); + __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20); + + // Reorder to afford result stability (if multiple atoms tie for cheapest, + // rightmost ie. the highest is the wanted one) + rearrange_512(&costs_hi, &costs_lo); + rearrange_512(&pl1hi, &pl1lo); + + // 0: pick hi, 1: pick lo (equality evaluates as 0) + __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo); + __m256i cost1 = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1); + __m256i pl1_1 = _mm256_blendv_epi8(pl1hi, pl1lo, cmpmask1); + + __m256i cost2 = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i pl1_2 = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1)); + + __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1); + __m256i cost3 = _mm256_blendv_epi8(cost2, cost1, cmpmask2); + __m256i pl1_3 = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2); + + __m256i cost4 = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i pl1_4 = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3); + __m256i cost5 = _mm256_blendv_epi8(cost4, cost3, cmpmask3); + __m256i pl1_5 = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3); + + __m256i cost6 = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i pl1_6 = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5); + __m256i pl1_7 = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4); + + __m128i res1_128 = _mm256_castsi256_si128(pl1_7); + uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0); + uint16_t n = (uint16_t)(tmp1 & 0xffff); + uint16_t chng = (uint16_t)(tmp1 >> 16); + + *final_change = (int16_t)chng; + *min_pos = (int32_t)n; +} + +static INLINE __m256i concatenate_2x128i(__m128i lo, __m128i hi) +{ + __m256i v = _mm256_castsi128_si256(lo); + 
return _mm256_inserti128_si256(v, hi, 1); +} + +static INLINE void scanord_read_vector_32(const int32_t *__restrict quant_coeff, + const uint32_t *__restrict scan, + int8_t scan_mode, + int32_t subpos, + int32_t width, + __m256i *__restrict v_quant_coeffs) +{ + const size_t row_offsets[4] = { + scan[subpos] + width * 0, + scan[subpos] + width * 1, + scan[subpos] + width * 2, + scan[subpos] + width * 3, + }; + + const __m256i shufmasks[3] = { + _mm256_setr_epi32(5, 2, 6, 0, 3, 7, 4, 1), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(2, 3, 0, 1, 6, 7, 4, 5), + }; + + const __m256i blend_masks[3] = { + _mm256_setr_epi32( 0, 0, 0, -1, 0, 0, -1, -1), + _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0), + _mm256_setr_epi32( 0, 0, -1, -1, 0, 0, -1, -1), + }; + + const __m256i rearr_masks_lo[3] = { + _mm256_setr_epi32(0, 4, 1, 3, 5, 2, 6, 7), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(0, 4, 2, 6, 1, 5, 3, 7), + }; + + const __m256i rearr_masks_hi[3] = { + _mm256_setr_epi32(6, 3, 0, 1, 7, 2, 4, 5), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(2, 6, 0, 4, 3, 7, 1, 5), + }; + + __m128i coeffs[4] = { + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[0])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[1])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[2])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[3])), + }; + + __m256i coeffs_upper = concatenate_2x128i(coeffs[0], coeffs[1]); + __m256i coeffs_lower = concatenate_2x128i(coeffs[2], coeffs[3]); + + __m256i lower_shuffled = _mm256_permutevar8x32_epi32(coeffs_lower, shufmasks[scan_mode]); + + __m256i upper_blended = _mm256_blendv_epi8(coeffs_upper, lower_shuffled, blend_masks[scan_mode]); + __m256i lower_blended = _mm256_blendv_epi8(lower_shuffled, coeffs_upper, blend_masks[scan_mode]); + + __m256i result_lo = _mm256_permutevar8x32_epi32(upper_blended, rearr_masks_lo[scan_mode]); + __m256i result_hi = 
_mm256_permutevar8x32_epi32(lower_blended, rearr_masks_hi[scan_mode]); + + v_quant_coeffs[0] = result_lo; + v_quant_coeffs[1] = result_hi; +} + +#define VEC_WIDTH 16 +#define SCAN_SET_SIZE 16 +#define LOG2_SCAN_SET_SIZE 4 + +static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg) +{ + assert(SCAN_SET_SIZE == 16); + + int32_t first_nz_pos_in_cg, last_nz_pos_in_cg; + int32_t abssum = 0; + + // Find first and last nonzero coeffs + get_first_last_nz_int16(q_coefs, &first_nz_pos_in_cg, &last_nz_pos_in_cg); + + // Sum all kvz_quant coeffs between first and last + abssum = hsum32_16x16i(q_coefs); + + if (last_nz_pos_in_cg >= 0 && last_cg == -1) { + last_cg = 1; + } + + if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { + + uint32_t q_coef_signbits = _mm256_movemask_epi8(q_coefs); + int32_t signbit = (q_coef_signbits >> (2 * first_nz_pos_in_cg + 1)) & 0x1; + + if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity + int32_t min_pos; + int16_t final_change; + int16_t cheapest_q;
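Reviewer note: the vectorized `hide_block_sign` path implements HEVC sign-bit hiding: within a 16-coefficient group, the sign of the first nonzero coefficient is not coded but inferred from the parity of the sum of absolute levels, so the encoder must detect a parity/sign mismatch and then nudge the "cheapest alternative" coefficient by one. A scalar sketch of the mismatch test (hypothetical helper name, assuming the SBH threshold of 4 used in the code above):

```c
#include <stdint.h>
#include <stdlib.h>

/* Returns nonzero when sign hiding applies to this 16-coefficient group
 * and the parity of the absolute sum disagrees with the sign of the
 * first nonzero coefficient, i.e. one level must be adjusted by +-1. */
static int sign_hiding_needs_fixup(const int16_t coefs[16])
{
  int first_nz = -1, last_nz = -1;
  uint32_t abssum = 0;
  for (int i = 0; i < 16; i++) {
    if (coefs[i] != 0) {
      if (first_nz < 0) first_nz = i;
      last_nz = i;
    }
    abssum += (uint32_t)abs(coefs[i]);
  }
  /* Sign hiding is only attempted when the nonzeros span >= 4 positions. */
  if (first_nz < 0 || last_nz - first_nz < 4)
    return 0;
  int signbit = coefs[first_nz] < 0;
  return signbit != (int)(abssum & 1);
}
```

The AVX2 code above evaluates the same condition with `_mm256_movemask_epi8` for the sign bits and `hsum32_16x16i` for the absolute sum, and `get_cheapest_alternative` then reduces sixteen candidate adjustments to the cheapest one in parallel.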
kvazaar-1.3.0.tar.gz/src/strategies/avx2/reg_sad_pow2_widths-avx2.h
Added
@@ -0,0 +1,209 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#ifndef REG_SAD_POW2_WIDTHS_AVX2_H_ +#define REG_SAD_POW2_WIDTHS_AVX2_H_ + +#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" +#include "kvazaar.h" + +static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m256i avx_inc = _mm256_setzero_si256(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2)); + __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 2) * stride1)); + __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + 
(y + 2) * stride2)); + __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 3) * stride1)); + __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 3) * stride2)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + __m256i curr_sads_ef = _mm256_sad_epu8(e, f); + __m256i curr_sads_gh = _mm256_sad_epu8(g, h); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + + __m256i curr_sads = _mm256_sad_epu8(a, b); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads); + } + } + + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m256i avx_inc = _mm256_setzero_si256(); + int32_t y; + + const int32_t height_twoline_groups = height & ~1; + const int32_t height_residual_lines = height & 1; + + for (y = 0; y < height_twoline_groups; y += 2) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32)); + + __m256i e = _mm256_loadu_si256((const 
__m256i *)(data1 + (y + 1) * stride1)); + __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2)); + __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1 + 32)); + __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2 + 32)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + __m256i curr_sads_ef = _mm256_sad_epu8(e, f); + __m256i curr_sads_gh = _mm256_sad_epu8(g, h); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + } + } + + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + const uint32_t left, const uint32_t right) +{ + __m256i avx_inc = _mm256_setzero_si256(); + + const size_t block_width = 32; + const size_t block_width_log2 = 5; + const size_t lane_width = 16; + + const int32_t left_eq_wid = left 
>> block_width_log2; + const int32_t left_clamped = left - left_eq_wid; + const int32_t right_eq_wid = right >> block_width_log2; + const int32_t right_clamped = right - right_eq_wid; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i lane_widths = _mm256_set1_epi8((uint8_t)lane_width); + const __m256i lefts = _mm256_set1_epi8((uint8_t)left_clamped); + const __m256i rights = _mm256_set1_epi8((uint8_t)right_clamped); + const __m256i unsign_mask = _mm256_set1_epi8(0x7f); + const __m256i ns = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + + const __m256i rightmost_good_idx = _mm256_set1_epi8((uint8_t)(block_width - right - 1)); + + const __m256i shufmask1_l = _mm256_sub_epi8 (ns, lefts); + const __m256i shufmask1_r = _mm256_add_epi8 (shufmask1_l, rights); + const __m256i shufmask1 = _mm256_and_si256 (shufmask1_r, unsign_mask); + + const __m256i epol_mask_r = _mm256_min_epi8 (ns, rightmost_good_idx); + const __m256i epol_mask = _mm256_max_epi8 (lefts, epol_mask_r); + + const __m256i mlo2hi_mask_l = _mm256_cmpgt_epi8(lefts, ns); + const __m256i mlo2hi_imask_r = _mm256_cmpgt_epi8(lane_widths, shufmask1); + const __m256i mlo2hi_mask_r = _mm256_cmpeq_epi8(mlo2hi_imask_r, zero); + + // For left != 0, use low lane of mlo2hi_mask_l as blend mask for high lane. + // For right != 0, use low lane of mlo2hi_mask_r as blend mask for low lane. + const __m256i xchg_mask1 = _mm256_permute2x128_si256(mlo2hi_mask_l, mlo2hi_mask_r, 0x02); + + // If left != 0 (ie. right == 0), the xchg should only affect high lane, + // if right != 0 (ie. left == 0), the low lane. Set bits on the lane that + // the xchg should affect. left == right == 0 should never happen, this'll + // break if it does. 
+ const __m256i lanes_llo_rhi = _mm256_blend_epi32(lefts, rights, 0xf0); + const __m256i xchg_lane_mask = _mm256_cmpeq_epi32(lanes_llo_rhi, zero); + + const __m256i xchg_data_mask = _mm256_and_si256(xchg_mask1, xchg_lane_mask); + + // If we're straddling the left border, start from the left border instead, + // and if right border, end on the border + const int32_t ld_offset = left - right; + + int32_t y; + for (y = 0; y < height; y++) { + __m256i a = _mm256_loadu_si256((__m256i *)(pic_data + (y + 0) * pic_stride + 0)); + __m256i b = _mm256_loadu_si256((__m256i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + + __m256i b_shifted = _mm256_shuffle_epi8 (b, shufmask1); + __m256i b_lanes_reversed = _mm256_permute4x64_epi64(b_shifted, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i b_data_transfered = _mm256_blendv_epi8 (b_shifted, b_lanes_reversed, xchg_data_mask); + __m256i b_epoled = _mm256_shuffle_epi8 (b_data_transfered, epol_mask); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b_epoled); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + } + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
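Reviewer note: `reg_sad_w32`/`reg_sad_w64` split the row loop with bit masks (`height & ~3` main part, `height & 3` residual) so the hot loop keeps four independent `_mm256_sad_epu8` accumulations in flight without a divide or per-row tail branch. A scalar model of that loop structure (illustrative only, not the kvazaar code):

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the four-rows-per-iteration structure used by the AVX2
 * SAD kernels: a main block covering the largest multiple of 4 rows,
 * followed by a residual loop over the 0..3 leftover rows. */
static uint32_t sad_rows_unrolled(const uint8_t *a, const uint8_t *b,
                                  int32_t width, int32_t height,
                                  uint32_t stride1, uint32_t stride2)
{
  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height & 3;
  uint32_t sad = 0;
  int32_t y;

  for (y = 0; y < height_fourline_groups; y += 4) {
    for (int32_t r = 0; r < 4; r++)        /* the AVX2 kernels process   */
      for (int32_t x = 0; x < width; x++)  /* these four rows in parallel */
        sad += (uint32_t)abs((int)a[(y + r) * stride1 + x] -
                             (int)b[(y + r) * stride2 + x]);
  }
  if (height_residual_lines) {
    for (; y < height; y++)
      for (int32_t x = 0; x < width; x++)
        sad += (uint32_t)abs((int)a[y * stride1 + x] -
                             (int)b[y * stride2 + x]);
  }
  return sad;
}
```

In the vector version the final `extracti128` + `shuffle` + `add_epi64` sequence then folds the four 64-bit partial sums of `avx_inc` into one scalar.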
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c
Added
@@ -0,0 +1,279 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategyselector.h" + +#include "cabac.h" +#include "context.h" +#include "encode_coding_tree-generic.h" +#include "encode_coding_tree.h" + +void kvz_encode_coeff_nxn_generic(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip) +{ + const encoder_control_t * const encoder = state->encoder_control; + int c1 = 1; + uint8_t last_coeff_x = 0; + uint8_t last_coeff_y = 0; + int32_t i; + uint32_t sig_coeffgroup_flag[8 * 8] = { 0 }; + + int8_t be_valid = encoder->cfg.signhide_enable; + int32_t scan_pos_sig; + uint32_t go_rice_param = 0; + uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig; + + // CONSTANTS + const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; + const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; + const uint32_t *scan = + kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; + const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; + + // 
Init base contexts according to block type + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); + cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : + &(cabac->ctx.cu_sig_model_chroma[0]); + + // Scan all coeff groups to find out which of them have coeffs. + // Populate sig_coeffgroup_flag with that info. + + unsigned sig_cg_cnt = 0; + for (int cg_y = 0; cg_y < width / 4; ++cg_y) { + for (int cg_x = 0; cg_x < width / 4; ++cg_x) { + unsigned cg_pos = cg_y * width * 4 + cg_x * 4; + for (int coeff_row = 0; coeff_row < 4; ++coeff_row) { + // Load four 16-bit coeffs and see if any of them are non-zero. + unsigned coeff_pos = cg_pos + coeff_row * width; + uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]); + if (four_coeffs) { + ++sig_cg_cnt; + unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; + unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; + sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1; + break; + } + } + } + } + + // Rest of the code assumes at least one non-zero coeff. + assert(sig_cg_cnt > 0); + + // Find the last coeff group by going backwards in scan order. + unsigned scan_cg_last = num_blk_side * num_blk_side - 1; + while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) { + --scan_cg_last; + } + + // Find the last coeff by going backwards in scan order. + unsigned scan_pos_last = scan_cg_last * 16 + 15; + while (!coeff[scan[scan_pos_last]]) { + --scan_pos_last; + } + + int pos_last = scan[scan_pos_last]; + + // transform skip flag + if(width == 4 && encoder->cfg.trskip_enable) { + cabac->cur_ctx = (type == 0) ? 
&(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); + CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); + } + + last_coeff_x = pos_last & (width - 1); + last_coeff_y = (uint8_t)(pos_last >> log2_block_size); + + // Code last_coeff_x and last_coeff_y + kvz_encode_last_significant_xy(cabac, + last_coeff_x, + last_coeff_y, + width, + width, + type, + scan_mode); + + scan_pos_sig = scan_pos_last; + + // significant_coeff_flag + for (i = scan_cg_last; i >= 0; i--) { + int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; + int32_t abs_coeff[16]; + int32_t cg_blk_pos = scan_cg[i]; + int32_t cg_pos_y = cg_blk_pos / num_blk_side; + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); + + uint32_t coeff_signs = 0; + int32_t last_nz_pos_in_cg = -1; + int32_t first_nz_pos_in_cg = 16; + int32_t num_non_zero = 0; + go_rice_param = 0; + + if (scan_pos_sig == scan_pos_last) { + abs_coeff[0] = abs(coeff[pos_last]); + coeff_signs = (coeff[pos_last] < 0); + num_non_zero = 1; + last_nz_pos_in_cg = scan_pos_sig; + first_nz_pos_in_cg = scan_pos_sig; + scan_pos_sig--; + } + + if (i == scan_cg_last || i == 0) { + sig_coeffgroup_flag[cg_blk_pos] = 1; + } else { + uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); + uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; + CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); + } + + if (sig_coeffgroup_flag[cg_blk_pos]) { + int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, + cg_pos_x, cg_pos_y, width); + + for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { + blk_pos = scan[scan_pos_sig]; + pos_y = blk_pos >> log2_block_size; + pos_x = blk_pos - (pos_y << log2_block_size); + sig = (coeff[blk_pos] != 0) ? 
1 : 0; + + if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) { + ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, + log2_block_size, type); + cabac->cur_ctx = &baseCtx[ctx_sig]; + CABAC_BIN(cabac, sig, "sig_coeff_flag"); + } + + if (sig) { + abs_coeff[num_non_zero] = abs(coeff[blk_pos]); + coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); + num_non_zero++; + + if (last_nz_pos_in_cg == -1) { + last_nz_pos_in_cg = scan_pos_sig; + } + + first_nz_pos_in_cg = scan_pos_sig; + } + } + } else { + scan_pos_sig = sub_pos - 1; + } + + if (num_non_zero > 0) { + bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ + && !encoder->cfg.lossless; + uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; + cabac_ctx_t *base_ctx_mod; + int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2; + + if (c1 == 0) { + ctx_set++; + } + + c1 = 1; + + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : + &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]);
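Reviewer note: the coeff-group scan at the top of `kvz_encode_coeff_nxn_generic` tests each 4x4 group for nonzero coefficients by loading four 16-bit coefficients per row as one 64-bit word; any set bit marks the group significant. A self-contained sketch of that pass (hypothetical helper name; `memcpy` is used here instead of the cast in the real code to sidestep alignment/aliasing questions):

```c
#include <stdint.h>
#include <string.h>

/* Populate sig_flag (one byte per 4x4 group, (width/4)^2 entries) and
 * return the number of coefficient groups containing a nonzero coeff. */
static unsigned count_sig_coeff_groups(const int16_t *coeff, int width,
                                       uint8_t *sig_flag)
{
  unsigned sig_cg_cnt = 0;
  int num_blk_side = width / 4;
  for (int cg_y = 0; cg_y < num_blk_side; ++cg_y) {
    for (int cg_x = 0; cg_x < num_blk_side; ++cg_x) {
      unsigned cg_pos = (unsigned)(cg_y * width * 4 + cg_x * 4);
      for (int row = 0; row < 4; ++row) {
        /* Four 16-bit coeffs checked at once as a 64-bit word. */
        uint64_t four_coeffs;
        memcpy(&four_coeffs, &coeff[cg_pos + row * width], sizeof four_coeffs);
        if (four_coeffs) {
          ++sig_cg_cnt;
          sig_flag[cg_y * num_blk_side + cg_x] = 1;
          break;
        }
      }
    }
  }
  return sig_cg_cnt;
}
```

The rest of the function then walks the groups backwards in scan order to find the last significant coefficient, which is why it asserts `sig_cg_cnt > 0`.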
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.h
Added
@@ -0,0 +1,42 @@ +#ifndef ENCODE_CODING_TREE_GENERIC_H_ +#define ENCODE_CODING_TREE_GENERIC_H_ + +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \file + * Functions for writing the coding quadtree and related syntax. + */ + +#include "encoderstate.h" +#include "global.h" + +void kvz_encode_coeff_nxn_generic(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +int kvz_strategy_register_encode_generic(void* opaque, uint8_t bitdepth); + +#endif // ENCODE_CODING_TREE_GENERIC_H_
kvazaar-1.2.0.tar.gz/src/strategies/generic/ipol-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/ipol-generic.c
Changed
@@ -119,510 +119,541 @@ return temp; } -void kvz_filter_inter_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c0, *c1, *c2, *c3; - c0 = kvz_g_luma_filter[0]; - c1 = kvz_g_luma_filter[1]; - c2 = kvz_g_luma_filter[2]; - c3 = kvz_g_luma_filter[3]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - #define FILTER_OFFSET 3 - #define FILTER_SIZE 8 + // Select filters according to the fractional part of the x and y mv components + int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped_hor_filtered[4 * (LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - // Original pixel - flipped_hor_filtered[4 * x + 0][y] = (c0[FILTER_OFFSET] * src[src_stride*ypos + xpos + FILTER_OFFSET]) >> shift1; - flipped_hor_filtered[4 * x + 1][y] = kvz_eight_tap_filter_hor_generic(c1, &src[src_stride*ypos + xpos]) >> shift1; - flipped_hor_filtered[4 * x + 2][y] = kvz_eight_tap_filter_hor_generic(c2, 
      &src[src_stride*ypos + xpos]) >> shift1;
-      flipped_hor_filtered[4 * x + 3][y] = kvz_eight_tap_filter_hor_generic(c3, &src[src_stride*ypos + xpos]) >> shift1;
+  // Filter horizontally
+  for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) {
+    for (x = 0; x < width; ++x) {
+      int ypos = y - KVZ_LUMA_FILTER_OFFSET;
+      int xpos = x - KVZ_LUMA_FILTER_OFFSET;
+      hor_filtered[y][x] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1;
     }
   }

-  // Filter vertically and flip x and y
-  for (x = 0; x < 4 * width; ++x) {
-    for (y = 0; y < height; ++y) {
-      int ypos = y;
-      int xpos = x;
-      dst[(4 * y + 0)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((c0[FILTER_OFFSET] * flipped_hor_filtered[xpos][ypos + FILTER_OFFSET] + offset23) >> shift2) >> shift3);
-      dst[(4 * y + 1)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c1, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
-      dst[(4 * y + 2)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c2, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
-      dst[(4 * y + 3)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c3, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
-
+  // Filter vertically
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2) + wp_offset1) >> wp_shift1);
     }
   }
 }

-void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
+void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
 {
   //TODO: horizontal and vertical only filtering
   int32_t x, y;
+
+  // Interpolation filter shifts
   int16_t shift1 = KVZ_BIT_DEPTH - 8;
   int32_t shift2 = 6;
-  int32_t shift3 = 14 - KVZ_BIT_DEPTH;
-  int32_t offset23 = 1 << (shift2 + shift3 - 1);

-  //coefficients for 1/4, 2/4 and 3/4 positions
-  int8_t *hor_filter = kvz_g_luma_filter[mv[0]&3];
-  int8_t *ver_filter = kvz_g_luma_filter[mv[1]&3];
+  // Select filters according to the fractional part of the x and y mv components
+  int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3];
+  int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3];

-  int16_t flipped_hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE];
+  int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH];
+  int16_t hor_stride = LCU_WIDTH;

-  // Filter horizontally and flip x and y
-  for (x = 0; x < width; ++x) {
-    for (y = 0; y < height + FILTER_SIZE - 1; ++y) {
-      int ypos = y - FILTER_OFFSET;
-      int xpos = x - FILTER_OFFSET;
-      flipped_hor_filtered[x][y] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1;
+  // Filter horizontally
+  for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) {
+    for (x = 0; x < width; ++x) {
+      int ypos = y - KVZ_LUMA_FILTER_OFFSET;
+      int xpos = x - KVZ_LUMA_FILTER_OFFSET;
+      hor_filtered[y][x] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1;
     }
   }

-  // Filter vertically and flip x and y
-  for (x = 0; x < width; ++x) {
-    for (y = 0; y < height; ++y) {
-      int ypos = y;
-      int xpos = x;
-      dst[y*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
+  // Filter vertically
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      dst[y * dst_stride + x] = kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2;
     }
   }
 }

-void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
+void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder,
+  kvz_pixel *src,
+  int16_t src_stride,
+  int width,
+  int height,
+  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
+  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  int8_t fme_level,
+  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
+  int8_t hpel_off_x, int8_t hpel_off_y)
 {
-  //TODO: horizontal and vertical only filtering
-  int32_t x, y;
+  int x, y, first_y;
+
+  // Interpolation filter shifts
   int16_t shift1 = KVZ_BIT_DEPTH - 8;
-  int32_t shift2 = 6;

-  //coefficients for 1/4, 2/4 and 3/4 positions
-  int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3];
-  int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3];
+  // Weighted prediction offset and shift
+  int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
+  int32_t wp_offset1 = 1 << (wp_shift1 - 1);

-  int16_t flipped_hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE];
+  int8_t *fir0 = kvz_g_luma_filter[0];
+  int8_t *fir2 = kvz_g_luma_filter[2];

-  // Filter horizontally and flip x and y
-  for (x = 0; x < width; ++x) {
-    for (y = 0; y < height + FILTER_SIZE - 1; ++y) {
-      int ypos = y - FILTER_OFFSET;
-      int xpos = x - FILTER_OFFSET;
-      flipped_hor_filtered[x][y] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1;
+  int16_t dst_stride = LCU_WIDTH;
+  int16_t hor_stride = LCU_WIDTH;
+  int32_t first_row_offset = (KVZ_LUMA_FILTER_OFFSET + 1) * hor_stride;
+
+  int16_t *col_pos0 = hor_first_cols[0];
+  int16_t *col_pos2 = hor_first_cols[2];
+
+  // Horizontally filtered samples from the top row are
+  // not needed unless samples for diagonal positions are filtered later.
+  first_y = fme_level > 1 ? 0 : 1;
+
+  // HORIZONTAL STEP
+  // Integer pixels
+  for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) {
+    for (x = 0; x < width; ++x) {
+      int ypos = y - KVZ_LUMA_FILTER_OFFSET;
+      int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1;
+      hor_intermediate[0][y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1;
     }
   }

-  // Filter vertically and flip x and y
-  for (x = 0; x < width; ++x) {
-    for (y = 0; y < height; ++y) {
-      int ypos = y;
-      int xpos = x;
-      dst[y*dst_stride + x] = (kvz_eight_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos])) >> shift2;
+  // Write the first column in contiguous memory
+  x = 0;
+  for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) {
+    int ypos = y - KVZ_LUMA_FILTER_OFFSET;
+    int xpos = x - KVZ_LUMA_FILTER_OFFSET;
+    col_pos0[y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1;
+  }
+
+  // Half pixels
+  for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) {
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/ipol-generic.h -> kvazaar-1.3.0.tar.gz/src/strategies/generic/ipol-generic.h
Changed
@@ -32,7 +32,9 @@
 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);

 void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);

 #endif //STRATEGIES_IPOL_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.c
Changed
@@ -213,7 +213,7 @@
 }

 void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4],
-                                        const int strides[4],
+                                        const int stride,
                                         const kvz_pixel *orig,
                                         const int orig_stride,
                                         unsigned costs[4])
@@ -221,10 +221,10 @@
   int32_t diff[4][4 * 4];
   for (int y = 0; y < 4; y++) {
     for (int x = 0; x < 4; x++) {
-      diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * strides[0]];
-      diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * strides[1]];
-      diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * strides[2]];
-      diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * strides[3]];
+      diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * stride];
+      diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * stride];
+      diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * stride];
+      diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * stride];
     }
   }

@@ -328,15 +328,15 @@
 }

 static void satd_8x8_subblock_quad_generic(const kvz_pixel **preds,
-                                           const int *strides,
+                                           const int stride,
                                            const kvz_pixel *orig,
                                            const int orig_stride,
                                            unsigned *costs)
 {
-  costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], strides[0]);
-  costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], strides[1]);
-  costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], strides[2]);
-  costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], strides[3]);
+  costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], stride);
+  costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], stride);
+  costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], stride);
+  costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], stride);
 }

 // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64
@@ -394,7 +394,7 @@
 static void satd_any_size_ ## suffix ( \
     int width, int height, \
     const kvz_pixel **preds, \
-    const int *strides, \
+    const int stride, \
     const kvz_pixel *orig, \
     const int orig_stride, \
     unsigned num_modes, \
@@ -408,7 +408,7 @@
   if (width % 8 != 0) { \
     /* Process the first column using 4x4 blocks. */ \
     for (int y = 0; y < height; y += 4) { \
-      kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \
+      kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \
     } \
     orig_ptr += 4; \
     for(int blk = 0; blk < num_parallel_blocks; ++blk){\
@@ -419,23 +419,23 @@
   if (height % 8 != 0) { \
     /* Process the first row using 4x4 blocks. */ \
     for (int x = 0; x < width; x += 4 ) { \
-      kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \
+      kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
    } \
     orig_ptr += 4 * orig_stride; \
     for(int blk = 0; blk < num_parallel_blocks; ++blk){\
-      pred_ptrs[blk] += 4 * strides[blk]; \
+      pred_ptrs[blk] += 4 * stride; \
     }\
     height -= 4; \
   } \
   /* The rest can now be processed with 8x8 blocks. */ \
   for (int y = 0; y < height; y += 8) { \
     orig_ptr = &orig[y * orig_stride]; \
-    pred_ptrs[0] = &preds[0][y * strides[0]]; \
-    pred_ptrs[1] = &preds[1][y * strides[1]]; \
-    pred_ptrs[2] = &preds[2][y * strides[2]]; \
-    pred_ptrs[3] = &preds[3][y * strides[3]]; \
+    pred_ptrs[0] = &preds[0][y * stride]; \
+    pred_ptrs[1] = &preds[1][y * stride]; \
+    pred_ptrs[2] = &preds[2][y * stride]; \
+    pred_ptrs[3] = &preds[3][y * stride]; \
     for (int x = 0; x < width; x += 8) { \
-      satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \
+      satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
       orig_ptr += 8; \
       pred_ptrs[0] += 8; \
       pred_ptrs[1] += 8; \
@@ -535,6 +535,141 @@
   return ssd >> (2*(KVZ_BIT_DEPTH-8));
 }

+static void inter_recon_bipred_generic(const int hi_prec_luma_rec0,
+  const int hi_prec_luma_rec1,
+  const int hi_prec_chroma_rec0,
+  const int hi_prec_chroma_rec1,
+  int32_t height,
+  int32_t width,
+  int32_t ypos,
+  int32_t xpos,
+  const hi_prec_buf_t*high_precision_rec0,
+  const hi_prec_buf_t*high_precision_rec1,
+  lcu_t* lcu,
+  kvz_pixel* temp_lcu_y,
+  kvz_pixel* temp_lcu_u,
+  kvz_pixel* temp_lcu_v) {
+
+  int shift = 15 - KVZ_BIT_DEPTH;
+  int offset = 1 << (shift - 1);
+
+  int y_in_lcu;
+  int x_in_lcu;
+
+  //After reconstruction, merge the predictors by taking an average of each pixel
+  for (int temp_y = 0; temp_y < height; ++temp_y) {
+
+
+    for (int temp_x = 0; temp_x < width; ++temp_x) {
+      y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+      x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+
+      int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+
+      lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+
+      if (temp_x < width >> 1 && temp_y < height >> 1) {
+
+        y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+        x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+
+        int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+        int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+        lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+
+        int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+        int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+        lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
+      }
+    }
+  }
+
+}
+
+
+static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width)
+{
+  return NULL;
+}
+
+/**
+ * \brief Vertically interpolate SAD outside the frame.
+ *
+ * \param data1   Starting point of the first picture.
+ * \param data2   Starting point of the second picture.
+ * \param width   Width of the region for which SAD is calculated.
+ * \param height  Height of the region for which SAD is calculated.
+ * \param width   Width of the pixel array.
+ *
+ * \returns Sum of Absolute Differences
+ */
+static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                int block_width, int block_height, unsigned pic_stride)
+{
+  int x, y;
+  unsigned sad = 0;
+
+  for (y = 0; y < block_height; ++y) {
+    for (x = 0; x < block_width; ++x) {
+      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
+    }
+  }
+
+  return sad;
+}
+
+/**
+ * \brief Horizontally interpolate SAD outside the frame.
+ *
+ * \param data1   Starting point of the first picture.
+ * \param data2   Starting point of the second picture.
+ * \param width   Width of the region for which SAD is calculated.
+ * \param height  Height of the region for which SAD is calculated.
+ * \param width   Width of the pixel array.
+ *
+ * \returns Sum of Absolute Differences
+ */
+static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                        int block_width, int block_height, unsigned pic_stride, unsigned ref_stride)
+{
+  int x, y;
+  unsigned sad = 0;
+
+  for (y = 0; y < block_height; ++y) {
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/picture-generic.h -> kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.h
Changed
@@ -45,9 +45,11 @@
                                    const int32_t stride2);

 void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4],
-                                        const int strides[4],
+                                        const int stride,
                                         const kvz_pixel *orig,
                                         const int orig_stride,
                                         unsigned costs[4]);

+
+
 #endif //STRATEGIES_PICTURE_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -53,17 +53,19 @@
   uint32_t ac_sum = 0;

   for (int32_t n = 0; n < width * height; n++) {
-    int32_t level;
+    int32_t level = coef[n];
+    int64_t abs_level = (int64_t)abs(level);
     int32_t sign;

-    level = coef[n];
     sign = (level < 0 ? -1 : 1);

-    level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits;
+    int32_t curr_quant_coeff = quant_coeff[n];
+    level = (abs_level * curr_quant_coeff + add) >> q_bits;
     ac_sum += level;

     level *= sign;
     q_coef[n] = (coeff_t)(CLIP(-32768, 32767, level));
+
   }

   if (!encoder->cfg.signhide_enable || ac_sum < 2) return;
@@ -71,10 +73,12 @@
   int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2];

   for (int32_t n = 0; n < width * height; n++) {
-    int32_t level;
-    level = coef[n];
-    level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits;
-    delta_u[n] = (int32_t)(((int64_t)abs(coef[n]) * quant_coeff[n] - (level << q_bits)) >> q_bits8);
+    int32_t level = coef[n];
+    int64_t abs_level = (int64_t)abs(level);
+    int32_t curr_quant_coeff = quant_coeff[n];
+
+    level = (abs_level * curr_quant_coeff + add) >> q_bits;
+    delta_u[n] = (int32_t)((abs_level * curr_quant_coeff - (level << q_bits)) >> q_bits8);
   }

   if (ac_sum >= 2) {
@@ -208,7 +212,7 @@
     kvz_transformskip(state->encoder_control, residual, coeff, width);
   }
   else {
-    kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535));
+    kvz_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type);
   }

   // Quantize coeffs. (coeff -> coeff_out)
@@ -246,7 +250,7 @@
     kvz_itransformskip(state->encoder_control, residual, coeff, width);
   }
   else {
-    kvz_itransform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535));
+    kvz_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type);
   }

   // Get quantized reconstruction. (residual + pred_in -> rec_out)
@@ -329,6 +333,48 @@
   return sum;
 }

+static INLINE int16_t to_q88(float f)
+{
+  return (int16_t)(f * 256.0f);
+}
+
+static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t qp)
+{
+  uint32_t sum = 0;
+#define NUM_BUCKETS 5
+  const int16_t wt_m[NUM_BUCKETS] = {
+    to_q88(-0.004916),
+    to_q88(0.010806),
+    to_q88(0.055562),
+    to_q88(0.033436),
+    to_q88(-0.007690),
+  };
+  const int16_t wt_c[NUM_BUCKETS] = {
+    to_q88(0.172024),
+    to_q88(3.421462),
+    to_q88(2.879506),
+    to_q88(5.585471),
+    to_q88(0.256772),
+  };
+
+  int16_t wt[NUM_BUCKETS];
+  for (int32_t i = 0; i < NUM_BUCKETS; i++)
+    wt[i] = wt_m[i] * qp + wt_c[i];
+
+  for (int32_t i = 0; i < width * width; i++) {
+    int16_t curr = coeff[i];
+    int16_t signmask = curr >> 15;
+    int16_t curr_abs = (curr ^ signmask) - signmask;
+    if (curr_abs > 3)
+      curr_abs = 3;
+
+    sum += wt[curr_abs];
+  }
+  sum += wt[NUM_BUCKETS - 1] * width;
+  return sum >> 8;
+#undef NUM_BUCKETS
+}
+
 int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
@@ -337,6 +383,7 @@
   success &= kvz_strategyselector_register(opaque, "quantize_residual", "generic", 0, &kvz_quantize_residual_generic);
   success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic);
   success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic);
+  success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "generic", 0, &fast_coeff_cost_generic);

   return success;
 }
View file
kvazaar-1.3.0.tar.gz/src/strategies/missing-intel-intrinsics.h
Added
@@ -0,0 +1,21 @@
+#ifndef MISSING_INTEL_INTRINSICS_H_
+#define MISSING_INTEL_INTRINSICS_H_
+
+#include <immintrin.h>
+
+// Old Visual Studio headers lack the bsrli variant
+#ifndef _mm_bsrli_si128
+  #define _mm_bsrli_si128(a, imm8) _mm_srli_si128((a), (imm8))
+#endif
+
+// GCC headers apparently won't have this at all.. sigh
+#ifndef _andn_u32
+  // VS2015 headers apparently won't have this at all.. sigh
+  #ifdef __andn_u32
+    #define _andn_u32(x, y) (__andn_u32((x), (y)))
+  #else
+    #define _andn_u32(x, y) ((~(x)) & (y))
+  #endif // __andn_u32
+#endif // _andn_u32
+
+#endif
View file
kvazaar-1.3.0.tar.gz/src/strategies/optimized_sad_func_ptr_t.h
Added
@@ -0,0 +1,19 @@
+#ifndef OPTIMIZED_SAD_FUNC_T_H_
+#define OPTIMIZED_SAD_FUNC_T_H_
+
+#include "kvazaar.h"
+
+/**
+ * \param data1:   Picture block pointer
+ * \param data2:   Reference block pointer
+ * \param height:  Scan block height
+ * \param stride1: Picture block stride
+ * \param stride2: Reference block stride
+ */
+typedef uint32_t (*optimized_sad_func_ptr_t)(const kvz_pixel * const,
+                                             const kvz_pixel * const,
+                                             const int32_t,
+                                             const uint32_t,
+                                             const uint32_t);
+
+#endif
View file
kvazaar-1.2.0.tar.gz/src/strategies/sse41/picture-sse41.c -> kvazaar-1.3.0.tar.gz/src/strategies/sse41/picture-sse41.c
Changed
@@ -18,73 +18,201 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/

-#include "strategies/sse41/picture-sse41.h"
+#include "global.h"

 #if COMPILE_INTEL_SSE41
+#include "strategies/sse41/picture-sse41.h"
+#include "strategies/sse41/reg_sad_pow2_widths-sse41.h"
+
 #include <immintrin.h>
 #include <stdlib.h>

 #include "kvazaar.h"
 #include "strategyselector.h"

+uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                           const int32_t width, const int32_t height, const uint32_t stride1,
+                           const uint32_t stride2)
+{
+  if (width == 0)
+    return 0;
+  if (width == 4)
+    return reg_sad_w4(data1, data2, height, stride1, stride2);
+  if (width == 8)
+    return reg_sad_w8(data1, data2, height, stride1, stride2);
+  if (width == 12)
+    return reg_sad_w12(data1, data2, height, stride1, stride2);
+  if (width == 16)
+    return reg_sad_w16(data1, data2, height, stride1, stride2);
+  if (width == 24)
+    return reg_sad_w24(data1, data2, height, stride1, stride2);
+  else
+    return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2);
+}

-unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
-                           const int width, const int height, const unsigned stride1, const unsigned stride2)
+static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width)
 {
-  int y, x;
-  unsigned sad = 0;
-  __m128i sse_inc = _mm_setzero_si128 ();
-  long long int sse_inc_array[2];
-
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x <= width-16; x+=16) {
-      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
-      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
-      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
-    }
-
-    {
-      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
-      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
-      switch (((width - (width%2)) - x)/2) {
-      case 0:
-        break;
-      case 1:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
-        break;
-      case 2:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
-        break;
-      case 3:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
-        break;
-      case 4:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
-        break;
-      case 5:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
-        break;
-      case 6:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
-        break;
-      case 7:
-        sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
-        break;
-      default:
-        //Should not happen
-        assert(0);
-      }
-      x = (width - (width%2));
-    }
-
-    for (; x < width; ++x) {
-      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
-    }
+  if (width == 0)
+    return reg_sad_w0;
+  if (width == 4)
+    return reg_sad_w4;
+  if (width == 8)
+    return reg_sad_w8;
+  if (width == 12)
+    return reg_sad_w12;
+  if (width == 16)
+    return reg_sad_w16;
+  if (width == 24)
+    return reg_sad_w24;
+  else
+    return NULL;
+}
+
+static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                              int32_t width, int32_t height, uint32_t stride)
+{
+  if (width == 0)
+    return 0;
+  if (width == 4)
+    return ver_sad_w4(pic_data, ref_data, height, stride);
+  if (width == 8)
+    return ver_sad_w8(pic_data, ref_data, height, stride);
+  if (width == 12)
+    return ver_sad_w12(pic_data, ref_data, height, stride);
+  if (width == 16)
+    return ver_sad_w16(pic_data, ref_data, height, stride);
+  else
+    return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
+}
+
+static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                  int32_t height, uint32_t pic_stride, uint32_t ref_stride,
+                                  uint32_t left, uint32_t right)
+{
+  const size_t vec_width = 16;
+  const uint32_t blkwidth_log2 = 5;
+  const uint32_t left_eq_wid = left >> blkwidth_log2;
+  const uint32_t right_eq_wid = right >> blkwidth_log2;
+  const int32_t left_clamped = left - left_eq_wid;
+  const int32_t right_clamped = right - right_eq_wid;
+
+  const int32_t height_twoline_groups = height & ~1;
+  const int32_t height_residual_lines = height & 1;
+
+  const __m128i zero       = _mm_setzero_si128();
+  const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width);
+  const __m128i lefts      = _mm_set1_epi8((uint8_t)left_clamped);
+  const __m128i rights     = _mm_set1_epi8((uint8_t)right_clamped);
+  const __m128i nslo       = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i nshi       = _mm_add_epi8 (nslo, vec_widths);
+
+  const __m128i rightmost_good_idx = _mm_set1_epi8((uint8_t)((vec_width << 1) - right - 1));
+
+  const __m128i epol_mask_right_lo = _mm_min_epi8 (nslo, rightmost_good_idx);
+  const __m128i epol_mask_right_hi = _mm_min_epi8 (nshi, rightmost_good_idx);
+  const __m128i epol_mask_lo       = _mm_max_epi8 (lefts, epol_mask_right_lo);
+  const __m128i epol_mask_hi       = _mm_max_epi8 (lefts, epol_mask_right_hi);
+
+  const __m128i is_left         = _mm_cmpeq_epi8(rights, zero);
+  const __m128i vecwid_for_left = _mm_and_si128 (is_left, vec_widths);
+  const __m128i ns_for_shufmask = _mm_or_si128  (nslo, vecwid_for_left);
+
+  const __m128i shufmask1_right = _mm_add_epi8  (ns_for_shufmask, rights);
+  const __m128i shufmask1       = _mm_sub_epi8  (shufmask1_right, lefts);
+
+  const __m128i md2bimask         = _mm_cmpgt_epi8(vec_widths, shufmask1);
+  const __m128i move_d_to_b_imask = _mm_or_si128  (is_left, md2bimask);
+  const __m128i move_b_to_d_mask  = _mm_cmpgt_epi8(lefts, nslo);
+
+  // If we're straddling the left border, start from the left border instead,
+  // and if right border, end on the border
+  const int32_t ld_offset = left - right;
+
+  int32_t y;
+  __m128i sse_inc = _mm_setzero_si128();
+  for (y = 0; y < height_twoline_groups; y += 2) {
+    __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 0));
+    __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset));
+    __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 16));
+    __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 16 + ld_offset));
+    __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + 0));
+    __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + 0 + ld_offset));
+    __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + 16));
+    __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + 16 + ld_offset));
+
+    __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1);
+    __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1);
+    __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1);
+    __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1);
+
+    // TODO: could these be optimized for two-operand efficiency? Only one of
+    // these ever does useful work, the other should leave the vector untouched,
+    // so could the first result be used in the second calculation or something?
+    __m128i b_with_d_data = _mm_blendv_epi8(d_shifted, b_shifted, move_d_to_b_imask);
+    __m128i d_with_b_data = _mm_blendv_epi8(d_shifted, b_shifted, move_b_to_d_mask);
+    __m128i f_with_h_data = _mm_blendv_epi8(h_shifted, f_shifted, move_d_to_b_imask);
+    __m128i h_with_f_data = _mm_blendv_epi8(h_shifted, f_shifted, move_b_to_d_mask);
+
+    __m128i b_final = _mm_shuffle_epi8(b_with_d_data, epol_mask_lo);
+    __m128i d_final = _mm_shuffle_epi8(d_with_b_data, epol_mask_hi);
+    __m128i f_final = _mm_shuffle_epi8(f_with_h_data, epol_mask_lo);
+    __m128i h_final = _mm_shuffle_epi8(h_with_f_data, epol_mask_hi);
+
+    __m128i curr_sads_ab = _mm_sad_epu8 (a, b_final);
+    __m128i curr_sads_cd = _mm_sad_epu8 (c, d_final);
+    __m128i curr_sads_ef = _mm_sad_epu8 (e, f_final);
+    __m128i curr_sads_gh = _mm_sad_epu8 (g, h_final);
View file
kvazaar-1.3.0.tar.gz/src/strategies/sse41/reg_sad_pow2_widths-sse41.h
Added
@@ -0,0 +1,1027 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_
+#define REG_SAD_POW2_WIDTHS_SSE41_H_
+
+#include "kvazaar.h"
+#include "strategies/missing-intel-intrinsics.h"
+#include <immintrin.h>
+
+static INLINE uint32_t reg_sad_w0(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                  const int32_t height, const uint32_t stride1,
+                                  const uint32_t stride2)
+{
+  return 0;
+}
+
+static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                  const int32_t height, const uint32_t stride1,
+                                  const uint32_t stride2)
+{
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(data1 + y * stride1));
+    __m128i b = _mm_cvtsi32_si128(*(uint32_t *)(data2 + y * stride2));
+
+    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1);
+    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1);
+    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2);
+    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2);
+    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3);
+    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3);
+
+    __m128i curr_sads = _mm_sad_epu8(a, b);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+  }
+  if (height_residual_lines) {
+    for (; y < height; y++) {
+      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
+      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));
+
+      __m128i curr_sads = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                  const int32_t height, const uint32_t stride1,
+                                  const uint32_t stride2)
+{
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128d a_d = _mm_setzero_pd();
+    __m128d b_d = _mm_setzero_pd();
+    __m128d c_d = _mm_setzero_pd();
+    __m128d d_d = _mm_setzero_pd();
+
+    a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1));
+    b_d = _mm_loadl_pd(b_d, (const double *)(data2 + (y + 0) * stride2));
+    a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1));
+    b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2));
+
+    c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1));
+    d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2));
+    c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1));
+    d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2));
+
+    __m128i a = _mm_castpd_si128(a_d);
+    __m128i b = _mm_castpd_si128(b_d);
+    __m128i c = _mm_castpd_si128(c_d);
+    __m128i d = _mm_castpd_si128(d_d);
+
+    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
+    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+  }
+  if (height_residual_lines) {
+    for (; y < height; y++) {
+      __m128i a = _mm_loadl_epi64((__m128i *)(data1 + y * stride1));
+      __m128i b = _mm_loadl_epi64((__m128i *)(data2 + y * stride2));
+
+      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                   const int32_t height, const uint32_t stride1,
+                                   const uint32_t stride2)
+{
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+  for (y = 0; y < height; y++) {
+    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1));
+    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2));
+
+    __m128i b_masked  = _mm_blend_epi16(a, b, 0x3f);
+    __m128i curr_sads = _mm_sad_epu8   (a, b_masked);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
+  return _mm_cvtsi128_si32(sad);
+}
+
+static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                   const int32_t height, const uint32_t stride1,
+                                   const uint32_t stride2)
+{
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
+    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
+    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
+    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));
+    __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1));
+    __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2));
+    __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1));
+    __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2));
+
+    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
+    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
+    __m128i curr_sads_ef = _mm_sad_epu8(e, f);
+    __m128i curr_sads_gh = _mm_sad_epu8(g, h);
+
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
+  }
+  if (height_residual_lines) {
+    for (; y < height; y++) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
+
+      __m128i curr_sads = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
+  return _mm_cvtsi128_si32(sad);
+}
+
+static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                                   const int32_t height, const uint32_t stride1,
+                                   const uint32_t stride2)
+{
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_doublelines = height & ~1;
+  const int32_t height_parity      = height &  1;
+
+  for (y = 0; y < height_doublelines; y += 2) {
kvazaar-1.2.0.tar.gz/src/strategies/strategies-dct.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-dct.c
Changed
@@ -55,22 +55,23 @@ /** -* \brief Get a function that calculates SAD for NxN block. -* -* \param n Width of the region for which SAD is calculated. -* -* \returns Pointer to cost_16bit_nxn_func. -*/ -dct_func * kvz_get_dct_func(int8_t width, int32_t mode) + * \brief Get a function that performs the transform for a block. + * + * \param width Width of the region + * \param color Color plane + * \param type Prediction type + * + * \returns Pointer to the function. + */ +dct_func * kvz_get_dct_func(int8_t width, color_t color, cu_type_t type) { switch (width) { case 4: - switch (mode){ - case 65535: - return kvz_dct_4x4; - default: + if (color == COLOR_Y && type == CU_INTRA) { return kvz_fast_forward_dst_4x4; - } + } else { + return kvz_dct_4x4; + } case 8: return kvz_dct_8x8; case 16: @@ -83,21 +84,22 @@ } /** -* \brief Get a function that calculates SAD for NxN block. -* -* \param n Width of the region for which SAD is calculated. -* -* \returns Pointer to cost_16bit_nxn_func. -*/ -dct_func * kvz_get_idct_func(int8_t width, int32_t mode) + * \brief Get a function that performs the inverse transform for a block. + * + * \param width Width of the region + * \param color Color plane + * \param type Prediction type + * + * \returns Pointer to the function. + */ +dct_func * kvz_get_idct_func(int8_t width, color_t color, cu_type_t type) { switch (width) { case 4: - switch (mode){ - case 65535: - return kvz_idct_4x4; - default: + if (color == COLOR_Y && type == CU_INTRA) { return kvz_fast_inverse_dst_4x4; + } else { + return kvz_idct_4x4; } case 8: return kvz_idct_8x8;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-dct.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-dct.h
Changed
@@ -27,7 +27,7 @@ */ #include "global.h" // IWYU pragma: keep - +#include "cu.h" typedef unsigned (dct_func)(int8_t bitdepth, const int16_t *input, int16_t *output); @@ -49,8 +49,9 @@ int kvz_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * kvz_get_dct_func(int8_t width, int32_t mode); -dct_func * kvz_get_idct_func(int8_t width, int32_t mode); +dct_func * kvz_get_dct_func(int8_t width, color_t color, cu_type_t type); +dct_func * kvz_get_idct_func(int8_t width, color_t color, cu_type_t type); + #define STRATEGIES_DCT_EXPORTS \
kvazaar-1.3.0.tar.gz/src/strategies/strategies-encode.c
Added
@@ -0,0 +1,41 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategies/strategies-encode.h" + +#include "strategies/avx2/encode_coding_tree-avx2.h" +#include "strategies/generic/encode_coding_tree-generic.h" +#include "strategyselector.h" + + +// Define function pointers. +encode_coeff_nxn_func *kvz_encode_coeff_nxn; + + +int kvz_strategy_register_encode(void* opaque, uint8_t bitdepth) { + bool success = true; + + success &= kvz_strategy_register_encode_generic(opaque, bitdepth); + + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_encode_avx2(opaque, bitdepth); + } + return success; +}
kvazaar-1.3.0.tar.gz/src/strategies/strategies-encode.h
Added
@@ -0,0 +1,56 @@ +#ifndef STRATEGIES_ENCODE_H_ +#define STRATEGIES_ENCODE_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for quantization functions. + */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "tables.h" + + +// Declare function pointers. +typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +// Declare function pointers. +extern encode_coeff_nxn_func *kvz_encode_coeff_nxn; + +int kvz_strategy_register_encode(void* opaque, uint8_t bitdepth); + + +#define STRATEGIES_ENCODE_EXPORTS \ + {"encode_coeff_nxn", (void**) &kvz_encode_coeff_nxn}, \ + + + +#endif //STRATEGIES_ENCODE_H_
kvazaar-1.2.0.tar.gz/src/strategies/strategies-ipol.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-ipol.c
Changed
@@ -26,10 +26,10 @@ // Define function pointers. -ipol_func *kvz_filter_inter_quarterpel_luma; -ipol_func *kvz_filter_inter_halfpel_chroma; -ipol_func *kvz_filter_inter_octpel_chroma; -ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; +ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; +ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma; +ipol_blocks_func * kvz_filter_qpel_blocks_hor_ver_luma; +ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; epol_func *kvz_get_extended_block; kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-ipol.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-ipol.h
Changed
@@ -34,11 +34,9 @@ typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; -typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, - int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); - -typedef unsigned(ipol_frac_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, - frac_search_block filtered_out[15], int8_t fme_level); +typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t sample_off_x, int8_t sample_off_y); typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out); @@ -50,10 +48,10 @@ typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); // Declare function pointers. 
-extern ipol_func * kvz_filter_inter_quarterpel_luma; -extern ipol_func * kvz_filter_inter_halfpel_chroma; -extern ipol_func * kvz_filter_inter_octpel_chroma; -extern ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; +extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; +extern ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma; +extern ipol_blocks_func * kvz_filter_qpel_blocks_hor_ver_luma; +extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; extern epol_func * kvz_get_extended_block; extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; @@ -65,10 +63,10 @@ #define STRATEGIES_IPOL_EXPORTS \ - {"filter_inter_quarterpel_luma", (void**) &kvz_filter_inter_quarterpel_luma}, \ - {"filter_inter_halfpel_chroma", (void**) &kvz_filter_inter_halfpel_chroma}, \ - {"filter_inter_octpel_chroma", (void**) &kvz_filter_inter_octpel_chroma}, \ - {"filter_frac_blocks_luma", (void**) &kvz_filter_frac_blocks_luma}, \ + {"filter_hpel_blocks_hor_ver_luma", (void**) &kvz_filter_hpel_blocks_hor_ver_luma}, \ + {"filter_hpel_blocks_diag_luma", (void**) &kvz_filter_hpel_blocks_diag_luma}, \ + {"filter_qpel_blocks_hor_ver_luma", (void**) &kvz_filter_qpel_blocks_hor_ver_luma}, \ + {"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \ {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \ {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \ {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
kvazaar-1.2.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -61,6 +61,12 @@ pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0; +inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0; + +get_optimized_sad_func *kvz_get_optimized_sad = 0; +ver_sad_func *kvz_ver_sad = 0; +hor_sad_func *kvz_hor_sad = 0; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-picture.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.h
Changed
@@ -28,11 +28,12 @@ #include "global.h" // IWYU pragma: keep #include "kvazaar.h" +#include "encoderstate.h" +#include "strategies/optimized_sad_func_ptr_t.h" typedef kvz_pixel (*pred_buffer)[32 * 32]; - // Function macro for defining hadamard calculating functions // for fixed size blocks. They calculate hadamard for integer // multiples of 8x8 with the 8x8 hadamard function. @@ -108,9 +109,33 @@ const kvz_pixel *block2, int stride2 ); typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const kvz_pixel *orig, unsigned num_modes, unsigned *costs_out); -typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int *strides, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); +typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int stride, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width); +typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); +typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t block_width, int32_t block_height, + uint32_t pic_stride); +typedef uint32_t (hor_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right); + +typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int height, + int width, + int ypos, + int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], + kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], + 
kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]); + + // Declare function pointers. extern reg_sad_func * kvz_reg_sad; @@ -144,6 +169,12 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd; +extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend; + +extern get_optimized_sad_func *kvz_get_optimized_sad; +extern ver_sad_func *kvz_ver_sad; +extern hor_sad_func *kvz_hor_sad; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth); cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n); cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n); @@ -175,6 +206,10 @@ {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \ {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \ {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \ + {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \ + {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \ + {"ver_sad", (void**) &kvz_ver_sad}, \ + {"hor_sad", (void**) &kvz_hor_sad}, \
kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.c
Changed
@@ -30,6 +30,7 @@ quant_residual_func *kvz_quantize_residual; dequant_func *kvz_dequant; coeff_abs_sum_func *kvz_coeff_abs_sum; +fast_coeff_cost_func *kvz_fast_coeff_cost; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth) {
kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -44,6 +44,7 @@ kvz_pixel *rec_out, coeff_t *coeff_out); typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, int8_t type, int8_t block_type); +typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); @@ -52,6 +53,7 @@ extern quant_residual_func * kvz_quantize_residual; extern dequant_func *kvz_dequant; extern coeff_abs_sum_func *kvz_coeff_abs_sum; +extern fast_coeff_cost_func *kvz_fast_coeff_cost; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth); @@ -61,6 +63,7 @@ {"quantize_residual", (void**) &kvz_quantize_residual}, \ {"dequant", (void**) &kvz_dequant}, \ {"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \ + {"fast_coeff_cost", (void**) &kvz_fast_coeff_cost}, \
kvazaar-1.3.0.tar.gz/src/strategies/x86_asm/x86inc.asm
Added
@@ -0,0 +1,1466 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2014 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Anton Mitrofanov <BugMaster@narod.ru> +;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. 
Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix kvz +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +%macro SECTION_RODATA 0-1 16 + SECTION .rodata align=%1 +%endmacro + +%macro SECTION_TEXT 0-1 16 + SECTION .text align=%1 +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %ifdef __YASM_MAJOR__ + CPU %1 + %endif +%endmacro + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPUNOP amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, +; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. 
+; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro 
DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64
kvazaar-1.2.0.tar.gz/src/strategyselector.c -> kvazaar-1.3.0.tar.gz/src/strategyselector.c
Changed
@@ -26,9 +26,6 @@ #ifdef _WIN32 #include <windows.h> -#elif MACOS -#include <sys/param.h> -#include <sys/sysctl.h> #else #include <unistd.h> #endif @@ -89,6 +86,11 @@ return 0; } + if (!kvz_strategy_register_encode(&strategies, bitdepth)) { + fprintf(stderr, "kvz_strategy_register_encode failed!\n"); + return 0; + } + while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); @@ -372,40 +374,67 @@ #endif // COMPILE_INTEL #if COMPILE_POWERPC -#include <fcntl.h> -#include <unistd.h> -#include <linux/auxvec.h> +# if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12) +#ifdef __linux__ #include <asm/cputable.h> +#else +#include <machine/cpu.h> +#endif +#include <sys/auxv.h> -//Source: http://freevec.org/function/altivec_runtime_detection_linux static int altivec_available(void) { - int result = 0; - unsigned long buf[64]; - ssize_t count; - int fd, i; - - fd = open("/proc/self/auxv", O_RDONLY); - if (fd < 0) { - return 0; - } - // loop on reading - do { - count = read(fd, buf, sizeof(buf)); - if (count < 0) - break; - for (i=0; i < (count / sizeof(unsigned long)); i += 2) { - if (buf[i] == AT_HWCAP) { - result = !!(buf[i+1] & PPC_FEATURE_HAS_ALTIVEC); - goto out_close; - } else if (buf[i] == AT_NULL) - goto out_close; - } - } while (count == sizeof(buf)); -out_close: - close(fd); - return result; + unsigned long hwcap = 0; +#ifdef __linux__ + hwcap = getauxval(AT_HWCAP); +#else + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); +#endif + return !!(hwcap & PPC_FEATURE_HAS_ALTIVEC); } +# elif defined(__FreeBSD__) +#include <sys/types.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> + +static int altivec_available(void) +{ + u_long cpu_features = 0; + size_t len = sizeof(cpu_features); + + sysctlbyname("hw.cpu_features", &cpu_features, &len, NULL, 0); + return !!(cpu_features & PPC_FEATURE_HAS_ALTIVEC); +} +# elif defined(__APPLE__) || defined(__NetBSD__) || 
defined(__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#ifndef __APPLE__ +#include <machine/cpu.h> +#endif + +static int altivec_available(void) +{ + int cpu_altivec = 0; + size_t len = sizeof(cpu_altivec); +#ifdef HW_VECTORUNIT + int mib[] = { CTL_HW, HW_VECTORUNIT }; +#else + int mib[] = { CTL_MACHDEP, CPU_ALTIVEC }; +#endif + + sysctl(mib, sizeof(mib)/sizeof(mib[0]), &cpu_altivec, &len, NULL, 0); + return cpu_altivec; +} +# else +static int altivec_available(void) +{ +#if COMPILE_POWERPC_ALTIVEC + return 1; +#else + return 0; +#endif +} +# endif #endif //COMPILE_POWERPC static void set_hardware_flags(int32_t cpuid) {
kvazaar-1.2.0.tar.gz/src/strategyselector.h -> kvazaar-1.3.0.tar.gz/src/strategyselector.h
Changed
@@ -95,6 +95,7 @@ #include "strategies/strategies-quant.h" #include "strategies/strategies-intra.h" #include "strategies/strategies-sao.h" +#include "strategies/strategies-encode.h" static const strategy_to_select_t strategies_to_select[] = { STRATEGIES_NAL_EXPORTS @@ -104,6 +105,7 @@ STRATEGIES_QUANT_EXPORTS STRATEGIES_INTRA_EXPORTS STRATEGIES_SAO_EXPORTS + STRATEGIES_ENCODE_EXPORTS { NULL, NULL }, };
kvazaar-1.2.0.tar.gz/src/threadqueue.c -> kvazaar-1.3.0.tar.gz/src/threadqueue.c
Changed
@@ -18,6 +18,7 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ +#include "global.h" #include "threadqueue.h" #include <errno.h> // ETIMEDOUT @@ -26,7 +27,6 @@ #include <stdlib.h> #include <string.h> -#include "global.h" #include "threads.h" @@ -500,9 +500,10 @@ */ threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job) { - // The caller should have had another reference. - assert(job->refcount > 0); - KVZ_ATOMIC_INC(&job->refcount); + int32_t new_refcount = KVZ_ATOMIC_INC(&job->refcount); + // The caller should have had another reference and we added one + // reference so refcount should be at least 2. + assert(new_refcount >= 2); return job; }
kvazaar-1.2.0.tar.gz/src/threadqueue.h -> kvazaar-1.3.0.tar.gz/src/threadqueue.h
Changed
@@ -26,10 +26,10 @@ * Container for worker tasks. */ -#include <pthread.h> - #include "global.h" // IWYU pragma: keep +#include <pthread.h> + typedef struct threadqueue_job_t threadqueue_job_t; typedef struct threadqueue_queue_t threadqueue_queue_t;
kvazaar-1.3.0.tar.gz/src/threadwrapper
Added
+(directory)
kvazaar-1.3.0.tar.gz/src/threadwrapper/LICENSE
Added
@@ -0,0 +1,5 @@ +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file
kvazaar-1.3.0.tar.gz/src/threadwrapper/README.md
Added
@@ -0,0 +1,6 @@ +ThreadWrapper +======= +Wraps pthread functions so that they actually call C++ standard functions. + +Only functions used by Kvazaar, an open-source HEVC encoder, are implemented. +People are free to contribute if they implement other functions.
kvazaar-1.3.0.tar.gz/src/threadwrapper/include
Added
+(directory)
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/pthread.h
Added
@@ -0,0 +1,53 @@ +/* +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* pthread_cond_t; +typedef void* pthread_cond_t; +typedef void* pthread_mutex_t; +typedef void* pthread_t; +typedef void*(voidp_voidp_func)(void*); + +typedef void pthread_attr_t; +typedef void pthread_condattr_t; +typedef void pthread_mutexattr_t; + +// Parameter names that have been commented away do nothing, +// as they are always null when the functions are used in Kvazaar. 
+ +int pthread_cond_broadcast(pthread_cond_t* cond); +int pthread_cond_destroy(pthread_cond_t* cond); +int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t* /*attr*/); +int pthread_cond_signal(pthread_cond_t* cond); +int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex); + +int pthread_create(pthread_t* thread, const pthread_attr_t* /*attr*/, voidp_voidp_func executee, void* arg); +void pthread_exit(void* /*value_ptr*/); +int pthread_join(pthread_t thread, void** /*value_ptr*/); + +int pthread_mutex_destroy(pthread_mutex_t* mutex); +int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* /*attr*/); +int pthread_mutex_lock(pthread_mutex_t* mutex); +int pthread_mutex_unlock(pthread_mutex_t* mutex); + +#ifdef __cplusplus +} +#endif
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/semaphore.h
Added
@@ -0,0 +1,33 @@ +/* +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* sem_t; + +int sem_destroy(sem_t* sem); +// pshared is always 0 in Kvazaar on w32. +int sem_init(sem_t* sem, int /*pshared*/, unsigned int value); +int sem_post(sem_t* sem); +int sem_wait(sem_t* sem); + +#ifdef __cplusplus +} +#endif
kvazaar-1.3.0.tar.gz/src/threadwrapper/src
Added
+(directory)
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/pthread.cpp
Added
@@ -0,0 +1,88 @@
+/*
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "pthread.h"
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+
+int pthread_cond_broadcast(pthread_cond_t* cond) {
+  static_cast<std::condition_variable*>(*cond)->notify_all();
+  return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t* cond) {
+  delete static_cast<std::condition_variable*>(*cond);
+  *cond = nullptr;
+  return 0;
+}
+
+int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t*) {
+  *cond = new std::condition_variable();
+  return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t* cond) {
+  static_cast<std::condition_variable*>(*cond)->notify_one();
+  return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) {
+  std::mutex* real_mutex = static_cast<std::mutex*>(*mutex);
+  std::unique_lock<std::mutex> lock(*real_mutex, std::adopt_lock);
+  static_cast<std::condition_variable*>(*cond)->wait(lock);
+  lock.release();
+  return 0;
+}
+
+int pthread_create(pthread_t* thread, const pthread_attr_t*, voidp_voidp_func executee, void* arg) {
+  *thread = new std::thread(executee, arg);
+  return 0;
+}
+
+void pthread_exit(void*) {
+  // It might be enough to do nothing here
+  // considering Kvazaar's current use of pthread_exit
+}
+
+int pthread_join(pthread_t thread, void**) {
+  std::thread* real_thread = static_cast<std::thread*>(thread);
+  real_thread->join();
+  delete real_thread;
+  return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t* mutex) {
+  delete static_cast<std::mutex*>(*mutex);
+  *mutex = nullptr;
+  return 0;
+}
+
+int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t*) {
+  *mutex = new std::mutex();
+  return 0;
+}
+
+int pthread_mutex_lock(pthread_mutex_t* mutex) {
+  static_cast<std::mutex*>(*mutex)->lock();
+  return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t* mutex) {
+  static_cast<std::mutex*>(*mutex)->unlock();
+  return 0;
+}
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/semaphore.cpp
Added
@@ -0,0 +1,72 @@
+/*
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "semaphore.h"
+#include <condition_variable>
+#include <mutex>
+
+
+class Semaphore {
+public:
+
+  Semaphore(int value):
+    val_(value) {
+  }
+
+  void post() {
+    std::unique_lock<std::mutex> lck(mtx_);
+    if (++val_ <= 0) {
+      cvar_.notify_one();
+    }
+  }
+
+  void wait() {
+    std::unique_lock<std::mutex> lck(mtx_);
+    if (--val_ < 0) {
+      cvar_.wait(lck);
+    }
+  }
+
+
+private:
+
+  int val_;
+  std::condition_variable cvar_;
+  std::mutex mtx_;
+
+}; // class Semaphore
+
+
+int sem_destroy(sem_t* sem) {
+  delete static_cast<Semaphore*>(*sem);
+  *sem = nullptr;
+  return 0;
+}
+
+int sem_init(sem_t* sem, int, unsigned int value) {
+  *sem = new Semaphore(value);
+  return 0;
+}
+
+int sem_post(sem_t* sem) {
+  static_cast<Semaphore*>(*sem)->post();
+  return 0;
+}
+
+int sem_wait(sem_t* sem) {
+  static_cast<Semaphore*>(*sem)->wait();
+  return 0;
+}
kvazaar-1.2.0.tar.gz/src/transform.c -> kvazaar-1.3.0.tar.gz/src/transform.c
Changed
@@ -186,15 +186,25 @@
  * \param coeff       transform coefficients
  * \param block_size  width of transform
  */
-void kvz_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, int8_t block_size, int32_t mode)
+void kvz_transform2d(const encoder_control_t * const encoder,
+                     int16_t *block,
+                     int16_t *coeff,
+                     int8_t block_size,
+                     color_t color,
+                     cu_type_t type)
 {
-  dct_func *dct_func = kvz_get_dct_func(block_size, mode);
+  dct_func *dct_func = kvz_get_dct_func(block_size, color, type);
   dct_func(encoder->bitdepth, block, coeff);
 }
 
-void kvz_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, int8_t block_size, int32_t mode)
+void kvz_itransform2d(const encoder_control_t * const encoder,
+                      int16_t *block,
+                      int16_t *coeff,
+                      int8_t block_size,
+                      color_t color,
+                      cu_type_t type)
 {
-  dct_func *idct_func = kvz_get_idct_func(block_size, mode);
+  dct_func *idct_func = kvz_get_idct_func(block_size, color, type);
   idct_func(encoder->bitdepth, coeff, block);
 }
 
@@ -359,19 +369,22 @@
     }
   } else if (can_use_trskip) {
+    int8_t tr_skip = 0;
+
+    // Try quantization with trskip and use it if it's better.
     has_coeffs = kvz_quantize_residual_trskip(state,
                                               cur_pu,
                                               tr_width,
                                               color,
                                               scan_idx,
-                                              &cur_pu->intra.tr_skip,
+                                              &tr_skip,
                                               lcu_width,
                                               lcu_width,
                                               ref,
                                               pred,
                                               pred,
                                               coeff);
+    cur_pu->tr_skip = tr_skip;
   } else {
     has_coeffs = kvz_quantize_residual(state,
                                        cur_pu,
@@ -450,10 +463,8 @@
       LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
     };
 
-    if (luma && depth < MAX_DEPTH) {
+    if (depth <= MAX_DEPTH) {
       cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y);
-    }
-    if (chroma && depth <= MAX_DEPTH) {
       cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U);
       cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V);
     }
kvazaar-1.2.0.tar.gz/src/transform.h -> kvazaar-1.3.0.tar.gz/src/transform.h
Changed
@@ -38,8 +38,18 @@
 void kvz_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 void kvz_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 
-void kvz_transform2d(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size, int32_t mode);
-void kvz_itransform2d(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size, int32_t mode);
+void kvz_transform2d(const encoder_control_t * const encoder,
+                     int16_t *block,
+                     int16_t *coeff,
+                     int8_t block_size,
+                     color_t color,
+                     cu_type_t type);
+void kvz_itransform2d(const encoder_control_t * const encoder,
+                      int16_t *block,
+                      int16_t *coeff,
+                      int8_t block_size,
+                      color_t color,
+                      cu_type_t type);
 
 int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset);
kvazaar-1.2.0.tar.gz/tests/Makefile.am -> kvazaar-1.3.0.tar.gz/tests/Makefile.am
Changed
@@ -13,6 +13,21 @@
 	test_tools.sh \
 	test_weird_shapes.sh
 
+EXTRA_DIST = \
+	test_external_symbols.sh \
+	test_gop.sh \
+	test_interlace.sh \
+	test_intra.sh \
+	test_invalid_input.sh \
+	test_mv_constraint.sh \
+	test_owf_wpp_tiles.sh \
+	test_rate_control.sh \
+	test_slices.sh \
+	test_smp.sh \
+	test_tools.sh \
+	test_weird_shapes.sh \
+	util.sh
+
 check_PROGRAMS = kvazaar_tests
 
 kvazaar_tests_SOURCES = \
@@ -35,6 +50,8 @@
 nodist_EXTRA_kvazaar_tests_SOURCES = cpp.cpp
 
 if USE_CRYPTOPP
+XFAIL_TESTS = \
+	test_external_symbols.sh
 kvazaar_tests_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(kvazaar_tests_CFLAGS) $(CXXFLAGS) \
 	$(kvazaar_tests_LDFLAGS) $(LDFLAGS) -o $@
kvazaar-1.2.0.tar.gz/tests/dct_tests.c -> kvazaar-1.3.0.tar.gz/tests/dct_tests.c
Changed
@@ -186,7 +186,7 @@
 
   // Loop through all strategies picking out the intra sad ones and run
   // select strategies though all tests
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];
 
     // Select buffer width according to function name for dct function.
kvazaar-1.3.0.tar.gz/tests/inter_recon_bipred_tests.c
Added
@@ -0,0 +1,184 @@
+/*****************************************************************************
+* This file is part of Kvazaar HEVC encoder.
+*
+* Copyright (C) 2017 Tampere University of Technology and others (see
+* COPYING file).
+*
+* Kvazaar is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License version 2.1 as
+* published by the Free Software Foundation.
+*
+* Kvazaar is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+****************************************************************************/
+
+#include "greatest/greatest.h"
+
+#include "test_strategies.h"
+#include "strategies/generic/picture-generic.h"
+#include <string.h>
+#include <stdlib.h>
+
+
+static lcu_t expected_test_result;
+static lcu_t result;
+
+static lcu_t lcu1;
+
+int temp1, temp2, temp3, temp4;
+
+int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
+int width = 16;
+int height = 16;
+int xpos = 0;
+int ypos = 0;
+
+
+kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH];
+kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C];
+kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C];
+
+int hi_prec_luma_rec0;
+int hi_prec_luma_rec1;
+int hi_prec_chroma_rec0;
+int hi_prec_chroma_rec1;
+
+hi_prec_buf_t* high_precision_rec0 = 0;
+hi_prec_buf_t* high_precision_rec1 = 0;
+
+int temp_x, temp_y;
+
+
+
+static void setup()
+{
+  memset(lcu1.rec.y, 0, sizeof(kvz_pixel) * 64 * 64);
+  memset(lcu1.rec.u, 0, sizeof(kvz_pixel) * 32 * 32);
+  memset(lcu1.rec.v, 0, sizeof(kvz_pixel) * 32 * 32);
+
+  memset(expected_test_result.rec.y, 0, sizeof(kvz_pixel) * 64 * 64);
+  memset(expected_test_result.rec.u, 0, sizeof(kvz_pixel) * 32 * 32);
+  memset(expected_test_result.rec.v, 0, sizeof(kvz_pixel) * 32 * 32);
+
+  memcpy(expected_test_result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
+  memcpy(expected_test_result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
+  memcpy(expected_test_result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
+
+  // Setup is not optimized working function from picture-generic.c.
+
+  int shift = 15 - KVZ_BIT_DEPTH;
+  int offset = 1 << (shift - 1);
+
+  hi_prec_luma_rec0 = mv_param[0][0] & 3 || mv_param[0][1] & 3;
+  hi_prec_luma_rec1 = mv_param[1][0] & 3 || mv_param[1][1] & 3;
+
+  hi_prec_chroma_rec0 = mv_param[0][0] & 7 || mv_param[0][1] & 7;
+  hi_prec_chroma_rec1 = mv_param[1][0] & 7 || mv_param[1][1] & 7;
+
+  if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
+  if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
+
+  for (temp_y = 0; temp_y < height; ++temp_y) {
+    int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+    for (temp_x = 0; temp_x < width; ++temp_x) {
+      int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+      int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+    }
+  }
+  for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
+    int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+    for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
+      int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+      int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+
+      int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+      expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
+    }
+  }
+}
+
+
+TEST test_inter_recon_bipred()
+{
+  memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
+  memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
+  memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
+
+  kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v);
+
+  for (temp_y = 0; temp_y < height; ++temp_y) {
+    int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+    for (temp_x = 0; temp_x < width; temp_x += 1) {
+      int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+      printf("%d ", result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]);
+    }
+  }
+  printf("\n");
+
+  /*
+  for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
+    int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+    for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
+      int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+      printf("%d ", result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]);
+    }
+  }
+  printf("\n");
+  */
+
+  for (temp_y = 0; temp_y < height; ++temp_y) {
+    int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+    for (temp_x = 0; temp_x < width; temp_x += 1) {
+      int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+      ASSERT_EQ_FMT(expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], "%d");
+    }
+  }
+
+  for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
+    int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+    for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
+      int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+      ASSERT_EQ_FMT(expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d");
+      ASSERT_EQ_FMT(expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d");
+    }
+  }
+
+  PASS();
+}
+
+SUITE(inter_recon_bipred_tests)
+{
+  setup();
+
+  for (volatile int i = 0; i < strategies.count; ++i) {
+    if (strcmp(strategies.strategies[i].type, "inter_recon_bipred") != 0) {
+      continue;
+    }
+
+    kvz_inter_recon_bipred_blend = strategies.strategies[i].fptr;
+    RUN_TEST(test_inter_recon_bipred);
+  }
+}
kvazaar-1.2.0.tar.gz/tests/sad_tests.c -> kvazaar-1.3.0.tar.gz/tests/sad_tests.c
Changed
@@ -31,7 +31,7 @@
 //////////////////////////////////////////////////////////////////////////
 // DEFINES
-#define TEST_SAD(X, Y) kvz_image_calc_sad(g_pic, g_ref, 0, 0, (X), (Y), 8, 8)
+#define TEST_SAD(X, Y) kvz_image_calc_sad(g_pic, g_ref, 0, 0, (X), (Y), 8, 8, NULL)
 
 //////////////////////////////////////////////////////////////////////////
 // GLOBALS
@@ -329,7 +329,7 @@
 
   setup_tests();
 
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     if (strcmp(strategies.strategies[i].type, "reg_sad") != 0) {
       continue;
     }
kvazaar-1.2.0.tar.gz/tests/speed_tests.c -> kvazaar-1.3.0.tar.gz/tests/speed_tests.c
Changed
@@ -355,7 +355,7 @@
 
   // Loop through all strategies picking out the intra sad ones and run
   // selectec strategies though all tests
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];
 
     // Select buffer width according to function name.
kvazaar-1.2.0.tar.gz/tests/test_gop.sh -> kvazaar-1.3.0.tar.gz/tests/test_gop.sh
Changed
@@ -9,4 +9,13 @@
 valgrind_test 264x130 10 $common_args --gop=8 -p0 --owf=1
 valgrind_test 264x130 10 $common_args --gop=8 -p0 --owf=4
 valgrind_test 264x130 20 $common_args --gop=8 -p16 --owf=0
+valgrind_test 264x130 10 $common_args --gop=8 -p1 --owf=4
 valgrind_test 264x130 10 $common_args --gop=lp-g4d3t1 -p5 --owf=4
+valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=4 --no-open-gop
+valgrind_test 264x130 30 $common_args --gop=8 -p16 --owf=16
+# Do more extensive tests in a private gitlab CI runner
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 $common_args --gop=8 -p8 --owf=0 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 40 $common_args --gop=8 -p32 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 70 $common_args --gop=8 -p64 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 50 $common_args --gop=8 -p40 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=0 --no-open-gop --bipred; fi
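The `[ ! -z ${GITLAB_CI+x} ]` guard used in this and the following test scripts relies on the POSIX `${parameter+word}` expansion, which yields `x` when the variable is set (even to an empty value) and nothing when it is unset, so the heavier tests run only on the CI runner. A standalone sketch of the idiom (`DEMO_CI` is an illustrative variable, not used by Kvazaar):

```shell
#!/bin/sh
# ${VAR+x} expands to "x" when VAR is set (even to ""), to nothing when unset.
unset DEMO_CI
if [ ! -z ${DEMO_CI+x} ]; then state=set; else state=unset; fi
echo "before: $state"   # prints "before: unset"

DEMO_CI=                # set, but empty
if [ ! -z ${DEMO_CI+x} ]; then state=set; else state=unset; fi
echo "after: $state"    # prints "after: set"
```

This is why the guard detects a GitLab runner even if `GITLAB_CI` were exported empty; a plain `[ -n "$GITLAB_CI" ]` would not.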
kvazaar-1.2.0.tar.gz/tests/test_owf_wpp_tiles.sh -> kvazaar-1.3.0.tar.gz/tests/test_owf_wpp_tiles.sh
Changed
@@ -16,3 +16,4 @@
 valgrind_test 264x130 10 $common_args -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp
 valgrind_test 512x512 3 $common_args -r2 --owf=1 --threads=2 --tiles=2x2 --no-wpp
 valgrind_test 512x512 3 $common_args -r2 --owf=0 --threads=2 --tiles=2x2 --no-wpp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 $common_args -r2 --owf=0 --threads=2 --tiles=2x2 --no-wpp --bipred; fi
kvazaar-1.2.0.tar.gz/tests/test_rate_control.sh -> kvazaar-1.3.0.tar.gz/tests/test_rate_control.sh
Changed
@@ -4,3 +4,4 @@
 . "${0%/*}/util.sh"
 
 valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 --bitrate=100000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=2 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred; fi
kvazaar-1.2.0.tar.gz/tests/test_slices.sh -> kvazaar-1.3.0.tar.gz/tests/test_slices.sh
Changed
@@ -5,3 +5,4 @@
 
 valgrind_test 512x256 10 --threads=2 --owf=1 --preset=ultrafast --tiles=2x2 --slices=tiles
 valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi
kvazaar-1.2.0.tar.gz/tests/test_smp.sh -> kvazaar-1.3.0.tar.gz/tests/test_smp.sh
Changed
@@ -8,3 +8,4 @@
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --smp
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --amp
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --smp --amp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 16 --gop=8 --threads=2 --owf=1 --wpp --smp --amp --bipred; fi
kvazaar-1.2.0.tar.gz/tests/tests_main.c -> kvazaar-1.3.0.tar.gz/tests/tests_main.c
Changed
@@ -32,6 +32,7 @@
 extern SUITE(coeff_sum_tests);
 
 extern SUITE(mv_cand_tests);
+extern SUITE(inter_recon_bipred_tests);
 
 int main(int argc, char **argv)
 {
@@ -57,5 +58,8 @@
 
   RUN_SUITE(mv_cand_tests);
 
+  // Doesn't work in git
+  //RUN_SUITE(inter_recon_bipred_tests);
+
   GREATEST_MAIN_END();
 }
kvazaar-1.2.0.tar.gz/tests/util.sh -> kvazaar-1.3.0.tar.gz/tests/util.sh
Changed
@@ -34,9 +34,18 @@
 
   prepare "${dimensions}" "${frames}"
 
+  # If $KVZ_TEST_VALGRIND is defined and equal to "1", run the test with
+  # valgrind. Otherwise, run without valgrind.
+  if [ "${KVZ_TEST_VALGRIND:-0}" = '1' ]; then
+    valgrind='valgrind --leak-check=full --error-exitcode=1 --'
+  else
+    valgrind=''
+  fi
+
+  # No quotes for $valgrind because it expands to multiple (or zero)
+  # arguments.
   print_and_run \
-    libtool execute \
-    valgrind --leak-check=full --error-exitcode=1 -- \
+    libtool execute $valgrind \
     ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@"
 
   print_and_run \
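The util.sh change above combines two shell behaviors: `${KVZ_TEST_VALGRIND:-0}` substitutes a default when the variable is unset or empty, and `$valgrind` is deliberately left unquoted so an empty value contributes zero arguments to the command line rather than one empty argument. A small sketch of both (the `MODE`, `cmd`, and `count_args` names are illustrative, not part of Kvazaar):

```shell
#!/bin/sh
# ${VAR:-default}: use "default" when VAR is unset or empty.
unset MODE
echo "mode: ${MODE:-0}"   # prints "mode: 0"

# An unquoted empty expansion vanishes entirely after word splitting;
# a quoted one becomes a single empty argument.
cmd=''
count_args() { echo "$#"; }
unquoted=$(count_args $cmd echo hi)   # $cmd adds no arguments
quoted=$(count_args "$cmd" echo hi)   # "$cmd" adds one empty argument
echo "unquoted: $unquoted quoted: $quoted"   # prints "unquoted: 2 quoted: 3"
```

This is why the script's comment warns against quoting `$valgrind`: quoting it would pass an empty first argument to `libtool execute` when valgrind is disabled.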