Changes of Revision 11

x265.changes Changed
x
 
1
@@ -1,4 +1,47 @@
2
 -------------------------------------------------------------------
3
+Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
4
+
5
+- Update to version 1.8:
6
+  API Changes:
7
+  * Experimental support for Main12 is now enabled. Partial
8
+    assembly support exists.
9
+  * Main12 and Intra/Still picture profiles are now supported.
10
+    Still picture profile is detected based on
11
+    x265_param::totalFrames.
12
+  * Three classes of encoding statistics are now available
13
+    through the API.
14
+    + x265_stats - contains encoding statistics, available
15
+      through x265_encoder_get_stats()
16
+    + x265_frame_stats and x265_cu_stats - contains frame
17
+      encoding statistics, available through recon x265_picture
18
+  * --csv
19
+    + x265_encoder_log() is now deprecated
20
+    + x265_param::csvfn is also deprecated
21
+  * --log-level now controls only console logging, frame
22
+    level console logging has been removed.
23
+  * Support added for new color transfer characteristic ARIB
24
+    STD-B67
25
+  New Features:
26
+  * limit-refs
27
+    + This feature limits the references analysed for
28
+      individual CUs.
29
+    + Provides a nice tradeoff between efficiency and
30
+      performance.
31
+  * aq-mode 3
32
+    + A new aq-mode that provides additional biasing for
33
+      low-light conditions.
34
+  * An improved scene cut detection logic that allows
35
+    ratecontrol to manage visual quality at fade-ins and
36
+    fade-outs better.
37
+  Preset and Tune Options:
38
+  * tune grain
39
+    + Increases psyRdoq strength to 10.0, and rdoq-level to 2.
40
+  * qg-size
41
+    + Default value changed to 32.
42
+- soname bump to 68
43
+- Reworked arm.patch for 1.8
44
+
45
+-------------------------------------------------------------------
46
 Fri May 29 09:11:02 UTC 2015 - aloisio@gmx.com
47
 
48
 - soname bump to 59
49
x265.spec Changed
26
 
1
@@ -1,10 +1,10 @@
2
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
 
4
 Name:           x265
5
-%define soname  59
6
+%define soname  68
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{soname}
9
-Version:        1.7
10
+Version:        1.8
11
 Release:        0
12
 License:        GPL-2.0+
13
 Summary:        A free h265/HEVC encoder - encoder binary
14
@@ -43,9 +43,9 @@
15
 streams. 
16
 
17
 %prep
18
-%setup -q -n "%{name}_%{version}/build/linux"
19
+%setup -q -n "%{name}_11047/build/linux"
20
 cd ../..
21
-%patch0
22
+%patch0 -p1
23
 cd -
24
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
25
 sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
26
arm.patch Changed
69
 
1
@@ -1,9 +1,11 @@
2
---- source/CMakeLists.txt.orig 2015-04-28 21:43:18.585528552 +0200
3
-+++ source/CMakeLists.txt  2015-04-28 21:47:14.995334232 +0200
4
-@@ -50,10 +50,18 @@
5
-         set(X64 1)
6
-         add_definitions(-DX86_64=1)
7
-     endif()
8
+Index: x265_11047/source/CMakeLists.txt
9
+===================================================================
10
+--- x265_11047.orig/source/CMakeLists.txt
11
++++ x265_11047/source/CMakeLists.txt
12
+@@ -56,10 +56,22 @@ elseif(POWERMATCH GREATER "-1")
13
+     message(STATUS "Detected POWER target processor")
14
+     set(POWER 1)
15
+     add_definitions(-DX265_ARCH_POWER=1)
16
 +elseif(${SYSPROC} MATCHES "armv5.*")
17
 +    message(STATUS "Detected ARMV5 system processor")
18
 +    set(ARMV5 1)
19
@@ -19,10 +21,14 @@
20
 +    message(STATUS "Detected ARMV7 system processor")
21
 +    set(ARMV7 1)
22
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
23
++elseif(${SYSPROC} STREQUAL "aarch64")
24
++    message(STATUS "Detected AArch64 system processor")
25
++    set(ARMV7 1)
26
++    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
27
  else()
28
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
29
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
30
-@@ -155,8 +163,8 @@
31
+@@ -169,8 +181,8 @@ if(GCC)
32
      elseif(X86 AND NOT X64)
33
          add_definitions(-march=i686)
34
      endif()
35
@@ -33,8 +39,10 @@
36
      endif()
37
      if(FPROFILE_GENERATE)
38
          if(INTEL_CXX)
39
---- source/common/cpu.cpp.orig 2015-04-28 21:47:44.634923269 +0200
40
-+++ source/common/cpu.cpp  2015-04-28 21:49:50.305468867 +0200
41
+Index: x265_11047/source/common/cpu.cpp
42
+===================================================================
43
+--- x265_11047.orig/source/common/cpu.cpp
44
++++ x265_11047/source/common/cpu.cpp
45
 @@ -37,7 +37,7 @@
46
  #include <machine/cpu.h>
47
  #endif
48
@@ -44,20 +52,3 @@
49
  #include <signal.h>
50
  #include <setjmp.h>
51
  static sigjmp_buf jmpbuf;
52
-@@ -340,7 +340,6 @@
53
-     }
54
- 
55
-     canjump = 1;
56
--    x265_cpu_neon_test();
57
-     canjump = 0;
58
-     signal(SIGILL, oldsig);
59
- #endif // if !HAVE_NEON
60
-@@ -356,7 +355,7 @@
61
-     // which may result in incorrect detection and the counters stuck enabled.
62
-     // right now Apple does not seem to support performance counters for this test
63
- #ifndef __MACH__
64
--    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
65
-+    //flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
66
- #endif
67
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
68
- #endif // if HAVE_ARMV6
69
baselibs.conf Changed
4
 
1
@@ -1,1 +1,1 @@
2
-libx265-59
3
+libx265-68
4
x265_1.7.tar.gz/source/filters/filters.cpp Deleted
81
 
1
@@ -1,79 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2013 x265 project
4
- *
5
- * Authors: Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "filters.h"
26
-#include "common.h"
27
-
28
-/* The dithering algorithm is based on Sierra-2-4A error diffusion. */
29
-void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride,
30
-                 int width, int height, int16_t *errors, int bitDepth)
31
-{
32
-    const int lShift = 16 - bitDepth;
33
-    const int rShift = 16 - bitDepth + 2;
34
-    const int half = (1 << (16 - bitDepth + 1));
35
-    const int pixelMax = (1 << bitDepth) - 1;
36
-
37
-    memset(errors, 0, (width + 1) * sizeof(int16_t));
38
-    int pitch = 1;
39
-    for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
40
-    {
41
-        int16_t err = 0;
42
-        for (int x = 0; x < width; x++)
43
-        {
44
-            err = err * 2 + errors[x] + errors[x + 1];
45
-            dst[x * pitch] = (pixel)x265_clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
46
-            errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift);
47
-        }
48
-    }
49
-}
50
-
51
-void ditherImage(x265_picture& picIn, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth)
52
-{
53
-    /* This portion of code is from readFrame in x264. */
54
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
55
-    {
56
-        if ((picIn.bitDepth & 7) && (picIn.bitDepth != 16))
57
-        {
58
-            /* upconvert non 16bit high depth planes to 16bit */
59
-            uint16_t *plane = (uint16_t*)picIn.planes[i];
60
-            uint32_t pixelCount = x265_picturePlaneSize(picIn.colorSpace, picWidth, picHeight, i);
61
-            int lShift = 16 - picIn.bitDepth;
62
-
63
-            /* This loop assumes width is equal to stride which
64
-               happens to be true for file reader outputs */
65
-            for (uint32_t j = 0; j < pixelCount; j++)
66
-            {
67
-                plane[j] = plane[j] << lShift;
68
-            }
69
-        }
70
-    }
71
-
72
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
73
-    {
74
-        int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]);
75
-        int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]);
76
-
77
-        ditherPlane(((pixel*)picIn.planes[i]), picIn.stride[i] / sizeof(pixel), ((uint16_t*)picIn.planes[i]),
78
-                    picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
79
-    }
80
-}
81
x265_1.7.tar.gz/source/filters/filters.h Deleted
33
 
1
@@ -1,31 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2013 x265 project
4
- *
5
- * Authors: Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_FILTERS_H
26
-#define X265_FILTERS_H
27
-
28
-#include "x265.h"
29
-
30
-void ditherImage(x265_picture&, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
31
-
32
-#endif //X265_FILTERS_H
33
x265_1.7.tar.gz/.hg_archival.txt -> x265_1.8.tar.gz/.hg_archival.txt Changed
9
 
1
@@ -1,4 +1,5 @@
2
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: 8425278def1edf0931dc33fc518e1950063e76b0
4
+node: 5dcc9d3a928c400b41a3547d7bfee10340519e56
5
 branch: stable
6
-tag: 1.7
7
+latesttag: 1.8
8
+latesttagdistance: 1
9
x265_1.7.tar.gz/.hgtags -> x265_1.8.tar.gz/.hgtags Changed
7
 
1
@@ -15,3 +15,5 @@
2
 5e604833c5aa605d0b6efbe5234492b5e7d8ac61 1.4
3
 9f0324125f53a12f766f6ed6f98f16e2f42337f4 1.5
4
 cbeb7d8a4880e4020c4545dd8e498432c3c6cad3 1.6
5
+8425278def1edf0931dc33fc518e1950063e76b0 1.7
6
+e27327f5da35c5feb660360336fdc94bd0afe719 1.8
7
x265_1.8.tar.gz/build/linux/multilib.sh Added
43
 
1
@@ -0,0 +1,41 @@
2
+#!/bin/sh
3
+
4
+mkdir -p 8bit 10bit 12bit
5
+
6
+cd 12bit
7
+cmake ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
8
+make ${MAKEFLAGS}
9
+
10
+cd ../10bit
11
+cmake ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
12
+make ${MAKEFLAGS}
13
+
14
+cd ../8bit
15
+ln -sf ../10bit/libx265.a libx265_main10.a
16
+ln -sf ../12bit/libx265.a libx265_main12.a
17
+cmake ../../../source -DEXTRA_LIB="x265_main10.a;x265_main12.a" -DEXTRA_LINK_FLAGS=-L. -DLINKED_10BIT=ON -DLINKED_12BIT=ON
18
+make ${MAKEFLAGS}
19
+
20
+# rename the 8bit library, then combine all three into libx265.a
21
+mv libx265.a libx265_main.a
22
+
23
+uname=`uname`
24
+if [ "$uname" = "Linux" ]
25
+then
26
+
27
+# On Linux, we use GNU ar to combine the static libraries together
28
+ar -M <<EOF
29
+CREATE libx265.a
30
+ADDLIB libx265_main.a
31
+ADDLIB libx265_main10.a
32
+ADDLIB libx265_main12.a
33
+SAVE
34
+END
35
+EOF
36
+
37
+else
38
+
39
+# Mac/BSD libtool
40
+libtool -static -o libx265.a libx265_main.a libx265_main10.a libx265_main12.a 2>/dev/null
41
+
42
+fi
43
x265_1.8.tar.gz/build/msys/multilib.sh Added
31
 
1
@@ -0,0 +1,29 @@
2
+#!/bin/sh
3
+
4
+mkdir -p 8bit 10bit 12bit
5
+
6
+cd 12bit
7
+cmake -G "MSYS Makefiles" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
8
+make ${MAKEFLAGS}
9
+cp libx265.a ../8bit/libx265_main12.a
10
+
11
+cd ../10bit
12
+cmake -G "MSYS Makefiles" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
13
+make ${MAKEFLAGS}
14
+cp libx265.a ../8bit/libx265_main10.a
15
+
16
+cd ../8bit
17
+cmake -G "MSYS Makefiles" ../../../source -DEXTRA_LIB="x265_main10.a;x265_main12.a" -DEXTRA_LINK_FLAGS=-L. -DLINKED_10BIT=ON -DLINKED_12BIT=ON
18
+make ${MAKEFLAGS}
19
+
20
+# rename the 8bit library, then combine all three into libx265.a using GNU ar
21
+mv libx265.a libx265_main.a
22
+
23
+ar -M <<EOF
24
+CREATE libx265.a
25
+ADDLIB libx265_main.a
26
+ADDLIB libx265_main10.a
27
+ADDLIB libx265_main12.a
28
+SAVE
29
+END
30
+EOF
31
x265_1.8.tar.gz/build/vc10-x86_64/multilib.bat Added
46
 
1
@@ -0,0 +1,44 @@
2
+@echo off
3
+if "%VS100COMNTOOLS%" == "" (
4
+  msg "%username%" "Visual Studio 10 not detected"
5
+  exit 1
6
+)
7
+
8
+call "%VS100COMNTOOLS%\..\..\VC\vcvarsall.bat"
9
+
10
+@mkdir 12bit
11
+@mkdir 10bit
12
+@mkdir 8bit
13
+
14
+@cd 12bit
15
+cmake -G "Visual Studio 10 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
16
+if exist x265.sln (
17
+  MSBuild /property:Configuration="Release" x265.sln
18
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
19
+)
20
+
21
+@cd ..\10bit
22
+cmake -G "Visual Studio 10 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
23
+if exist x265.sln (
24
+  MSBuild /property:Configuration="Release" x265.sln
25
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
26
+)
27
+
28
+@cd ..\8bit
29
+if not exist x265-static-main10.lib (
30
+  msg "%username%" "10bit build failed"
31
+  exit 1
32
+)
33
+if not exist x265-static-main12.lib (
34
+  msg "%username%" "12bit build failed"
35
+  exit 1
36
+)
37
+cmake -G "Visual Studio 10 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
38
+if exist x265.sln (
39
+  MSBuild /property:Configuration="Release" x265.sln
40
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
41
+  move Release\x265-static.lib x265-static-main.lib
42
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
43
+)
44
+
45
+pause
46
x265_1.8.tar.gz/build/vc11-x86_64/multilib.bat Added
46
 
1
@@ -0,0 +1,44 @@
2
+@echo off
3
+if "%VS110COMNTOOLS%" == "" (
4
+  msg "%username%" "Visual Studio 11 not detected"
5
+  exit 1
6
+)
7
+
8
+call "%VS110COMNTOOLS%\..\..\VC\vcvarsall.bat"
9
+
10
+@mkdir 12bit
11
+@mkdir 10bit
12
+@mkdir 8bit
13
+
14
+@cd 12bit
15
+cmake -G "Visual Studio 11 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
16
+if exist x265.sln (
17
+  MSBuild /property:Configuration="Release" x265.sln
18
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
19
+)
20
+
21
+@cd ..\10bit
22
+cmake -G "Visual Studio 11 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
23
+if exist x265.sln (
24
+  MSBuild /property:Configuration="Release" x265.sln
25
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
26
+)
27
+
28
+@cd ..\8bit
29
+if not exist x265-static-main10.lib (
30
+  msg "%username%" "10bit build failed"
31
+  exit 1
32
+)
33
+if not exist x265-static-main12.lib (
34
+  msg "%username%" "12bit build failed"
35
+  exit 1
36
+)
37
+cmake -G "Visual Studio 11 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
38
+if exist x265.sln (
39
+  MSBuild /property:Configuration="Release" x265.sln
40
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
41
+  move Release\x265-static.lib x265-static-main.lib
42
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
43
+)
44
+
45
+pause
46
x265_1.8.tar.gz/build/vc12-x86_64/multilib.bat Added
46
 
1
@@ -0,0 +1,44 @@
2
+@echo off
3
+if "%VS120COMNTOOLS%" == "" (
4
+  msg "%username%" "Visual Studio 12 not detected"
5
+  exit 1
6
+)
7
+
8
+call "%VS120COMNTOOLS%\..\..\VC\vcvarsall.bat"
9
+
10
+@mkdir 12bit
11
+@mkdir 10bit
12
+@mkdir 8bit
13
+
14
+@cd 12bit
15
+cmake -G "Visual Studio 12 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
16
+if exist x265.sln (
17
+  MSBuild /property:Configuration="Release" x265.sln
18
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
19
+)
20
+
21
+@cd ..\10bit
22
+cmake -G "Visual Studio 12 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
23
+if exist x265.sln (
24
+  MSBuild /property:Configuration="Release" x265.sln
25
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
26
+)
27
+
28
+@cd ..\8bit
29
+if not exist x265-static-main10.lib (
30
+  msg "%username%" "10bit build failed"
31
+  exit 1
32
+)
33
+if not exist x265-static-main12.lib (
34
+  msg "%username%" "12bit build failed"
35
+  exit 1
36
+)
37
+cmake -G "Visual Studio 12 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
38
+if exist x265.sln (
39
+  MSBuild /property:Configuration="Release" x265.sln
40
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
41
+  move Release\x265-static.lib x265-static-main.lib
42
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
43
+)
44
+
45
+pause
46
x265_1.8.tar.gz/build/vc9-x86_64/multilib.bat Added
46
 
1
@@ -0,0 +1,44 @@
2
+@echo off
3
+if "%VS90COMNTOOLS%" == "" (
4
+  msg "%username%" "Visual Studio 9 not detected"
5
+  exit 1
6
+)
7
+
8
+call "%VS90COMNTOOLS%\..\..\VC\vcvarsall.bat"
9
+
10
+@mkdir 12bit
11
+@mkdir 10bit
12
+@mkdir 8bit
13
+
14
+@cd 12bit
15
+cmake -G "Visual Studio 9 2008 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
16
+if exist x265.sln (
17
+  MSBuild /property:Configuration="Release" x265.sln
18
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
19
+)
20
+
21
+@cd ..\10bit
22
+cmake -G "Visual Studio 9 2008 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
23
+if exist x265.sln (
24
+  MSBuild /property:Configuration="Release" x265.sln
25
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
26
+)
27
+
28
+@cd ..\8bit
29
+if not exist x265-static-main10.lib (
30
+  msg "%username%" "10bit build failed"
31
+  exit 1
32
+)
33
+if not exist x265-static-main12.lib (
34
+  msg "%username%" "12bit build failed"
35
+  exit 1
36
+)
37
+cmake -G "Visual Studio 9 2008 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
38
+if exist x265.sln (
39
+  MSBuild /property:Configuration="Release" x265.sln
40
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
41
+  move Release\x265-static.lib x265-static-main.lib
42
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
43
+)
44
+
45
+pause
46
x265_1.7.tar.gz/doc/reST/api.rst -> x265_1.8.tar.gz/doc/reST/api.rst Changed
201
 
1
@@ -41,9 +41,9 @@
2
 x265 will accept input pixels of any depth between 8 and 16 bits
3
 regardless of the depth of its internal pixels (8 or 10).  It will shift
4
 and mask input pixels as required to reach the internal depth. If
5
-downshifting is being performed using our CLI application, the
6
-:option:`--dither` option may be enabled to reduce banding. This feature
7
-is not available through the C interface.
8
+downshifting is being performed using our CLI application (to 8 bits),
9
+the :option:`--dither` option may be enabled to reduce banding. This
10
+feature is not available through the C interface.
11
 
12
 Encoder
13
 =======
14
@@ -159,7 +159,8 @@
15
    helps future-proof your code in many ways, but the x265 API is
16
    versioned in such a way that we prevent linkage against a build of
17
    x265 that does not match the version of the header you are compiling
18
-   against. This is function of the X265_BUILD macro.
19
+   against (unless you use x265_api_query() to acquire the library's
20
+   interfaces). This is a function of the X265_BUILD macro.
21
 
22
 **x265_encoder_parameters()** may be used to get a copy of the param
23
 structure from the encoder after it has been opened, in order to see the
24
@@ -190,7 +191,7 @@
25
     *      presets is not recommended without a more fine-grained breakdown of
26
     *      parameters to take this into account. */
27
    int x265_encoder_reconfig(x265_encoder *, x265_param *);
28
-   
29
+
30
 Pictures
31
 ========
32
 
33
@@ -320,7 +321,8 @@
34
    provided, the encoder will fill it with data pertaining to the
35
    output picture corresponding to the output NALs, including the
36
    reconstructed image, POC and decode timestamp. These pictures will be
37
-   in encode (or decode) order.
38
+   in encode (or decode) order. The encoder will also write corresponding 
39
+   frame encode statistics into **x265_frame_stats**.
40
 
41
 When the last of the raw input pictures has been sent to the encoder,
42
 **x265_encoder_encode()** must still be called repeatedly with a
43
@@ -338,15 +340,6 @@
44
 Cleanup
45
 =======
46
 
47
-At the end of the encode, the application will want to trigger logging
48
-of the final encode statistics, if :option:`--csv` had been specified::
49
-
50
-   /* x265_encoder_log:
51
-    *       write a line to the configured CSV file.  If a CSV filename was not
52
-    *       configured, or file open failed, or the log level indicated frame level
53
-    *       logging, this function will perform no write. */
54
-   void x265_encoder_log(x265_encoder *encoder, int argc, char **argv);
55
-
56
 Finally, the encoder must be closed in order to free all of its
57
 resources. An encoder that has been flushed cannot be restarted and
58
 reused. Once **x265_encoder_close()** has been called, the encoder
59
@@ -370,52 +363,150 @@
60
 Multi-library Interface
61
 =======================
62
 
63
-If your application might want to make a runtime selection between
64
-a number of libx265 libraries (perhaps 8bpp and 16bpp), then you will
65
-want to use the multi-library interface.
66
-
67
-Instead of directly using all of the **x265_** methods documented
68
-above, you query an x265_api structure from your libx265 and then use
69
-the function pointers within that structure of the same name, but
70
-without the **x265_** prefix. So **x265_param_default()** becomes
71
-**api->param_default()**. The key method is x265_api_get()::
72
-
73
-    /* x265_api_get:
74
-     *   Retrieve the programming interface for a linked x265 library.
75
-     *   May return NULL if no library is available that supports the
76
-     *   requested bit depth. If bitDepth is 0, the function is guarunteed
77
-     *   to return a non-NULL x265_api pointer from the system default
78
-     *   libx265 */
79
-    const x265_api* x265_api_get(int bitDepth);
80
-
81
-Note that using this multi-library API in your application is only the
82
-first step.
83
-
84
-Your application must link to one build of libx265 (statically or 
85
-dynamically) and this linked version of libx265 will support one 
86
-bit-depth (8 or 10 bits). 
87
-
88
-Your application must now request the API for the bitDepth you would 
89
-prefer the encoder to use (8 or 10). If the requested bitdepth is zero, 
90
-or if it matches the bitdepth of the system default libx265 (the 
91
-currently linked library), then this library will be used for encode.
92
-If you request a different bit-depth, the linked libx265 will attempt 
93
-to dynamically bind a shared library with a name appropriate for the 
94
-requested bit-depth:
95
-
96
-    8-bit:  libx265_main.dll
97
-    10-bit: libx265_main10.dll
98
-
99
-    (the shared library extension is obviously platform specific. On
100
-    Linux it is .so while on Mac it is .dylib)
101
-
102
-For example on Windows, one could package together an x265.exe
103
-statically linked against the 8bpp libx265 together with a
104
-libx265_main10.dll in the same folder, and this executable would be able
105
-to encode main and main10 bitstreams.
106
-
107
-On Linux, x265 packagers could install 8bpp static and shared libraries
108
-under the name libx265 (so all applications link against 8bpp libx265)
109
-and then also install libx265_main10.so (symlinked to its numbered solib).
110
-Thus applications which use x265_api_get() will be able to generate main
111
-or main10 bitstreams.
112
+If your application might want to make a runtime bit-depth selection, it
113
+will need to use one of these bit-depth introspection interfaces which
114
+returns an API structure containing the public function entry points and
115
+constants.
116
+
117
+Instead of directly using all of the **x265_** methods documented above,
118
+you query an x265_api structure from your libx265 and then use the
119
+function pointers of the same name (minus the **x265_** prefix) within
120
+that structure.  For instance **x265_param_default()** becomes
121
+**api->param_default()**.
122
+
123
+x265_api_get
124
+------------
125
+
126
+The first bit-depth introspection method is x265_api_get(). It is designed
127
+for applications that might statically link with libx265, or will at
128
+least be tied to a particular SONAME or API version::
129
+
130
+   /* x265_api_get:
131
+    *   Retrieve the programming interface for a linked x265 library.
132
+    *   May return NULL if no library is available that supports the
133
+    *   requested bit depth. If bitDepth is 0, the function is guaranteed
134
+    *   to return a non-NULL x265_api pointer from the system default
135
+    *   libx265 */
136
+   const x265_api* x265_api_get(int bitDepth);
137
+
138
+Like **x265_encoder_encode()**, this function has the build number
139
+automatically appended to the function name via macros. This ties your
140
+application to a particular binary API version of libx265 (the one you
141
+compile against). If you attempt to link with a libx265 with a different
142
+API version number, the link will fail.
143
+
144
+Obviously this has no meaningful effect on applications which statically
145
+link to libx265.
146
+
147
+x265_api_query
148
+--------------
149
+
150
+The second bit-depth introspection method is designed for applications
151
+which need more flexibility in API versioning.  If you use
152
+**x265_api_query()** and dynamically link to libx265 at runtime (using
153
+dlopen() on POSIX or LoadLibrary() on Windows) your application is no
154
+longer directly tied to the API version that it was compiled against::
155
+
156
+   /* x265_api_query:
157
+    *   Retrieve the programming interface for a linked x265 library, like
158
+    *   x265_api_get(), except this function accepts X265_BUILD as the second
159
+    *   argument rather than using the build number as part of the function name.
160
+    *   Applications which dynamically link to libx265 can use this interface to
161
+    *   query the library API and achieve a relative amount of version skew
162
+    *   flexibility. The function may return NULL if the library determines that
163
+    *   the apiVersion that your application was compiled against is not compatible
164
+    *   with the library you have linked with.
165
+    *
166
+    *   api_major_version will be incremented any time non-backward compatible
167
+    *   changes are made to any public structures or functions. If
168
+    *   api_major_version does not match X265_MAJOR_VERSION from the x265.h your
169
+    *   application compiled against, your application must not use the returned
170
+    *   x265_api pointer.
171
+    *
172
+    *   Users of this API *must* also validate the sizes of any structures which
173
+    *   are not treated as opaque in application code. For instance, if your
174
+    *   application dereferences a x265_param pointer, then it must check that
175
+    *   api->sizeof_param matches the sizeof(x265_param) that your application
176
+    *   compiled with. */
177
+   const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
178
+
179
+A number of validations must be performed on the returned API structure
180
+in order to determine if it is safe for use by your application. If you
181
+do not perform these checks, your application is liable to crash::
182
+
183
+   if (api->api_major_version != X265_MAJOR_VERSION) /* do not use */
184
+   if (api->sizeof_param != sizeof(x265_param))      /* do not use */
185
+   if (api->sizeof_picture != sizeof(x265_picture))  /* do not use */
186
+   if (api->sizeof_stats != sizeof(x265_stats))      /* do not use */
187
+   if (api->sizeof_zone != sizeof(x265_zone))        /* do not use */
188
+   etc.
189
+
190
+Note that if your application does not directly allocate or dereference
191
+one of these structures, if it treats the structure as opaque or does
192
+not use it at all, then it can skip the size check for that structure.
193
+
194
+In particular, if your application uses api->param_alloc(),
195
+api->param_free(), api->param_parse(), etc and never directly accesses
196
+any x265_param fields, then it can skip the check on the
197
+sizeof(x265_param) and thereby ignore changes to that structure (which
198
+account for a large percentage of X265_BUILD bumps).
199
+
200
+Build Implications
201
x265_1.7.tar.gz/doc/reST/cli.rst -> x265_1.8.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -28,7 +28,7 @@
2
 
3
 Generally, when an option expects a string value from a list of strings
4
 the user may specify the integer ordinal of the value they desire. ie:
5
-:option:`--log-level` 4 is equivalent to :option:`--log-level` debug.
6
+:option:`--log-level` 3 is equivalent to :option:`--log-level` debug.
7
 
8
 Executable Options
9
 ==================
10
@@ -52,6 +52,7 @@
11
    2. unable to open encoder
12
    3. unable to generate stream headers
13
    4. encoder abort
14
+   5. unable to open csv file
15
 
16
 Logging/Statistic Options
17
 =========================
18
@@ -67,9 +68,8 @@
19
    0. error
20
    1. warning
21
    2. info **(default)**
22
-   3. frame
23
-   4. debug
24
-   5. full
25
+   3. debug
26
+   4. full
27
 
28
 .. option:: --no-progress
29
 
30
@@ -80,9 +80,9 @@
31
 .. option:: --csv <filename>
32
 
33
    Writes encoding results to a comma separated value log file. Creates
34
-   the file if it doesnt already exist, else adds one line per run.  if
35
-   :option:`--log-level` is frame or above, it writes one line per
36
-   frame. Default none
37
+   the file if it doesn't already exist. If :option:`--csv-log-level` is 0,
38
+   it adds one line per run. If :option:`--csv-log-level` is greater than
39
+   0, it writes one line per frame. Default none
40
 
41
    When frame level logging is enabled, several frame performance
42
    statistics are listed:
43
@@ -123,13 +123,17 @@
44
    enough ahead for the necessary reference data to be available. This
45
    is more of a problem for P frames where some blocks are much more
46
    expensive than others.
47
+   
48
+   **CLI ONLY**
49
 
50
+.. option:: --csv-log-level <integer>
51
 
52
-.. option:: --cu-stats, --no-cu-stats
53
+        CSV logging level. Default 0
54
+        0. summary
55
+        1. frame level logging
56
+        2. frame level logging with performance statistics
57
 
58
-   Records statistics on how each CU was coded (split depths and other
59
-   mode decisions) and reports those statistics at the end of the
60
-   encode. Default disabled
61
+        **CLI ONLY**
62
 
63
 .. option:: --ssim, --no-ssim
64
 
65
@@ -349,6 +353,13 @@
66
 
67
    **CLI ONLY**
68
 
69
+.. option:: --total-frames <integer>
70
+
71
+   The number of frames intended to be encoded.  It may be left
72
+   unspecified, but when it is specified rate control can make use of
73
+   this information. It is also used to determine if an encode is
74
+   actually a stillpicture profile encode (single frame)
75
+
76
 .. option:: --dither
77
 
78
    Enable high quality downscaling. Dithering is based on the diffusion
79
@@ -384,7 +395,7 @@
80
 
81
    **Range of values:** positive int or float, or num/denom
82
 
83
-.. option:: --interlaceMode <false|tff|bff>, --no-interlaceMode
84
+.. option:: --interlace <false|tff|bff>, --no-interlace
85
 
86
    0. progressive pictures **(default)**
87
    1. top field first 
88
@@ -419,14 +430,18 @@
89
 
90
    **CLI ONLY**
91
 
92
-.. option:: --output-depth, -D 8|10
93
+.. option:: --output-depth, -D 8|10|12
94
 
95
    Bitdepth of output HEVC bitstream, which is also the internal bit
96
    depth of the encoder. If the requested bit depth is not the bit
97
    depth of the linked libx265, it will attempt to bind libx265_main
98
-   for an 8bit encoder, or libx265_main10 for a 10bit encoder, with the
99
+   for an 8bit encoder, libx265_main10 for a 10bit encoder, or
100
+   libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
101
    same API version as the linked libx265.
102
 
103
+   If the output depth is not specified but :option:`--profile` is
104
+   specified, the output depth will be derived from the profile name.
105
+
106
    **CLI ONLY**
107
 
108
 Profile, Level, Tier
109
@@ -439,15 +454,44 @@
110
    profile.  May abort the encode if the specified profile is
111
    impossible to be supported by the compile options chosen for the
112
    encoder (a high bit depth encoder will be unable to output
113
-   bitstreams compliant with Main or Mainstillpicture).
114
+   bitstreams compliant with Main or MainStillPicture).
115
+
116
+   The following profiles are supported in x265.
117
+
118
+   8bit profiles::
119
+
120
+   main, main-intra, mainstillpicture (or msp for short)
121
+   main444-8 main444-intra main444-stillpicture
122
+   See note below on signaling intra and stillpicture profiles.
123
+   
124
+   10bit profiles::
125
+
126
+   main10, main10-intra
127
+   main422-10, main422-10-intra
128
+   main444-10, main444-10-intra
129
+
130
+   12bit profiles::
131
+
132
+   main12, main12-intra
133
+   main422-12, main422-12-intra
134
+   main444-12, main444-12-intra
135
+
136
+
137
+   **CLI ONLY**
138
 
139
-   API users must use x265_param_apply_profile() after configuring
140
+   API users must call x265_param_apply_profile() after configuring
141
    their param structure. Any changes made to the param structure after
142
    this call might make the encode non-compliant.
143
 
144
-   **Values:** main, main10, mainstillpicture, main422-8, main422-10, main444-8, main444-10
145
+   The CLI application will derive the output bit depth from the
146
+   profile name if :option:`--output-depth` is not specified.
147
 
148
-   **CLI ONLY**
149
+.. note::
150
+
151
+   All 12bit presets are extremely unstable, do not use them yet.
152
+   16bit is not supported at all, but those profiles are included
153
+   because it is possible for libx265 to make bitstreams compatible
154
+   with them.
155
 
156
 .. option:: --level-idc <integer|float>
157
 
158
@@ -479,6 +523,9 @@
159
    specified level, main tier first, turning on high tier only if 
160
    necessary and available at that level.
161
 
162
+   If :option:`--level-idc` has not been specified, this argument is
163
+   ignored.
164
+
165
 .. option:: --ref <1..16>
166
 
167
    Max number of L0 references to be allowed. This number has a linear
168
@@ -511,6 +558,7 @@
169
    Default: disabled
170
 
171
 .. note::
172
+
173
    :option:`--profile`, :option:`--level-idc`, and
174
    :option:`--high-tier` are only intended for use when you are
175
    targeting a particular decoder (or decoders) with fixed resource
176
@@ -519,6 +567,29 @@
177
    parameters to meet those requirements but it will never raise
178
    them. It may enable VBV constraints on a CRF encode.
179
 
180
+   Also note that x265 determines the decoder requirement profile and
181
+   level in three steps.  First, the user configures an x265_param
182
+   structure with their suggested encoder options and then optionally
183
+   calls x265_param_apply_profile() to enforce a specific profile
184
+   (main, main10, etc). Second, an encoder is created from this
185
+   x265_param instance and the :option:`--level-idc` and
186
+   :option:`--high-tier` parameters are used to reduce bitrate or other
187
+   features in order to enforce the target level. Finally, the encoder
188
+   re-examines the final set of parameters and detects the actual
189
+   minimum decoder requirement level and this is what is signaled in
190
+   the bitstream headers. The detected decoder level will only use High
191
+   tier if the user specified a High tier level.
192
+
193
+   The signaled profile will be determined by the encoder's internal
194
+   bitdepth and input color space. If :option:`--keyint` is 0 or 1,
195
+   then an intra variant of the profile will be signaled.
196
+
197
+   If :option:`--total-frames` is 1, then a stillpicture variant will
198
+   be signaled, but this parameter is not always set by applications,
199
+   particularly not when the CLI uses stdin streaming or when libx265
200
+   is used by third-party applications.
201
x265_1.7.tar.gz/doc/reST/presets.rst -> x265_1.8.tar.gz/doc/reST/presets.rst Changed
18
 
1
@@ -114,12 +114,12 @@
2
 ~~~~~~~~~~~~~~~~~~~~
3
 
4
 :option:`--tune` *grain* tries to improve the retention of film grain in
5
-the reconstructed output. It helps rate distortion optimizations select
6
-modes which preserve high frequency noise:
7
+the reconstructed output. It disables rate distortion optimizations in
8
+quantization, and increases the default psy-rd.
9
 
10
     * :option:`--psy-rd` 0.5
11
-    * :option:`--rdoq-level` 1
12
-    * :option:`--psy-rdoq` 30
13
+    * :option:`--rdoq-level` 0
14
+    * :option:`--psy-rdoq` 0
15
 
16
 It lowers the strength of adaptive quantization, so residual energy can
17
 be more evenly distributed across the (noisy) picture:
18
x265_1.7.tar.gz/doc/reST/threading.rst -> x265_1.8.tar.gz/doc/reST/threading.rst Changed
62
 
1
@@ -28,7 +28,7 @@
2
 providers are recommended to call this method when they make new jobs
3
 available.
4
 
5
-Worker jobs are not allowed to block except when abosultely necessary
6
+Worker jobs are not allowed to block except when absolutely necessary
7
 for data locking. If a job becomes blocked, the work function is
8
 expected to drop that job so the worker thread may go back to the pool
9
 and find more work.
10
@@ -94,10 +94,10 @@
11
 
12
 If a worker thread job has work which can be performed in parallel by
13
 many threads, it may allocate a bonded task group and enlist the help of
14
-other idle worker threads in the same pool. Those threads will cooperate
15
-to complete the work of the bonded task group and then return to their
16
-idle states. The larger and more uniform those tasks are, the better the
17
-bonded task group will perform.
18
+other idle worker threads from the same thread pool. Those threads will
19
+cooperate to complete the work of the bonded task group and then return
20
+to their idle states. The larger and more uniform those tasks are, the
21
+better the bonded task group will perform.
22
 
23
 Parallel Mode Analysis
24
 ~~~~~~~~~~~~~~~~~~~~~~
25
@@ -105,19 +105,20 @@
26
 When :option:`--pmode` is enabled, each CU (at all depths from 64x64 to
27
 8x8) will distribute its analysis work to the thread pool via a bonded
28
 task group. Each analysis job will measure the cost of one prediction
29
-for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At
30
-slower presets, the amount of increased parallelism is often enough to
31
-be able to reduce frame parallelism while achieving the same overall CPU
32
-utilization. Reducing frame threads is often beneficial to ABR and VBV
33
-rate control.
34
+for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP).
35
+
36
+At slower presets, the amount of increased parallelism from pmode is
37
+often enough to be able to reduce or disable frame parallelism while
38
+achieving the same overall CPU utilization. Reducing frame threads is
39
+often beneficial to ABR and VBV rate control.
40
 
41
 Parallel Motion Estimation
42
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
43
 
44
 When :option:`--pme` is enabled all of the analysis functions which
45
 perform motion searches to reference frames will distribute those motion
46
-searches as jobs for worker threads via a bonded task group (if more
47
-than two motion searches are required).
48
+searches to other worker threads via a bonded task group (if more than
49
+two motion searches are required).
50
 
51
 Frame Threading
52
 ===============
53
@@ -241,7 +242,7 @@
54
 bonded task groups to measure single frame cost estimates using slices.
55
 (see :option:`--lookahead-slices`)
56
 
57
-The function slicetypeDecide() itself is also be performed by a worker
58
+The main slicetypeDecide() function itself is also performed by a worker
59
 thread if your encoder has a thread pool, else it runs within the
60
 context of the thread which calls the x265_encoder_encode().
61
 
62
x265_1.7.tar.gz/source/CMakeLists.txt -> x265_1.8.tar.gz/source/CMakeLists.txt Changed
185
 
1
@@ -30,7 +30,7 @@
2
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
3
 
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 59)
6
+set(X265_BUILD 68)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -42,6 +42,8 @@
11
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
12
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
13
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
14
+set(POWER_ALIASES ppc64 ppc64le)
15
+list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
16
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
17
     message(STATUS "Detected x86 target processor")
18
     set(X86 1)
19
@@ -50,6 +52,10 @@
20
         set(X64 1)
21
         add_definitions(-DX86_64=1)
22
     endif()
23
+elseif(POWERMATCH GREATER "-1")
24
+    message(STATUS "Detected POWER target processor")
25
+    set(POWER 1)
26
+    add_definitions(-DX265_ARCH_POWER=1)
27
 elseif(${SYSPROC} STREQUAL "armv6l")
28
     message(STATUS "Detected ARM target processor")
29
     set(ARM 1)
30
@@ -82,6 +88,10 @@
31
         endif()
32
     endif()
33
     mark_as_advanced(LIBRT NUMA_FOUND)
34
+    option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
35
+    if(NO_ATOMICS)
36
+        add_definitions(-DNO_ATOMICS=1)
37
+    endif(NO_ATOMICS)
38
 endif(UNIX)
39
 
40
 if(X64 AND NOT WIN32)
41
@@ -260,6 +270,8 @@
42
         message(STATUS "Found Yasm ${YASM_VERSION_STRING} to build assembly primitives")
43
         option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
44
     endif()
45
+else()
46
+    option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
47
 endif()
48
 
49
 option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF)
50
@@ -270,23 +282,59 @@
51
 # Build options
52
 set(LIB_INSTALL_DIR lib CACHE STRING "Install location of libraries")
53
 set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables")
54
+set(EXTRA_LIB "" CACHE STRING "Extra libraries to link against")
55
+set(EXTRA_LINK_FLAGS "" CACHE STRING "Extra link flags")
56
+if(EXTRA_LINK_FLAGS)
57
+    list(APPEND LINKER_OPTIONS ${EXTRA_LINK_FLAGS})
58
+endif()
59
+if(EXTRA_LIB)
60
+    option(LINKED_8BIT  "8bit libx265 is being linked with this library" OFF)
61
+    option(LINKED_10BIT "10bit libx265 is being linked with this library" OFF)
62
+    option(LINKED_12BIT "12bit libx265 is being linked with this library" OFF)
63
+endif(EXTRA_LIB)
64
+mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
65
 
66
 if(X64)
67
-    # NOTE: We only officially support 16bit-per-pixel compiles of x265
68
-    # on 64bit architectures. 16bpp plus large resolution plus slow
69
+    # NOTE: We only officially support high-bit-depth compiles of x265
70
+    # on 64bit architectures. Main10 plus large resolution plus slow
71
     # preset plus 32bit address space usually means malloc failure.  You
72
     # can disable this if(X64) check if you desperately need a 32bit
73
     # build with 10bit/12bit support, but this violates the "shrink wrap
74
     # license" so to speak.  If it breaks you get to keep both halves.
75
-    # You will likely need to compile without assembly
76
-    option(HIGH_BIT_DEPTH "Store pixels as 16bit values" OFF)
77
+    # You will need to disable assembly manually.
78
+    option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
79
 endif(X64)
80
 if(HIGH_BIT_DEPTH)
81
-    add_definitions(-DHIGH_BIT_DEPTH=1)
82
+    option(MAIN12 "Support Main12 instead of Main10" OFF)
83
+    if(MAIN12)
84
+        add_definitions(-DHIGH_BIT_DEPTH=1 -DX265_DEPTH=12)
85
+    else()
86
+        add_definitions(-DHIGH_BIT_DEPTH=1 -DX265_DEPTH=10)
87
+    endif()
88
 else(HIGH_BIT_DEPTH)
89
-    add_definitions(-DHIGH_BIT_DEPTH=0)
90
+    add_definitions(-DHIGH_BIT_DEPTH=0 -DX265_DEPTH=8)
91
 endif(HIGH_BIT_DEPTH)
92
 
93
+# this option can only be used when linking multiple libx265 libraries
94
+# together, and some alternate API access method is implemented.
95
+option(EXPORT_C_API "Implement public C programming interface" ON)
96
+mark_as_advanced(EXPORT_C_API)
97
+if(EXPORT_C_API)
98
+    set(X265_NS x265)
99
+    add_definitions(-DEXPORT_C_API=1)
100
+elseif(HIGH_BIT_DEPTH)
101
+    if(MAIN12)
102
+        set(X265_NS x265_12bit)
103
+    else()
104
+        set(X265_NS x265_10bit)
105
+    endif()
106
+    add_definitions(-DEXPORT_C_API=0)
107
+else()
108
+    set(X265_NS x265_8bit)
109
+    add_definitions(-DEXPORT_C_API=0)
110
+endif()
111
+add_definitions(-DX265_NS=${X265_NS})
112
+
113
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
114
 if(WARNINGS_AS_ERRORS)
115
     if(GCC)
116
@@ -375,6 +423,9 @@
117
 if(NOT MSVC)
118
     set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
119
 endif()
120
+if(EXTRA_LIB)
121
+    target_link_libraries(x265-static ${EXTRA_LIB})
122
+endif()
123
 install(TARGETS x265-static
124
     LIBRARY DESTINATION ${LIB_INSTALL_DIR}
125
     ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
126
@@ -415,7 +466,7 @@
127
         if(APPLE)
128
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
129
         else()
130
-            set_target_properties(x265-shared PROPERTIES LINK_FLAGS "-Wl,-Bsymbolic,-znoexecstack")
131
+            list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
132
         endif()
133
     endif()
134
     set_target_properties(x265-shared PROPERTIES SOVERSION ${X265_BUILD})
135
@@ -429,6 +480,9 @@
136
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
137
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
138
     endif()
139
+    if(EXTRA_LIB)
140
+        target_link_libraries(x265-shared ${EXTRA_LIB})
141
+    endif()
142
     if(LINKER_OPTIONS)
143
         # set_target_properties can't do list expansion
144
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")
145
@@ -468,16 +522,14 @@
146
 endif()
147
 
148
 # Main CLI application
149
-option(ENABLE_CLI "Build standalone CLI application" ON)
150
+set(ENABLE_CLI ON CACHE BOOL "Build standalone CLI application")
151
 if(ENABLE_CLI)
152
     file(GLOB InputFiles input/input.cpp input/yuv.cpp input/y4m.cpp input/*.h)
153
     file(GLOB OutputFiles output/output.cpp output/reconplay.cpp output/*.h
154
                           output/yuv.cpp output/y4m.cpp # recon
155
                           output/raw.cpp)               # muxers
156
-    file(GLOB FilterFiles filters/*.cpp filters/*.h)
157
     source_group(input FILES ${InputFiles})
158
     source_group(output FILES ${OutputFiles})
159
-    source_group(filters FILES ${FilterFiles})
160
 
161
     check_include_files(getopt.h HAVE_GETOPT_H)
162
     if(NOT HAVE_GETOPT_H)
163
@@ -487,13 +539,18 @@
164
         include_directories(compat/getopt)
165
         set(GETOPT compat/getopt/getopt.c compat/getopt/getopt.h)
166
     endif(NOT HAVE_GETOPT_H)
167
+    if(WIN32)
168
+        set(ExportDefs "${PROJECT_BINARY_DIR}/x265.def")
169
+    endif(WIN32)
170
 
171
     if(XCODE)
172
         # Xcode seems unable to link the CLI with libs, so link as one target
173
-        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h x265cli.h
174
-                           $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
175
+        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
176
+                       x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
177
+                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
178
     else()
179
-        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE} x265.cpp x265.h x265cli.h)
180
+        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
181
+                       ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp)
182
         if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
183
             # The CLI cannot link to the shared library on Windows, it
184
             # requires internal APIs not exported from the DLL
185
x265_1.7.tar.gz/source/cmake/CMakeASM_YASMInformation.cmake -> x265_1.8.tar.gz/source/cmake/CMakeASM_YASMInformation.cmake Changed
17
 
1
@@ -31,9 +31,13 @@
2
 endif()
3
 
4
 if(HIGH_BIT_DEPTH)
5
-    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10)
6
+    if(MAIN12)
7
+        list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
8
+    else()
9
+        list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
10
+    endif()
11
 else()
12
-    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8)
13
+    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
14
 endif()
15
 
16
 list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS}")
17
x265_1.7.tar.gz/source/cmake/FindYasm.cmake -> x265_1.8.tar.gz/source/cmake/FindYasm.cmake Changed
10
 
1
@@ -2,7 +2,7 @@
2
 
3
 # Simple path search with YASM_ROOT environment variable override
4
 find_program(YASM_EXECUTABLE 
5
- NAMES yasm yasm-1.2.0-win32 yasm-1.2.0-win64
6
+ NAMES yasm yasm-1.2.0-win32 yasm-1.2.0-win64 yasm yasm-1.3.0-win32 yasm-1.3.0-win64
7
  HINTS $ENV{YASM_ROOT} ${YASM_ROOT}
8
  PATH_SUFFIXES bin
9
 )
10
x265_1.7.tar.gz/source/common/CMakeLists.txt -> x265_1.8.tar.gz/source/common/CMakeLists.txt Changed
56
 
1
@@ -1,7 +1,21 @@
2
 # vim: syntax=cmake
3
 
4
+list(APPEND VFLAGS "-DX265_VERSION=${X265_VERSION}")
5
+if(EXTRA_LIB)
6
+    if(LINKED_8BIT)
7
+        list(APPEND VFLAGS "-DLINKED_8BIT=1")
8
+    endif(LINKED_8BIT)
9
+    if(LINKED_10BIT)
10
+        list(APPEND VFLAGS "-DLINKED_10BIT=1")
11
+    endif(LINKED_10BIT)
12
+    if(LINKED_12BIT)
13
+        list(APPEND VFLAGS "-DLINKED_12BIT=1")
14
+    endif(LINKED_12BIT)
15
+endif(EXTRA_LIB)
16
+
17
 if(ENABLE_ASSEMBLY)
18
     set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
19
+    list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
20
 
21
     set(SSE3  vec/dct-sse3.cpp)
22
     set(SSSE3 vec/dct-ssse3.cpp)
23
@@ -46,7 +60,7 @@
24
                mc-a2.asm pixel-util8.asm blockcopy8.asm
25
                pixeladd8.asm dct8.asm)
26
     if(HIGH_BIT_DEPTH)
27
-        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
28
+        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm)
29
     else()
30
         set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm)
31
     endif()
32
@@ -69,6 +83,10 @@
33
     source_group(Assembly FILES ${ASM_PRIMITIVES})
34
 endif(ENABLE_ASSEMBLY)
35
 
36
+# set_target_properties can't do list expansion
37
+string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
38
+set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS ${VERSION_FLAGS})
39
+
40
 check_symbol_exists(strtok_r "string.h" HAVE_STRTOK_R)
41
 if(HAVE_STRTOK_R)
42
     set_source_files_properties(param.cpp PROPERTIES COMPILE_FLAGS -DHAVE_STRTOK_R=1)
43
@@ -81,11 +99,8 @@
44
     set(WINXP winxp.h winxp.cpp)
45
 endif(WIN32)
46
 
47
-set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
48
-
49
 add_library(common OBJECT
50
-    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES}
51
-    ${LIBCOMMON_SRC} ${LIBCOMMON_HDR} ${WINXP}
52
+    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP}
53
     primitives.cpp primitives.h
54
     pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
55
     constants.cpp constants.h
56
x265_1.7.tar.gz/source/common/bitstream.cpp -> x265_1.8.tar.gz/source/common/bitstream.cpp Changed
10
 
1
@@ -1,7 +1,7 @@
2
 #include "common.h"
3
 #include "bitstream.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #if defined(_MSC_VER)
9
 #pragma warning(disable: 4244)
10
x265_1.7.tar.gz/source/common/bitstream.h -> x265_1.8.tar.gz/source/common/bitstream.h Changed
10
 
1
@@ -24,7 +24,7 @@
2
 #ifndef X265_BITSTREAM_H
3
 #define X265_BITSTREAM_H 1
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class BitInterface
10
x265_1.7.tar.gz/source/common/common.cpp -> x265_1.8.tar.gz/source/common/common.cpp Changed
25
 
1
@@ -33,6 +33,8 @@
2
 #include <sys/time.h>
3
 #endif
4
 
5
+namespace X265_NS {
6
+
7
 #if CHECKED_BUILD || _DEBUG
8
 int g_checkFailures;
9
 #endif
10
@@ -50,8 +52,6 @@
11
 #endif
12
 }
13
 
14
-using namespace x265;
15
-
16
 #define X265_ALIGNBYTES 32
17
 
18
 #if _WIN32
19
@@ -215,3 +215,5 @@
20
     fclose(fh);
21
     return NULL;
22
 }
23
+
24
+}
25
x265_1.7.tar.gz/source/common/common.h -> x265_1.8.tar.gz/source/common/common.h Changed
66
 
1
@@ -106,7 +106,7 @@
2
 /* If compiled with CHECKED_BUILD perform run-time checks and log any that
3
  * fail, both to stderr and to a file */
4
 #if CHECKED_BUILD || _DEBUG
5
-extern int g_checkFailures;
6
+namespace X265_NS { extern int g_checkFailures; }
7
 #define X265_CHECK(expr, ...) if (!(expr)) { \
8
     x265_log(NULL, X265_LOG_ERROR, __VA_ARGS__); \
9
     FILE *fp = fopen("x265_check_failures.txt", "a"); \
10
@@ -126,16 +126,20 @@
11
 typedef uint64_t sum2_t;
12
 typedef uint64_t pixel4;
13
 typedef int64_t  ssum2_t;
14
-#define X265_DEPTH 10          // compile time configurable bit depth
15
 #else
16
 typedef uint8_t  pixel;
17
 typedef uint16_t sum_t;
18
 typedef uint32_t sum2_t;
19
 typedef uint32_t pixel4;
20
-typedef int32_t  ssum2_t;      //Signed sum
21
-#define X265_DEPTH 8           // compile time configurable bit depth
22
+typedef int32_t  ssum2_t; // Signed sum
23
 #endif // if HIGH_BIT_DEPTH
24
 
25
+#if X265_DEPTH <= 10
26
+typedef uint32_t sse_ret_t;
27
+#else
28
+typedef uint64_t sse_ret_t;
29
+#endif
30
+
31
 #ifndef NULL
32
 #define NULL 0
33
 #endif
34
@@ -313,7 +317,7 @@
35
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
36
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
37
 
38
-namespace x265 {
39
+namespace X265_NS {
40
 
41
 enum { SAO_NUM_OFFSET = 4 };
42
 
43
@@ -409,9 +413,7 @@
44
 /* located in pixel.cpp */
45
 void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
46
 
47
-}
48
-
49
-/* outside x265 namespace, but prefixed. defined in common.cpp */
50
+/* located in common.cpp */
51
 int64_t  x265_mdate(void);
52
 #define  x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
53
 void     general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
54
@@ -426,7 +428,10 @@
55
 void     x265_free(void *ptr);
56
 char*    x265_slurp_file(const char *filename);
57
 
58
-void     x265_setup_primitives(x265_param* param, int cpu); /* primitives.cpp */
59
+/* located in primitives.cpp */
60
+void     x265_setup_primitives(x265_param* param);
61
+void     x265_report_simd(x265_param* param);
62
+}
63
 
64
 #include "constants.h"
65
 
66
x265_1.7.tar.gz/source/common/constants.cpp -> x265_1.8.tar.gz/source/common/constants.cpp Changed
69
 
1
@@ -25,9 +25,50 @@
2
 #include "constants.h"
3
 #include "threading.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
+
8
+#if X265_DEPTH == 12
9
+
10
+// lambda = pow(2, (double)q / 6 - 2) * (1 << (12 - 8));
11
+double x265_lambda_tab[QP_MAX_MAX + 1] =
12
+{
13
+    4.0000,    4.4898,    5.0397,    5.6569,     6.3496,
14
+    7.1272,    8.0000,    8.9797,    10.0794,    11.3137,
15
+    12.6992,   14.2544,   16.0000,   17.9594,    20.1587,
16
+    22.6274,   25.3984,   28.5088,   32.0000,    35.9188,
17
+    40.3175,   45.2548,   50.7968,   57.0175,    64.0000,
18
+    71.8376,   80.6349,   90.5097,   101.5937,   114.0350,
19
+    128.0000,  143.6751,  161.2699,  181.0193,   203.1873,
20
+    228.0701,  256.0000,  287.3503,  322.5398,   362.0387,
21
+    406.3747,  456.1401,  512.0000,  574.7006,   645.0796,
22
+    724.0773,  812.7493,  912.2803,  1024.0000,  1149.4011,
23
+    1290.1592, 1448.1547, 1625.4987, 1824.5606,  2048.0000,
24
+    2298.8023, 2580.3183, 2896.3094, 3250.9974,  3649.1211,
25
+    4096.0000, 4597.6045, 5160.6366, 5792.6188,  6501.9947,
26
+    7298.2423, 8192.0000, 9195.2091, 10321.2732, 11585.2375
27
+};
28
+
29
+// lambda2 = pow(lambda, 2) * scale (0.85);
30
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
31
+{
32
+    13.6000,       17.1349,       21.5887,       27.2000,       34.2699,
33
+    43.1773,       54.4000,       68.5397,       86.3546,       108.8000,
34
+    137.0794,      172.7092,      217.6000,      274.1588,      345.4185,
35
+    435.2000,      548.3176,      690.8369,      870.4000,      1096.6353,
36
+    1381.6739,     1740.8000,     2193.2706,     2763.3478,     3481.6000,
37
+    4386.5411,     5526.6955,     6963.2000,     8773.0822,     11053.3910,
38
+    13926.4000,    17546.1645,    22106.7819,    27852.8000,    35092.3291,
39
+    44213.5641,    55705.6000,    70184.6579,    88427.1282,    111411.2000,
40
+    140369.3159,   176854.2563,   222822.4000,   280738.6324,   353708.5127,
41
+    445644.8001,   561477.2648,   707417.0237,   891289.6000,   1122954.5277,
42
+    1414834.0484,  1782579.2003,  2245909.0566,  2829668.0981,  3565158.4000,
43
+    4491818.1146,  5659336.1938,  7130316.8013,  8983636.2264,  11318672.3923,
44
+    14260633.6000, 17967272.4585, 22637344.7751, 28521267.1953, 35934544.9165,
45
+    45274689.5567, 57042534.4000, 71869089.8338, 90549379.1181, 114085068.8008
46
+};
47
+
48
+#elif X265_DEPTH == 10
49
 
50
-#if HIGH_BIT_DEPTH
51
 // lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8));
52
 double x265_lambda_tab[QP_MAX_MAX + 1] =
53
 {
54
@@ -324,11 +365,12 @@
55
       4,  12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
56
 };
57
 
58
-ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
59
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]) =
60
 {
61
     { 0,  4,  1,  8,  5,  2, 12,  9,  6,  3, 13, 10,  7, 14, 11, 15 },
62
     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
63
-    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 }
64
+    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 },
65
+    { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }
66
 };
67
 
68
 const uint16_t g_scan16x16[16 * 16] =
69
x265_1.7.tar.gz/source/common/constants.h -> x265_1.8.tar.gz/source/common/constants.h Changed
19
 
1
@@ -26,7 +26,7 @@
2
 
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 extern int g_ctuSizeConfigured;
10
@@ -83,7 +83,7 @@
11
 extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
12
 extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
13
 extern const uint16_t g_scan8x8diag[8 * 8];
14
-extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4];
15
+extern const uint16_t g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4];  // +1 for safe buffer area for codeCoeffNxN assembly optimize, there have up to 15 bytes beyond bound read
16
 
17
 extern const uint8_t g_lastCoeffTable[32];
18
 extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes
19
x265_1.7.tar.gz/source/common/contexts.h -> x265_1.8.tar.gz/source/common/contexts.h Changed
16
 
1
@@ -102,11 +102,12 @@
2
 #define OFF_TQUANT_BYPASS_FLAG_CTX (OFF_TRANSFORMSKIP_FLAG_CTX + 2 * NUM_TRANSFORMSKIP_FLAG_CTX)
3
 #define MAX_OFF_CTX_MOD            (OFF_TQUANT_BYPASS_FLAG_CTX +     NUM_TQUANT_BYPASS_FLAG_CTX)
4
 
5
-namespace x265 {
6
+extern "C" const uint32_t PFX(entropyStateBits)[128];
7
+
8
+namespace X265_NS {
9
 // private namespace
10
 
11
 extern const uint32_t g_entropyBits[128];
12
-extern const uint32_t g_entropyStateBits[128];
13
 extern const uint8_t g_nextState[128][2];
14
 
15
 #define sbacGetMps(S)            ((S) & 1)
16
x265_1.7.tar.gz/source/common/cpu.cpp -> x265_1.8.tar.gz/source/common/cpu.cpp Changed
137
 
1
@@ -57,7 +57,7 @@
2
 
3
 #endif // if X265_ARCH_ARM
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 const cpu_name_t cpu_names[] =
8
 {
9
 #if X265_ARCH_X86
10
@@ -107,9 +107,9 @@
11
 
12
 extern "C" {
13
 /* cpu-a.asm */
14
-int x265_cpu_cpuid_test(void);
15
-void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
16
-void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
17
+int PFX(cpu_cpuid_test)(void);
18
+void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
19
+void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
20
 }
21
 
22
 #if defined(_MSC_VER)
23
@@ -125,16 +125,16 @@
24
     uint32_t max_extended_cap, max_basic_cap;
25
 
26
 #if !X86_64
27
-    if (!x265_cpu_cpuid_test())
28
+    if (!PFX(cpu_cpuid_test)())
29
         return 0;
30
 #endif
31
 
32
-    x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
33
+    PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
34
     max_basic_cap = eax;
35
     if (max_basic_cap == 0)
36
         return 0;
37
 
38
-    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
39
+    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
40
     if (edx & 0x00800000)
41
         cpu |= X265_CPU_MMX;
42
     else
43
@@ -159,7 +159,7 @@
44
     if ((ecx & 0x18000000) == 0x18000000)
45
     {
46
         /* Check for OS support */
47
-        x265_cpu_xgetbv(0, &eax, &edx);
48
+        PFX(cpu_xgetbv)(0, &eax, &edx);
49
         if ((eax & 0x6) == 0x6)
50
         {
51
             cpu |= X265_CPU_AVX;
52
@@ -170,7 +170,7 @@
53
 
54
     if (max_basic_cap >= 7)
55
     {
56
-        x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
57
+        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
58
         /* AVX2 requires OS support, but BMI1/2 don't. */
59
         if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
60
             cpu |= X265_CPU_AVX2;
61
@@ -185,12 +185,12 @@
62
     if (cpu & X265_CPU_SSSE3)
63
         cpu |= X265_CPU_SSE2_IS_FAST;
64
 
65
-    x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
66
+    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
67
     max_extended_cap = eax;
68
 
69
     if (max_extended_cap >= 0x80000001)
70
     {
71
-        x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
72
+        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
73
 
74
         if (ecx & 0x00000020)
75
             cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
76
@@ -233,7 +233,7 @@
77
 
78
     if (!strcmp((char*)vendor, "GenuineIntel"))
79
     {
80
-        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
81
+        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
82
         int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
83
         int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
84
         if (family == 6)
85
@@ -264,11 +264,11 @@
86
     if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
87
     {
88
         /* cacheline size is specified in 3 places, any of which may be missing */
89
-        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
90
+        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
91
         int cache = (ebx & 0xff00) >> 5; // cflush size
92
         if (!cache && max_extended_cap >= 0x80000006)
93
         {
94
-            x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
95
+            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
96
             cache = ecx & 0xff; // cacheline size
97
         }
98
         if (!cache && max_basic_cap >= 2)
99
@@ -281,7 +281,7 @@
100
             int max, i = 0;
101
             do
102
             {
103
-                x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
104
+                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
105
                 max = buf[0] & 0xff;
106
                 buf[0] &= ~0xff;
107
                 for (int j = 0; j < 4; j++)
108
@@ -318,8 +318,8 @@
109
 #elif X265_ARCH_ARM
110
 
111
 extern "C" {
112
-void x265_cpu_neon_test(void);
113
-int x265_cpu_fast_neon_mrc_test(void);
114
+void PFX(cpu_neon_test)(void);
115
+int PFX(cpu_fast_neon_mrc_test)(void);
116
 }
117
 
118
 uint32_t cpu_detect(void)
119
@@ -340,7 +340,7 @@
120
     }
121
 
122
     canjump = 1;
123
-    x265_cpu_neon_test();
124
+    PFX(cpu_neon_test)();
125
     canjump = 0;
126
     signal(SIGILL, oldsig);
127
 #endif // if !HAVE_NEON
128
@@ -356,7 +356,7 @@
129
     // which may result in incorrect detection and the counters stuck enabled.
130
     // right now Apple does not seem to support performance counters for this test
131
 #ifndef __MACH__
132
-    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
133
+    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
134
 #endif
135
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
136
 #endif // if HAVE_ARMV6
137
x265_1.7.tar.gz/source/common/cpu.h -> x265_1.8.tar.gz/source/common/cpu.h Changed
37
 
1
@@ -27,24 +27,29 @@
2
 
3
 #include "common.h"
4
 
5
+/* All assembly functions are prefixed with X265_NS (macro expanded) */
6
+#define PFX3(prefix, name) prefix ## _ ## name
7
+#define PFX2(prefix, name) PFX3(prefix, name)
8
+#define PFX(name)          PFX2(X265_NS, name)
9
+
10
 // from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp
11
-extern "C" void x265_cpu_emms(void);
12
-extern "C" void x265_safe_intel_cpu_indicator_init(void);
13
+extern "C" void PFX(cpu_emms)(void);
14
+extern "C" void PFX(safe_intel_cpu_indicator_init)(void);
15
 
16
 #if _MSC_VER && _WIN64
17
-#define x265_emms() x265_cpu_emms()
18
+#define x265_emms() PFX(cpu_emms)()
19
 #elif _MSC_VER
20
 #include <mmintrin.h>
21
 #define x265_emms() _mm_empty()
22
 #elif __GNUC__
23
 // Cannot use _mm_empty() directly without compiling all the source with
24
 // a fixed CPU arch, which we would like to avoid at the moment
25
-#define x265_emms() x265_cpu_emms()
26
+#define x265_emms() PFX(cpu_emms)()
27
 #else
28
-#define x265_emms() x265_cpu_emms()
29
+#define x265_emms() PFX(cpu_emms)()
30
 #endif
31
 
32
-namespace x265 {
33
+namespace X265_NS {
34
 uint32_t cpu_detect(void);
35
 
36
 struct cpu_name_t
37
x265_1.7.tar.gz/source/common/cudata.cpp -> x265_1.8.tar.gz/source/common/cudata.cpp Changed
92
 
1
@@ -28,33 +28,33 @@
2
 #include "mv.h"
3
 #include "cudata.h"
4
 
5
-using namespace x265;
6
-
7
-namespace {
8
-// file private namespace
9
+using namespace X265_NS;
10
 
11
 /* for all bcast* and copy* functions, dst and src are aligned to MIN(size, 32) */
12
 
13
-void bcast1(uint8_t* dst, uint8_t val)  { dst[0] = val; }
14
+static void bcast1(uint8_t* dst, uint8_t val)  { dst[0] = val; }
15
 
16
-void copy4(uint8_t* dst, uint8_t* src)  { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; }
17
-void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101u * val; }
18
+static void copy4(uint8_t* dst, uint8_t* src)  { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; }
19
+static void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101u * val; }
20
 
21
-void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; }
22
-void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; }
23
+static void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; }
24
+static void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; }
25
 
26
-void copy64(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; 
27
-                                          ((uint64_t*)dst)[2] = ((uint64_t*)src)[2]; ((uint64_t*)dst)[3] = ((uint64_t*)src)[3];
28
-                                          ((uint64_t*)dst)[4] = ((uint64_t*)src)[4]; ((uint64_t*)dst)[5] = ((uint64_t*)src)[5];
29
-                                          ((uint64_t*)dst)[6] = ((uint64_t*)src)[6]; ((uint64_t*)dst)[7] = ((uint64_t*)src)[7]; }
30
-void bcast64(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val;
31
-                                          ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; ((uint64_t*)dst)[2] = bval; ((uint64_t*)dst)[3] = bval;
32
-                                          ((uint64_t*)dst)[4] = bval; ((uint64_t*)dst)[5] = bval; ((uint64_t*)dst)[6] = bval; ((uint64_t*)dst)[7] = bval; }
33
+static void copy64(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; 
34
+                                                 ((uint64_t*)dst)[2] = ((uint64_t*)src)[2]; ((uint64_t*)dst)[3] = ((uint64_t*)src)[3];
35
+                                                 ((uint64_t*)dst)[4] = ((uint64_t*)src)[4]; ((uint64_t*)dst)[5] = ((uint64_t*)src)[5];
36
+                                                 ((uint64_t*)dst)[6] = ((uint64_t*)src)[6]; ((uint64_t*)dst)[7] = ((uint64_t*)src)[7]; }
37
+static void bcast64(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val;
38
+                                                 ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; ((uint64_t*)dst)[2] = bval; ((uint64_t*)dst)[3] = bval;
39
+                                                 ((uint64_t*)dst)[4] = bval; ((uint64_t*)dst)[5] = bval; ((uint64_t*)dst)[6] = bval; ((uint64_t*)dst)[7] = bval; }
40
 
41
 /* at 256 bytes, memset/memcpy will probably use SIMD more effectively than our uint64_t hack,
42
  * but hand-written assembly would beat it. */
43
-void copy256(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 256); }
44
-void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
45
+static void copy256(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 256); }
46
+static void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
47
+
48
+namespace {
49
+// file private namespace
50
 
51
 /* Check whether 2 addresses point to the same column */
52
 inline bool isEqualCol(int addrA, int addrB, int numUnits)
53
@@ -112,38 +112,6 @@
54
     return MV((int16_t)mvx, (int16_t)mvy);
55
 }
56
 
57
-// Partition table.
58
-// First index is partitioning mode. Second index is partition index.
59
-// Third index is 0 for partition sizes, 1 for partition offsets. The 
60
-// sizes and offsets are encoded as two packed 4-bit values (X,Y). 
61
-// X and Y represent 1/4 fractions of the block size.
62
-const uint32_t partTable[8][4][2] =
63
-{
64
-    //        XY
65
-    { { 0x44, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2Nx2N.
66
-    { { 0x42, 0x00 }, { 0x42, 0x02 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxN.
67
-    { { 0x24, 0x00 }, { 0x24, 0x20 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_Nx2N.
68
-    { { 0x22, 0x00 }, { 0x22, 0x20 }, { 0x22, 0x02 }, { 0x22, 0x22 } }, // SIZE_NxN.
69
-    { { 0x41, 0x00 }, { 0x43, 0x01 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnU.
70
-    { { 0x43, 0x00 }, { 0x41, 0x03 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnD.
71
-    { { 0x14, 0x00 }, { 0x34, 0x10 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_nLx2N.
72
-    { { 0x34, 0x00 }, { 0x14, 0x30 }, { 0x00, 0x00 }, { 0x00, 0x00 } }  // SIZE_nRx2N.
73
-};
74
-
75
-// Partition Address table.
76
-// First index is partitioning mode. Second index is partition address.
77
-const uint32_t partAddrTable[8][4] =
78
-{
79
-    { 0x00, 0x00, 0x00, 0x00 }, // SIZE_2Nx2N.
80
-    { 0x00, 0x08, 0x08, 0x08 }, // SIZE_2NxN.
81
-    { 0x00, 0x04, 0x04, 0x04 }, // SIZE_Nx2N.
82
-    { 0x00, 0x04, 0x08, 0x0C }, // SIZE_NxN.
83
-    { 0x00, 0x02, 0x02, 0x02 }, // SIZE_2NxnU.
84
-    { 0x00, 0x0A, 0x0A, 0x0A }, // SIZE_2NxnD.
85
-    { 0x00, 0x01, 0x01, 0x01 }, // SIZE_nLx2N.
86
-    { 0x00, 0x05, 0x05, 0x05 }  // SIZE_nRx2N.
87
-};
88
-
89
 }
90
 
91
 cubcast_t CUData::s_partSet[NUM_FULL_DEPTH] = { NULL, NULL, NULL, NULL, NULL };
92
x265_1.7.tar.gz/source/common/cudata.h -> x265_1.8.tar.gz/source/common/cudata.h Changed
62
 
1
@@ -28,7 +28,7 @@
2
 #include "slice.h"
3
 #include "mv.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class FrameData;
10
@@ -121,6 +121,38 @@
11
 // Partition count table, index represents partitioning mode.
12
 const uint32_t nbPartsTable[8] = { 1, 2, 2, 4, 2, 2, 2, 2 };
13
 
14
+// Partition table.
15
+// First index is partitioning mode. Second index is partition index.
16
+// Third index is 0 for partition sizes, 1 for partition offsets. The 
17
+// sizes and offsets are encoded as two packed 4-bit values (X,Y). 
18
+// X and Y represent 1/4 fractions of the block size.
19
+const uint32_t partTable[8][4][2] =
20
+{
21
+    //        XY
22
+    { { 0x44, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2Nx2N.
23
+    { { 0x42, 0x00 }, { 0x42, 0x02 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxN.
24
+    { { 0x24, 0x00 }, { 0x24, 0x20 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_Nx2N.
25
+    { { 0x22, 0x00 }, { 0x22, 0x20 }, { 0x22, 0x02 }, { 0x22, 0x22 } }, // SIZE_NxN.
26
+    { { 0x41, 0x00 }, { 0x43, 0x01 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnU.
27
+    { { 0x43, 0x00 }, { 0x41, 0x03 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnD.
28
+    { { 0x14, 0x00 }, { 0x34, 0x10 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_nLx2N.
29
+    { { 0x34, 0x00 }, { 0x14, 0x30 }, { 0x00, 0x00 }, { 0x00, 0x00 } }  // SIZE_nRx2N.
30
+};
31
+
32
+// Partition Address table.
33
+// First index is partitioning mode. Second index is partition address.
34
+const uint32_t partAddrTable[8][4] =
35
+{
36
+    { 0x00, 0x00, 0x00, 0x00 }, // SIZE_2Nx2N.
37
+    { 0x00, 0x08, 0x08, 0x08 }, // SIZE_2NxN.
38
+    { 0x00, 0x04, 0x04, 0x04 }, // SIZE_Nx2N.
39
+    { 0x00, 0x04, 0x08, 0x0C }, // SIZE_NxN.
40
+    { 0x00, 0x02, 0x02, 0x02 }, // SIZE_2NxnU.
41
+    { 0x00, 0x0A, 0x0A, 0x0A }, // SIZE_2NxnD.
42
+    { 0x00, 0x01, 0x01, 0x01 }, // SIZE_nLx2N.
43
+    { 0x00, 0x05, 0x05, 0x05 }  // SIZE_nRx2N.
44
+};
45
+
46
 // Holds part data for a CU of a given size, from an 8x8 CU to a CTU
47
 class CUData
48
 {
49
@@ -222,8 +254,11 @@
50
     void     getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const;
51
     void     getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
52
     void     getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
53
+    uint32_t getBestRefIdx(uint32_t subPartIdx) const { return ((m_interDir[subPartIdx] & 1) << m_refIdx[0][subPartIdx]) | 
54
+                                                              (((m_interDir[subPartIdx] >> 1) & 1) << (m_refIdx[1][subPartIdx] + 16)); }
55
+    uint32_t getPUOffset(uint32_t puIdx, uint32_t absPartIdx) const { return (partAddrTable[(int)m_partSize[absPartIdx]][puIdx] << (g_unitSizeDepth - m_cuDepth[absPartIdx]) * 2) >> 4; }
56
 
57
-    uint32_t getNumPartInter() const              { return nbPartsTable[(int)m_partSize[0]]; }
58
+    uint32_t getNumPartInter(uint32_t absPartIdx) const              { return nbPartsTable[(int)m_partSize[absPartIdx]]; }
59
     bool     isIntra(uint32_t absPartIdx) const   { return m_predMode[absPartIdx] == MODE_INTRA; }
60
     bool     isInter(uint32_t absPartIdx) const   { return !!(m_predMode[absPartIdx] & MODE_INTER); }
61
     bool     isSkipped(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_SKIP; }
62
x265_1.7.tar.gz/source/common/dct.cpp -> x265_1.8.tar.gz/source/common/dct.cpp Changed
201
 
1
@@ -29,19 +29,18 @@
2
 
3
 #include "common.h"
4
 #include "primitives.h"
5
+#include "contexts.h"   // costCoeffNxN_c
6
+#include "threading.h"  // CLZ
7
 
8
-using namespace x265;
9
+using namespace X265_NS;
10
 
11
 #if _MSC_VER
12
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
13
 #endif
14
 
15
-namespace {
16
-// anonymous file-static namespace
17
-
18
 // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
19
 // give identical results
20
-void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
21
+static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
22
 {
23
     int c[4];
24
     int rnd_factor = 1 << (shift - 1);
25
@@ -61,7 +60,7 @@
26
     }
27
 }
28
 
29
-void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
30
+static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
31
 {
32
     int i, c[4];
33
     int rnd_factor = 1 << (shift - 1);
34
@@ -81,7 +80,7 @@
35
     }
36
 }
37
 
38
-void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
39
+static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
40
 {
41
     int j, k;
42
     int E[8], O[8];
43
@@ -134,7 +133,7 @@
44
     }
45
 }
46
 
47
-void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
48
+static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
49
 {
50
     int j, k;
51
     int E[16], O[16];
52
@@ -203,7 +202,7 @@
53
     }
54
 }
55
 
56
-void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
57
+static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
58
 {
59
     int j, k;
60
     int E[4], O[4];
61
@@ -240,7 +239,7 @@
62
     }
63
 }
64
 
65
-void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
66
+static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
67
 {
68
     int j;
69
     int E[2], O[2];
70
@@ -265,7 +264,7 @@
71
     }
72
 }
73
 
74
-void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
75
+static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
76
 {
77
     int j, k;
78
     int E[4], O[4];
79
@@ -301,7 +300,7 @@
80
     }
81
 }
82
 
83
-void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
84
+static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
85
 {
86
     int j, k;
87
     int E[8], O[8];
88
@@ -352,7 +351,7 @@
89
     }
90
 }
91
 
92
-void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
93
+static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
94
 {
95
     int j, k;
96
     int E[16], O[16];
97
@@ -416,7 +415,7 @@
98
     }
99
 }
100
 
101
-void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
102
+static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
103
 {
104
     int j;
105
     int E[2], O[2];
106
@@ -440,7 +439,7 @@
107
     }
108
 }
109
 
110
-void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
111
+static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
112
 {
113
     const int shift_1st = 1 + X265_DEPTH - 8;
114
     const int shift_2nd = 8;
115
@@ -457,7 +456,7 @@
116
     fastForwardDst(coef, dst, shift_2nd);
117
 }
118
 
119
-void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
120
+static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
121
 {
122
     const int shift_1st = 1 + X265_DEPTH - 8;
123
     const int shift_2nd = 8;
124
@@ -474,7 +473,7 @@
125
     partialButterfly4(coef, dst, shift_2nd, 4);
126
 }
127
 
128
-void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
129
+static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
130
 {
131
     const int shift_1st = 2 + X265_DEPTH - 8;
132
     const int shift_2nd = 9;
133
@@ -491,7 +490,7 @@
134
     partialButterfly8(coef, dst, shift_2nd, 8);
135
 }
136
 
137
-void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
138
+static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
139
 {
140
     const int shift_1st = 3 + X265_DEPTH - 8;
141
     const int shift_2nd = 10;
142
@@ -508,7 +507,7 @@
143
     partialButterfly16(coef, dst, shift_2nd, 16);
144
 }
145
 
146
-void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
147
+static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
148
 {
149
     const int shift_1st = 4 + X265_DEPTH - 8;
150
     const int shift_2nd = 11;
151
@@ -525,7 +524,7 @@
152
     partialButterfly32(coef, dst, shift_2nd, 32);
153
 }
154
 
155
-void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
156
+static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
157
 {
158
     const int shift_1st = 7;
159
     const int shift_2nd = 12 - (X265_DEPTH - 8);
160
@@ -542,7 +541,7 @@
161
     }
162
 }
163
 
164
-void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
165
+static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
166
 {
167
     const int shift_1st = 7;
168
     const int shift_2nd = 12 - (X265_DEPTH - 8);
169
@@ -559,7 +558,7 @@
170
     }
171
 }
172
 
173
-void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
174
+static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
175
 {
176
     const int shift_1st = 7;
177
     const int shift_2nd = 12 - (X265_DEPTH - 8);
178
@@ -576,7 +575,7 @@
179
     }
180
 }
181
 
182
-void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
183
+static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
184
 {
185
     const int shift_1st = 7;
186
     const int shift_2nd = 12 - (X265_DEPTH - 8);
187
@@ -593,7 +592,7 @@
188
     }
189
 }
190
 
191
-void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
192
+static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
193
 {
194
     const int shift_1st = 7;
195
     const int shift_2nd = 12 - (X265_DEPTH - 8);
196
@@ -610,10 +609,10 @@
197
     }
198
 }
199
 
200
-void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
201
x265_1.7.tar.gz/source/common/deblock.cpp -> x265_1.8.tar.gz/source/common/deblock.cpp Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "slice.h"
3
 #include "mv.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #define DEBLOCK_SMALLEST_BLOCK  8
9
 #define DEFAULT_INTRA_TC_OFFSET 2
10
x265_1.7.tar.gz/source/common/deblock.h -> x265_1.8.tar.gz/source/common/deblock.h Changed
10
 
1
@@ -26,7 +26,7 @@
2
 
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class CUData;
10
x265_1.7.tar.gz/source/common/frame.cpp -> x265_1.8.tar.gz/source/common/frame.cpp Changed
10
 
1
@@ -26,7 +26,7 @@
2
 #include "picyuv.h"
3
 #include "framedata.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 Frame::Frame()
9
 {
10
x265_1.7.tar.gz/source/common/frame.h -> x265_1.8.tar.gz/source/common/frame.h Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "lowres.h"
3
 #include "threading.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class FrameData;
10
x265_1.7.tar.gz/source/common/framedata.cpp -> x265_1.8.tar.gz/source/common/framedata.cpp Changed
10
 
1
@@ -24,7 +24,7 @@
2
 #include "framedata.h"
3
 #include "picyuv.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 FrameData::FrameData()
9
 {
10
x265_1.7.tar.gz/source/common/framedata.h -> x265_1.8.tar.gz/source/common/framedata.h Changed
72
 
1
@@ -28,12 +28,61 @@
2
 #include "slice.h"
3
 #include "cudata.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class PicYuv;
10
 class JobProvider;
11
 
12
+#define INTER_MODES 4 // 2Nx2N, 2NxN, Nx2N, AMP modes
13
+#define INTRA_MODES 3 // DC, Planar, Angular modes
14
+
15
+/* Current frame stats for 2 pass */
16
+struct FrameStats
17
+{
18
+    int         mvBits;    /* MV bits (MV+Ref+Block Type) */
19
+    int         coeffBits; /* Texture bits (DCT coefs) */
20
+    int         miscBits;
21
+
22
+    int         intra8x8Cnt;
23
+    int         inter8x8Cnt;
24
+    int         skip8x8Cnt;
25
+
26
+    /* CU type counts stored as percentage */
27
+    double      percent8x8Intra;
28
+    double      percent8x8Inter;
29
+    double      percent8x8Skip;
30
+    double      avgLumaDistortion;
31
+    double      avgChromaDistortion;
32
+    double      avgPsyEnergy;
33
+    double      avgLumaLevel;
34
+    double      lumaLevel;
35
+    double      percentIntraNxN;
36
+    double      percentSkipCu[NUM_CU_DEPTH];
37
+    double      percentMergeCu[NUM_CU_DEPTH];
38
+    double      percentIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
39
+    double      percentInterDistribution[NUM_CU_DEPTH][3];           // 2Nx2N, RECT, AMP modes percentage
40
+
41
+    uint64_t    cntIntraNxN;
42
+    uint64_t    totalCu;
43
+    uint64_t    totalCtu;
44
+    uint64_t    lumaDistortion;
45
+    uint64_t    chromaDistortion;
46
+    uint64_t    psyEnergy;
47
+    uint64_t    cntSkipCu[NUM_CU_DEPTH];
48
+    uint64_t    cntMergeCu[NUM_CU_DEPTH];
49
+    uint64_t    cntInter[NUM_CU_DEPTH];
50
+    uint64_t    cntIntra[NUM_CU_DEPTH];
51
+    uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
52
+    uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
53
+    uint16_t    maxLumaLevel;
54
+
55
+    FrameStats()
56
+    {
57
+        memset(this, 0, sizeof(FrameStats));
58
+    }
59
+};
60
+
61
 /* Per-frame data that is used during encodes and referenced while the picture
62
  * is available for reference. A FrameData instance is attached to a Frame as it
63
  * comes out of the lookahead. Frames which are not being encoded do not have a
64
@@ -85,6 +134,7 @@
65
 
66
     RCStatCU*      m_cuStat;
67
     RCStatRow*     m_rowStat;
68
+    FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
69
 
70
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
71
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
72
x265_1.7.tar.gz/source/common/intrapred.cpp -> x265_1.8.tar.gz/source/common/intrapred.cpp Changed
28
 
1
@@ -24,7 +24,7 @@
2
 #include "common.h"
3
 #include "primitives.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 namespace {
9
 
10
@@ -50,7 +50,7 @@
11
     filtered[tuSize2 + tuSize2] = leftLast;
12
 }
13
 
14
-void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
15
+static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
16
 {
17
     // boundary pixels processing
18
     dst[0] = (pixel)((above[0] + left[0] + 2 * dst[0] + 2) >> 2);
19
@@ -234,7 +234,7 @@
20
 }
21
 }
22
 
23
-namespace x265 {
24
+namespace X265_NS {
25
 // x265 private namespace
26
 
27
 void setupIntraPrimitives_c(EncoderPrimitives& p)
28
x265_1.7.tar.gz/source/common/ipfilter.cpp -> x265_1.8.tar.gz/source/common/ipfilter.cpp Changed
36
 
1
@@ -27,13 +27,15 @@
2
 #include "primitives.h"
3
 #include "x265.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #if _MSC_VER
9
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
10
 #endif
11
 
12
 namespace {
13
+// file local namespace
14
+
15
 template<int width, int height>
16
 void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
17
 {
18
@@ -53,7 +55,7 @@
19
     }
20
 }
21
 
22
-void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX)
23
+static void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX)
24
 {
25
     for (int y = 0; y < height; y++)
26
     {
27
@@ -369,7 +371,7 @@
28
 }
29
 }
30
 
31
-namespace x265 {
32
+namespace X265_NS {
33
 // x265 private namespace
34
 
35
 #define CHROMA_420(W, H) \
36
x265_1.7.tar.gz/source/common/loopfilter.cpp -> x265_1.8.tar.gz/source/common/loopfilter.cpp Changed
71
 
1
@@ -36,13 +36,13 @@
2
     return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
3
 }
4
 
5
-void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
6
+static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
7
 {
8
     for (int x = 0; x < endX; x++)
9
         dst[x] = signOf(src1[x] - src2[x]);
10
 }
11
 
12
-void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
13
+static void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
14
 {
15
     int x, y;
16
     int8_t signRight, signLeft0;
17
@@ -62,7 +62,7 @@
18
     }
19
 }
20
 
21
-void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
22
+static void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
23
 {
24
     int x;
25
     int8_t signDown;
26
@@ -77,7 +77,7 @@
27
     }
28
 }
29
 
30
-void processSaoCUE1_2Rows(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
31
+static void processSaoCUE1_2Rows(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
32
 {
33
     int x, y;
34
     int8_t signDown;
35
@@ -96,7 +96,7 @@
36
     }
37
 }
38
 
39
-void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
40
+static void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
41
 {
42
     int x;
43
     for (x = 0; x < width; x++)
44
@@ -108,7 +108,7 @@
45
     }
46
 }
47
 
48
-void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
49
+static void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
50
 {
51
     int8_t signDown;
52
     int8_t edgeType;
53
@@ -122,7 +122,7 @@
54
     }
55
 }
56
 
57
-void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
58
+static void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
59
 {
60
     #define SAO_BO_BITS 5
61
     const int boShift = X265_DEPTH - SAO_BO_BITS;
62
@@ -138,7 +138,7 @@
63
 }
64
 }
65
 
66
-namespace x265 {
67
+namespace X265_NS {
68
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
69
 {
70
     p.saoCuOrgE0 = processSaoCUE0;
71
x265_1.7.tar.gz/source/common/lowres.cpp -> x265_1.8.tar.gz/source/common/lowres.cpp Changed
47
 
1
@@ -25,7 +25,7 @@
2
 #include "lowres.h"
3
 #include "mv.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled)
9
 {
10
@@ -36,13 +36,13 @@
11
     lumaStride = width + 2 * origPic->m_lumaMarginX;
12
     if (lumaStride & 31)
13
         lumaStride += 32 - (lumaStride & 31);
14
-    int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
15
-    int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
16
-    int cuCount = cuWidth * cuHeight;
17
+    maxBlocksInRow = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
18
+    maxBlocksInCol = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
19
+    int cuCount = maxBlocksInRow * maxBlocksInCol;
20
 
21
     /* rounding the width to multiple of lowres CU size */
22
-    width = cuWidth * X265_LOWRES_CU_SIZE;
23
-    lines = cuHeight * X265_LOWRES_CU_SIZE;
24
+    width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
25
+    lines = maxBlocksInCol * X265_LOWRES_CU_SIZE;
26
 
27
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
28
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
29
@@ -74,7 +74,7 @@
30
     {
31
         for (int j = 0; j < bframes + 2; j++)
32
         {
33
-            CHECKED_MALLOC(rowSatds[i][j], int32_t, cuHeight);
34
+            CHECKED_MALLOC(rowSatds[i][j], int32_t, maxBlocksInCol);
35
             CHECKED_MALLOC(lowresCosts[i][j], uint16_t, cuCount);
36
         }
37
     }
38
@@ -126,7 +126,7 @@
39
 void Lowres::init(PicYuv *origPic, int poc)
40
 {
41
     bLastMiniGopBFrame = false;
42
-    bScenecut = true;  // could be a scene-cut, until ruled out by flash detection
43
+    bScenecut = false;  // could be a scene-cut, until ruled out by flash detection
44
     bKeyframe = false; // Not a keyframe unless identified by lookahead
45
     frameNum = poc;
46
     leadingBframes = 0;
47
x265_1.7.tar.gz/source/common/lowres.h -> x265_1.8.tar.gz/source/common/lowres.h Changed
19
 
1
@@ -29,7 +29,7 @@
2
 #include "picyuv.h"
3
 #include "mv.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 struct ReferencePlanes
10
@@ -130,6 +130,8 @@
11
     uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]);
12
     int32_t*  lowresMvCosts[2][X265_BFRAME_MAX + 1];
13
     MV*       lowresMvs[2][X265_BFRAME_MAX + 1];
14
+    uint32_t  maxBlocksInRow;
15
+    uint32_t  maxBlocksInCol;
16
 
17
     /* used for vbvLookahead */
18
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
19
x265_1.7.tar.gz/source/common/md5.cpp -> x265_1.8.tar.gz/source/common/md5.cpp Changed
10
 
1
@@ -25,7 +25,7 @@
2
 #include "common.h"
3
 #include "md5.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 #ifndef ARCH_BIG_ENDIAN
10
x265_1.7.tar.gz/source/common/md5.h -> x265_1.8.tar.gz/source/common/md5.h Changed
10
 
1
@@ -27,7 +27,7 @@
2
 
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 //private x265 namespace
8
 
9
 typedef struct MD5Context
10
x265_1.7.tar.gz/source/common/mv.h -> x265_1.8.tar.gz/source/common/mv.h Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "primitives.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 #if _MSC_VER
10
x265_1.7.tar.gz/source/common/param.cpp -> x265_1.8.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -52,7 +52,7 @@
2
  */
3
 
4
 #undef strtok_r
5
-char* strtok_r(char* str, const char* delim, char** nextp)
6
+static char* strtok_r(char* str, const char* delim, char** nextp)
7
 {
8
     if (!str)
9
         str = *nextp;
10
@@ -76,27 +76,35 @@
11
 
12
 #endif // if !defined(HAVE_STRTOK_R)
13
 
14
-using namespace x265;
15
+#if EXPORT_C_API
16
+
17
+/* these functions are exported as C functions (default) */
18
+using namespace X265_NS;
19
+extern "C" {
20
+
21
+#else
22
+
23
+/* these functions exist within private namespace (multilib) */
24
+namespace X265_NS {
25
+
26
+#endif
27
 
28
-extern "C"
29
 x265_param *x265_param_alloc()
30
 {
31
     return (x265_param*)x265_malloc(sizeof(x265_param));
32
 }
33
 
34
-extern "C"
35
 void x265_param_free(x265_param* p)
36
 {
37
     x265_free(p);
38
 }
39
 
40
-extern "C"
41
 void x265_param_default(x265_param* param)
42
 {
43
     memset(param, 0, sizeof(x265_param));
44
 
45
     /* Applying default values to all elements in the param structure */
46
-    param->cpuid = x265::cpu_detect();
47
+    param->cpuid = X265_NS::cpu_detect();
48
     param->bEnableWavefront = 1;
49
     param->frameNumThreads = 0;
50
 
51
@@ -111,7 +119,7 @@
52
     param->bEnableSsim = 0;
53
 
54
     /* Source specifications */
55
-    param->internalBitDepth = x265_max_bit_depth;
56
+    param->internalBitDepth = X265_DEPTH;
57
     param->internalCsp = X265_CSP_I420;
58
 
59
     param->levelIdc = 0;
60
@@ -151,6 +159,7 @@
61
     param->subpelRefine = 2;
62
     param->searchRange = 57;
63
     param->maxNumMergeCand = 2;
64
+    param->limitReferences = 0;
65
     param->bEnableWeightedPred = 1;
66
     param->bEnableWeightedBiPred = 0;
67
     param->bEnableEarlySkip = 0;
68
@@ -197,6 +206,7 @@
69
     param->rc.rateControlMode = X265_RC_CRF;
70
     param->rc.qp = 32;
71
     param->rc.aqMode = X265_AQ_VARIANCE;
72
+    param->rc.qgSize = 32;
73
     param->rc.aqStrength = 1.0;
74
     param->rc.cuTree = 1;
75
     param->rc.rfConstantMax = 0;
76
@@ -210,7 +220,6 @@
77
     param->rc.zones = NULL;
78
     param->rc.bEnableSlowFirstPass = 0;
79
     param->rc.bStrictCbr = 0;
80
-    param->rc.qgSize = 64; /* Same as maxCUSize */
81
 
82
     /* Video Usability Information (VUI) */
83
     param->vui.aspectRatioIdc = 0;
84
@@ -234,10 +243,13 @@
85
     param->vui.defDispWinBottomOffset = 0;
86
 }
87
 
88
-extern "C"
89
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
90
 {
91
-    x265_param_default(param);
92
+#if EXPORT_C_API
93
+    ::x265_param_default(param);
94
+#else
95
+    X265_NS::x265_param_default(param);
96
+#endif
97
 
98
     if (preset)
99
     {
100
@@ -430,8 +442,8 @@
101
             param->deblockingFilterBetaOffset = -2;
102
             param->deblockingFilterTCOffset = -2;
103
             param->bIntraInBFrames = 0;
104
-            param->rdoqLevel = 1;
105
-            param->psyRdoq = 30;
106
+            param->rdoqLevel = 2;
107
+            param->psyRdoq = 10.0;
108
             param->psyRd = 0.5;
109
             param->rc.ipFactor = 1.1;
110
             param->rc.pbFactor = 1.1;
111
@@ -459,16 +471,6 @@
112
     return 0;
113
 }
114
 
115
-static double x265_atof(const char* str, bool& bError)
116
-{
117
-    char *end;
118
-    double v = strtod(str, &end);
119
-
120
-    if (end == str || *end != '\0')
121
-        bError = true;
122
-    return v;
123
-}
124
-
125
 static int parseName(const char* arg, const char* const* names, bool& bError)
126
 {
127
     for (int i = 0; names[i]; i++)
128
@@ -485,7 +487,6 @@
129
 #define atof(str) x265_atof(str, bError)
130
 #define atobool(str) (bNameWasBool = true, x265_atobool(str, bError))
131
 
132
-extern "C"
133
 int x265_param_parse(x265_param* p, const char* name, const char* value)
134
 {
135
     bool bError = false;
136
@@ -581,6 +582,7 @@
137
         }
138
     }
139
     OPT("cu-stats") p->bLogCuStats = atobool(value);
140
+    OPT("total-frames") p->totalFrames = atoi(value);
141
     OPT("annexb") p->bAnnexB = atobool(value);
142
     OPT("repeat-headers") p->bRepeatHeaders = atobool(value);
143
     OPT("wpp") p->bEnableWavefront = atobool(value);
144
@@ -641,6 +643,7 @@
145
         }
146
     }
147
     OPT("ref") p->maxNumReferences = atoi(value);
148
+    OPT("limit-refs") p->limitReferences = atoi(value);
149
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
150
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
151
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
152
@@ -827,7 +830,7 @@
153
         p->vui.chromaSampleLocTypeTopField = atoi(value);
154
         p->vui.chromaSampleLocTypeBottomField = p->vui.chromaSampleLocTypeTopField;
155
     }
156
-    OPT("crop-rect")
157
+    OPT2("display-window", "crop-rect")
158
     {
159
         p->vui.bEnableDefaultDisplayWindowFlag = 1;
160
         bError |= sscanf(value, "%d,%d,%d,%d",
161
@@ -845,7 +848,6 @@
162
         p->rc.bStatRead = pass & 2;
163
     }
164
     OPT("stats") p->rc.statFileName = strdup(value);
165
-    OPT("csv") p->csvfn = strdup(value);
166
     OPT("scaling-list") p->scalingLists = strdup(value);
167
     OPT2("pools", "numa-pools") p->numaPools = strdup(value);
168
     OPT("lambda-file") p->rc.lambdaFileName = strdup(value);
169
@@ -864,7 +866,9 @@
170
     return bError ? X265_PARAM_BAD_VALUE : 0;
171
 }
172
 
173
-namespace x265 {
174
+} /* end extern "C" or namespace */
175
+
176
+namespace X265_NS {
177
 // internal encoder functions
178
 
179
 int x265_atoi(const char* str, bool& bError)
180
@@ -877,6 +881,16 @@
181
     return v;
182
 }
183
 
184
+double x265_atof(const char* str, bool& bError)
185
+{
186
+    char *end;
187
+    double v = strtod(str, &end);
188
+
189
+    if (end == str || *end != '\0')
190
+        bError = true;
191
+    return v;
192
+}
193
+
194
 /* cpu name can be:
195
  *   auto || true - x265::cpu_detect()
196
  *   false || no  - disabled
197
@@ -893,7 +907,7 @@
198
     if (isdigit(value[0]))
199
         cpu = x265_atoi(value, bError);
200
     else
201
x265_1.7.tar.gz/source/common/param.h -> x265_1.8.tar.gz/source/common/param.h Changed
48
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -24,7 +25,8 @@
10
 #ifndef X265_PARAM_H
11
 #define X265_PARAM_H
12
 
13
-namespace x265 {
14
+namespace X265_NS {
15
+
16
 int   x265_check_params(x265_param *param);
17
 int   x265_set_globals(x265_param *param);
18
 void  x265_print_params(x265_param *param);
19
@@ -32,13 +34,27 @@
20
 void  x265_param_apply_fastfirstpass(x265_param *p);
21
 char* x265_param2string(x265_param *param);
22
 int   x265_atoi(const char *str, bool& bError);
23
+double x265_atof(const char *str, bool& bError);
24
 int   parseCpuName(const char *value, bool& bError);
25
 void  setParamAspectRatio(x265_param *p, int width, int height);
26
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
27
 bool  parseLambdaFile(x265_param *param);
28
 
29
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
30
-static const char * const logLevelNames[] = { "none", "error", "warning", "info", "frame", "debug", "full", 0 };
31
+static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
32
+
33
+#if EXPORT_C_API
34
+#define PARAM_NS
35
+#else
36
+/* declare param functions within private namespace */
37
+void x265_param_free(x265_param *);
38
+x265_param* x265_param_alloc();
39
+void x265_param_default(x265_param *param);
40
+int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
41
+int x265_param_apply_profile(x265_param *, const char *profile);
42
+int x265_param_parse(x265_param *p, const char *name, const char *value);
43
+#define PARAM_NS X265_NS
44
+#endif
45
 
46
 #define MAXPARAMSIZE 2000
47
 }
48
x265_1.7.tar.gz/source/common/piclist.cpp -> x265_1.8.tar.gz/source/common/piclist.cpp Changed
10
 
1
@@ -25,7 +25,7 @@
2
 #include "piclist.h"
3
 #include "frame.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 void PicList::pushFront(Frame& curFrame)
9
 {
10
x265_1.7.tar.gz/source/common/piclist.h -> x265_1.8.tar.gz/source/common/piclist.h Changed
14
 
1
@@ -24,9 +24,10 @@
2
 #ifndef X265_PICLIST_H
3
 #define X265_PICLIST_H
4
 
5
-#include <cstdlib>
6
+#include "common.h"
7
+
8
+namespace X265_NS {
9
 
10
-namespace x265 {
11
 class Frame;
12
 
13
 class PicList
14
x265_1.7.tar.gz/source/common/picyuv.cpp -> x265_1.8.tar.gz/source/common/picyuv.cpp Changed
147
 
1
@@ -26,7 +26,7 @@
2
 #include "slice.h"
3
 #include "primitives.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 PicYuv::PicYuv()
9
 {
10
@@ -148,52 +148,62 @@
11
     padx++;
12
     pady++;
13
 
14
-    if (pic.bitDepth < X265_DEPTH)
15
-    {
16
-        pixel *yPixel = m_picOrg[0];
17
-        pixel *uPixel = m_picOrg[1];
18
-        pixel *vPixel = m_picOrg[2];
19
+    X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
20
 
21
-        uint8_t *yChar = (uint8_t*)pic.planes[0];
22
-        uint8_t *uChar = (uint8_t*)pic.planes[1];
23
-        uint8_t *vChar = (uint8_t*)pic.planes[2];
24
-        int shift = X265_MAX(0, X265_DEPTH - pic.bitDepth);
25
-
26
-        primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
27
-        primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
28
-        primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
29
-    }
30
-    else if (pic.bitDepth == 8)
31
+    if (pic.bitDepth == 8)
32
     {
33
-        pixel *yPixel = m_picOrg[0];
34
-        pixel *uPixel = m_picOrg[1];
35
-        pixel *vPixel = m_picOrg[2];
36
+#if (X265_DEPTH > 8)
37
+        {
38
+            pixel *yPixel = m_picOrg[0];
39
+            pixel *uPixel = m_picOrg[1];
40
+            pixel *vPixel = m_picOrg[2];
41
+
42
+            uint8_t *yChar = (uint8_t*)pic.planes[0];
43
+            uint8_t *uChar = (uint8_t*)pic.planes[1];
44
+            uint8_t *vChar = (uint8_t*)pic.planes[2];
45
+            int shift = (X265_DEPTH - 8);
46
+
47
+            primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
48
+            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
49
+            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
50
+        }
51
+#else /* Case for (X265_DEPTH == 8) */
52
+        // TODO: Does we need this path? may merge into above in future
53
+        {
54
+            pixel *yPixel = m_picOrg[0];
55
+            pixel *uPixel = m_picOrg[1];
56
+            pixel *vPixel = m_picOrg[2];
57
 
58
-        uint8_t *yChar = (uint8_t*)pic.planes[0];
59
-        uint8_t *uChar = (uint8_t*)pic.planes[1];
60
-        uint8_t *vChar = (uint8_t*)pic.planes[2];
61
+            uint8_t *yChar = (uint8_t*)pic.planes[0];
62
+            uint8_t *uChar = (uint8_t*)pic.planes[1];
63
+            uint8_t *vChar = (uint8_t*)pic.planes[2];
64
 
65
-        for (int r = 0; r < height; r++)
66
-        {
67
-            memcpy(yPixel, yChar, width * sizeof(pixel));
68
+            for (int r = 0; r < height; r++)
69
+            {
70
+                memcpy(yPixel, yChar, width * sizeof(pixel));
71
 
72
-            yPixel += m_stride;
73
-            yChar += pic.stride[0] / sizeof(*yChar);
74
-        }
75
+                yPixel += m_stride;
76
+                yChar += pic.stride[0] / sizeof(*yChar);
77
+            }
78
 
79
-        for (int r = 0; r < height >> m_vChromaShift; r++)
80
-        {
81
-            memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
82
-            memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
83
+            for (int r = 0; r < height >> m_vChromaShift; r++)
84
+            {
85
+                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
86
+                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
87
 
88
-            uPixel += m_strideC;
89
-            vPixel += m_strideC;
90
-            uChar += pic.stride[1] / sizeof(*uChar);
91
-            vChar += pic.stride[2] / sizeof(*vChar);
92
+                uPixel += m_strideC;
93
+                vPixel += m_strideC;
94
+                uChar += pic.stride[1] / sizeof(*uChar);
95
+                vChar += pic.stride[2] / sizeof(*vChar);
96
+            }
97
         }
98
+#endif /* (X265_DEPTH > 8) */
99
     }
100
     else /* pic.bitDepth > 8 */
101
     {
102
+        /* defensive programming, mask off bits that are supposed to be zero */
103
+        uint16_t mask = (1 << X265_DEPTH) - 1;
104
+        int shift = abs(pic.bitDepth - X265_DEPTH);
105
         pixel *yPixel = m_picOrg[0];
106
         pixel *uPixel = m_picOrg[1];
107
         pixel *vPixel = m_picOrg[2];
108
@@ -202,15 +212,20 @@
109
         uint16_t *uShort = (uint16_t*)pic.planes[1];
110
         uint16_t *vShort = (uint16_t*)pic.planes[2];
111
 
112
-        /* defensive programming, mask off bits that are supposed to be zero */
113
-        uint16_t mask = (1 << X265_DEPTH) - 1;
114
-        int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH);
115
-
116
-        /* shift and mask pixels to final size */
117
-
118
-        primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
119
-        primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
120
-        primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
121
+        if (pic.bitDepth > X265_DEPTH)
122
+        {
123
+            /* shift right and mask pixels to final size */
124
+            primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
125
+            primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
126
+            primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
127
+        }
128
+        else /* Case for (pic.bitDepth <= X265_DEPTH) */
129
+        {
130
+            /* shift left and mask pixels to final size */
131
+            primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
132
+            primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
133
+            primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
134
+        }
135
     }
136
 
137
     /* extend the right edge if width was not multiple of the minimum CU size */
138
@@ -259,7 +274,7 @@
139
     }
140
 }
141
 
142
-namespace x265 {
143
+namespace X265_NS {
144
 
145
 template<uint32_t OUTPUT_BITDEPTH_DIV8>
146
 static void md5_block(MD5Context& md5, const pixel* plane, uint32_t n)
147
x265_1.7.tar.gz/source/common/picyuv.h -> x265_1.8.tar.gz/source/common/picyuv.h Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "md5.h"
3
 #include "x265.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class ShortYuv;
10
x265_1.7.tar.gz/source/common/pixel.cpp -> x265_1.8.tar.gz/source/common/pixel.cpp Changed
201
 
1
@@ -30,7 +30,7 @@
2
 
3
 #include <cstdlib> // abs()
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 namespace {
9
 // place functions in anonymous namespace (file static)
10
@@ -117,9 +117,9 @@
11
 }
12
 
13
 template<int lx, int ly, class T1, class T2>
14
-int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
15
+sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
16
 {
17
-    int sum = 0;
18
+    sse_ret_t sum = 0;
19
     int tmp;
20
 
21
     for (int y = 0; y < ly; y++)
22
@@ -159,7 +159,7 @@
23
     return (a + s) ^ s;
24
 }
25
 
26
-int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
27
+static int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
28
 {
29
     sum2_t tmp[4][2];
30
     sum2_t a0, a1, a2, a3, b0, b1;
31
@@ -219,7 +219,7 @@
32
 }
33
 
34
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
35
-int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
36
+static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
37
 {
38
     sum2_t tmp[4][4];
39
     sum2_t a0, a1, a2, a3;
40
@@ -308,7 +308,7 @@
41
     return (int)sum;
42
 }
43
 
44
-int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
45
+inline int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
46
 {
47
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
48
 }
49
@@ -359,12 +359,12 @@
50
     return (int)sum;
51
 }
52
 
53
-int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
54
+static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
55
 {
56
     return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
57
 }
58
 
59
-int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
60
+static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
61
 {
62
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
63
         + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
64
@@ -516,7 +516,7 @@
65
             dst[k * blockSize + l] = src[l * stride + k];
66
 }
67
 
68
-void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
69
+static void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
70
 {
71
     int x, y;
72
 
73
@@ -541,7 +541,7 @@
74
     }
75
 }
76
 
77
-void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
78
+static void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
79
 {
80
     int x, y;
81
 
82
@@ -582,7 +582,7 @@
83
     }
84
 }
85
 
86
-void scale1D_128to64(pixel *dst, const pixel *src)
87
+static void scale1D_128to64(pixel *dst, const pixel *src)
88
 {
89
     int x;
90
     const pixel* src1 = src;
91
@@ -608,7 +608,7 @@
92
     }
93
 }
94
 
95
-void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
96
+static void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
97
 {
98
     uint32_t x, y;
99
 
100
@@ -627,6 +627,7 @@
101
     }
102
 }
103
 
104
+static
105
 void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
106
                             intptr_t src_stride, intptr_t dst_stride, int width, int height)
107
 {
108
@@ -653,7 +654,7 @@
109
 }
110
 
111
 /* structural similarity metric */
112
-void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
113
+static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
114
 {
115
     for (int z = 0; z < 2; z++)
116
     {
117
@@ -681,7 +682,7 @@
118
     }
119
 }
120
 
121
-float ssim_end_1(int s1, int s2, int ss, int s12)
122
+static float ssim_end_1(int s1, int s2, int ss, int s12)
123
 {
124
 /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
125
  * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
126
@@ -689,7 +690,7 @@
127
 
128
 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
129
 #if HIGH_BIT_DEPTH
130
-    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
131
+    X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
132
 #define type float
133
     static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
134
     static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
135
@@ -711,7 +712,7 @@
136
 #undef PIXEL_MAX
137
 }
138
 
139
-float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
140
+static float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
141
 {
142
     float ssim = 0.0;
143
 
144
@@ -920,7 +921,7 @@
145
     }
146
 }
147
 
148
-void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
149
+static void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
150
 {
151
     for (int r = 0; r < height; r++)
152
     {
153
@@ -932,7 +933,7 @@
154
     }
155
 }
156
 
157
-void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
158
+static void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
159
 {
160
     for (int r = 0; r < height; r++)
161
     {
162
@@ -944,9 +945,21 @@
163
     }
164
 }
165
 
166
+static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
167
+{
168
+    for (int r = 0; r < height; r++)
169
+    {
170
+        for (int c = 0; c < width; c++)
171
+            dst[c] = (pixel)((src[c] << shift) & mask);
172
+
173
+        dst += dstStride;
174
+        src += srcStride;
175
+    }
176
+}
177
+
178
 /* Estimate the total amount of influence on future quality that could be had if we
179
  * were to improve the reference samples used to inter predict any given CU. */
180
-void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
181
+static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
182
                              const int32_t* invQscales, const double* fpsFactor, int len)
183
 {
184
     double fps = *fpsFactor / 256;
185
@@ -962,7 +975,7 @@
186
 }
187
 }  // end anonymous namespace
188
 
189
-namespace x265 {
190
+namespace X265_NS {
191
 // x265 private namespace
192
 
193
 /* Extend the edges of a picture so that it may safely be used for motion
194
@@ -1244,6 +1257,7 @@
195
 
196
     p.planecopy_cp = planecopy_cp_c;
197
     p.planecopy_sp = planecopy_sp_c;
198
+    p.planecopy_sp_shl = planecopy_sp_shl_c;
199
     p.propagateCost = estimateCUPropagateCost;
200
 }
201
x265_1.7.tar.gz/source/common/predict.cpp -> x265_1.8.tar.gz/source/common/predict.cpp Changed
46
 
1
@@ -28,7 +28,7 @@
2
 #include "predict.h"
3
 #include "primitives.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #if _MSC_VER
9
 #pragma warning(disable: 4127) // conditional expression is constant
10
@@ -776,30 +776,17 @@
11
         // Fill left & below-left samples
12
         adiTemp += picStride;
13
         adi--;
14
-        pNeighborFlags--;
15
-        for (int j = 0; j < leftUnits; j++)
16
+        // NOTE: over copy here, but reduce condition operators
17
+        for (int j = 0; j < leftUnits * unitHeight; j++)
18
         {
19
-            if (*pNeighborFlags)
20
-                for (int i = 0; i < unitHeight; i++)
21
-                    adi[-i] = adiTemp[i * picStride];
22
-
23
-            adiTemp += unitHeight * picStride;
24
-            adi -= unitHeight;
25
-            pNeighborFlags--;
26
+            adi[-j] = adiTemp[j * picStride];
27
         }
28
 
29
         // Fill above & above-right samples
30
         adiTemp = adiOrigin - picStride;
31
         adi = adiLineBuffer + (leftUnits * unitHeight) + unitWidth;
32
-        pNeighborFlags = bNeighborFlags + leftUnits + 1;
33
-        for (int j = 0; j < aboveUnits; j++)
34
-        {
35
-            if (*pNeighborFlags)
36
-                memcpy(adi, adiTemp, unitWidth * sizeof(*adiTemp));
37
-            adiTemp += unitWidth;
38
-            adi += unitWidth;
39
-            pNeighborFlags++;
40
-        }
41
+        // NOTE: over copy here, but reduce condition operators
42
+        memcpy(adi, adiTemp, aboveUnits * unitWidth * sizeof(*adiTemp));
43
 
44
         // Pad reference samples when necessary
45
         int curr = 0;
46
x265_1.7.tar.gz/source/common/predict.h -> x265_1.8.tar.gz/source/common/predict.h Changed
10
 
1
@@ -30,7 +30,7 @@
2
 #include "shortyuv.h"
3
 #include "yuv.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 
8
 class CUData;
9
 class Slice;
10
x265_1.7.tar.gz/source/common/primitives.cpp -> x265_1.8.tar.gz/source/common/primitives.cpp Changed
155
 
1
@@ -24,7 +24,7 @@
2
 #include "common.h"
3
 #include "primitives.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 extern const uint8_t lumaPartitionMapTable[] =
10
@@ -56,6 +56,7 @@
11
 void setupFilterPrimitives_c(EncoderPrimitives &p);
12
 void setupIntraPrimitives_c(EncoderPrimitives &p);
13
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
14
+void setupSaoPrimitives_c(EncoderPrimitives &p);
15
 
16
 void setupCPrimitives(EncoderPrimitives &p)
17
 {
18
@@ -64,6 +65,7 @@
19
     setupFilterPrimitives_c(p);     // ipfilter.cpp
20
     setupIntraPrimitives_c(p);      // intrapred.cpp
21
     setupLoopFilterPrimitives_c(p); // loopfilter.cpp
22
+    setupSaoPrimitives_c(p);        // sao.cpp
23
 }
24
 
25
 void setupAliasPrimitives(EncoderPrimitives &p)
26
@@ -72,7 +74,7 @@
27
     /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
28
     for (int i = 0; i < NUM_CU_SIZES; i++)
29
     {
30
-        p.cu[i].sse_pp = (pixelcmp_t)p.cu[i].sse_ss;
31
+        p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;
32
 
33
         p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
34
         p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
35
@@ -185,62 +187,36 @@
36
 
37
     p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
38
 }
39
-}
40
-using namespace x265;
41
 
42
-/* cpuid >= 0 - force CPU type
43
- * cpuid < 0  - auto-detect if uninitialized */
44
-void x265_setup_primitives(x265_param *param, int cpuid)
45
+void x265_report_simd(x265_param* param)
46
 {
47
-    if (cpuid < 0)
48
-        cpuid = x265::cpu_detect();
49
-
50
-    // initialize global variables
51
-    if (!primitives.pu[0].sad)
52
-    {
53
-        setupCPrimitives(primitives);
54
-
55
-        /* We do not want the encoder to use the un-optimized intra all-angles
56
-         * C references. It is better to call the individual angle functions
57
-         * instead. We must check for NULL before using this primitive */
58
-        for (int i = 0; i < NUM_TR_SIZE; i++)
59
-            primitives.cu[i].intra_pred_allangs = NULL;
60
-
61
-#if ENABLE_ASSEMBLY
62
-        setupInstrinsicPrimitives(primitives, cpuid);
63
-        setupAssemblyPrimitives(primitives, cpuid);
64
-#else
65
-        x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
66
-#endif
67
-
68
-        setupAliasPrimitives(primitives);
69
-    }
70
-
71
     if (param->logLevel >= X265_LOG_INFO)
72
     {
73
+        int cpuid = param->cpuid;
74
+
75
         char buf[1000];
76
         char *p = buf + sprintf(buf, "using cpu capabilities:");
77
         char *none = p;
78
-        for (int i = 0; x265::cpu_names[i].flags; i++)
79
+        for (int i = 0; X265_NS::cpu_names[i].flags; i++)
80
         {
81
-            if (!strcmp(x265::cpu_names[i].name, "SSE")
82
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE")
83
                 && (cpuid & X265_CPU_SSE2))
84
                 continue;
85
-            if (!strcmp(x265::cpu_names[i].name, "SSE2")
86
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE2")
87
                 && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW)))
88
                 continue;
89
-            if (!strcmp(x265::cpu_names[i].name, "SSE3")
90
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE3")
91
                 && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
92
                 continue;
93
-            if (!strcmp(x265::cpu_names[i].name, "SSE4.1")
94
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1")
95
                 && (cpuid & X265_CPU_SSE42))
96
                 continue;
97
-            if (!strcmp(x265::cpu_names[i].name, "BMI1")
98
+            if (!strcmp(X265_NS::cpu_names[i].name, "BMI1")
99
                 && (cpuid & X265_CPU_BMI2))
100
                 continue;
101
-            if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags
102
-                && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags))
103
-                p += sprintf(p, " %s", x265::cpu_names[i].name);
104
+            if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags
105
+                && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags))
106
+                p += sprintf(p, " %s", X265_NS::cpu_names[i].name);
107
         }
108
 
109
         if (p == none)
110
@@ -249,14 +225,40 @@
111
     }
112
 }
113
 
114
+void x265_setup_primitives(x265_param *param)
115
+{
116
+    if (!primitives.pu[0].sad)
117
+    {
118
+        setupCPrimitives(primitives);
119
+
120
+        /* We do not want the encoder to use the un-optimized intra all-angles
121
+         * C references. It is better to call the individual angle functions
122
+         * instead. We must check for NULL before using this primitive */
123
+        for (int i = 0; i < NUM_TR_SIZE; i++)
124
+            primitives.cu[i].intra_pred_allangs = NULL;
125
+
126
+#if ENABLE_ASSEMBLY
127
+        setupInstrinsicPrimitives(primitives, param->cpuid);
128
+        setupAssemblyPrimitives(primitives, param->cpuid);
129
+#endif
130
+
131
+        setupAliasPrimitives(primitives);
132
+    }
133
+
134
+    x265_report_simd(param);
135
+}
136
+}
137
+
138
 #if ENABLE_ASSEMBLY
139
 /* these functions are implemented in assembly. When assembly is not being
140
  * compiled, they are unnecessary and can be NOPs */
141
 #else
142
 extern "C" {
143
-int x265_cpu_cpuid_test(void) { return 0; }
144
-void x265_cpu_emms(void) {}
145
-void x265_cpu_cpuid(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
146
-void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
147
+int PFX(cpu_cpuid_test)(void) { return 0; }
148
+void PFX(cpu_emms)(void) {}
149
+void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
150
+void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}
151
+void PFX(cpu_neon_test)(void) {}
152
+int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
153
 }
154
 #endif
155
x265_1.7.tar.gz/source/common/primitives.h -> x265_1.8.tar.gz/source/common/primitives.h Changed
108
 
1
@@ -33,7 +33,7 @@
2
 #include "common.h"
3
 #include "cpu.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 enum LumaPU
10
@@ -112,6 +112,8 @@
11
 
12
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
13
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
14
+typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
15
+typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
16
 typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
17
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
18
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
19
@@ -173,6 +175,13 @@
20
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
21
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
22
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
23
+
24
+typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
25
+typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
26
+typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
27
+typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
28
+typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
29
+
30
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
31
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
32
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
33
@@ -182,6 +191,10 @@
34
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
35
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
36
 
37
+typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
38
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
39
+typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
40
+
41
 /* Function pointers to optimized encoder primitives. Each pointer can reference
42
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
43
 struct EncoderPrimitives
44
@@ -242,8 +255,9 @@
45
         copy_pp_t       copy_pp;       // alias to pu[].copy_pp
46
 
47
         var_t           var;           // block internal variance
48
-        pixelcmp_t      sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
49
-        pixelcmp_ss_t   sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
50
+
51
+        pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
52
+        pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
53
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
54
         pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
55
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
56
@@ -289,12 +303,19 @@
57
     saoCuOrgE3_t          saoCuOrgE3[2];
58
     saoCuOrgB0_t          saoCuOrgB0;
59
 
60
+    saoCuStatsBO_t        saoCuStatsBO;
61
+    saoCuStatsE0_t        saoCuStatsE0;
62
+    saoCuStatsE1_t        saoCuStatsE1;
63
+    saoCuStatsE2_t        saoCuStatsE2;
64
+    saoCuStatsE3_t        saoCuStatsE3;
65
+
66
     downscale_t           frameInitLowres;
67
     cutree_propagate_cost propagateCost;
68
 
69
     extendCURowBorder_t   extendRowBorder;
70
     planecopy_cp_t        planecopy_cp;
71
     planecopy_sp_t        planecopy_sp;
72
+    planecopy_sp_t        planecopy_sp_shl;
73
 
74
     weightp_sp_t          weight_sp;
75
     weightp_pp_t          weight_pp;
76
@@ -303,6 +324,11 @@
77
     scanPosLast_t         scanPosLast;
78
     findPosFirstLast_t    findPosFirstLast;
79
 
80
+    costCoeffNxN_t        costCoeffNxN;
81
+    costCoeffRemain_t     costCoeffRemain;
82
+    costC1C2Flag_t        costC1C2Flag;
83
+
84
+
85
     /* There is one set of chroma primitives per color space. An encoder will
86
      * have just a single color space and thus it will only ever use one entry
87
      * in this array. However we always fill all entries in the array in case
88
@@ -335,7 +361,7 @@
89
         struct CUChroma
90
         {
91
             pixelcmp_t     sa8d;    // if chroma CU is not multiple of 8x8, will use satd
92
-            pixelcmp_t     sse_pp;
93
+            pixel_sse_t    sse_pp;
94
             pixel_sub_ps_t sub_ps;
95
             pixel_add_ps_t add_ps;
96
 
97
@@ -377,4 +403,10 @@
98
 void setupAliasPrimitives(EncoderPrimitives &p);
99
 }
100
 
101
+#if !EXPORT_C_API
102
+extern const int   PFX(max_bit_depth);
103
+extern const char* PFX(version_str);
104
+extern const char* PFX(build_info_str);
105
+#endif
106
+
107
 #endif // ifndef X265_PRIMITIVES_H
108
x265_1.7.tar.gz/source/common/quant.cpp -> x265_1.8.tar.gz/source/common/quant.cpp Changed
201
 
1
@@ -30,7 +30,7 @@
2
 #include "cudata.h"
3
 #include "contexts.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
9
 
10
@@ -204,7 +204,6 @@
11
     m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
12
     m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
13
     m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
14
-    m_tqBypass = false;
15
 
16
     return m_resiDctCoeff && m_fencShortBuf;
17
 }
18
@@ -228,9 +227,6 @@
19
 
20
 void Quant::setQPforQuant(const CUData& ctu, int qp)
21
 {
22
-    m_tqBypass = !!ctu.m_tqBypass[0];
23
-    if (m_tqBypass)
24
-        return;
25
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
26
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
27
     setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
28
@@ -251,30 +247,63 @@
29
 }
30
 
31
 /* To minimize the distortion only. No rate is considered */
32
-uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
33
+uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams, uint32_t log2TrSize)
34
 {
35
-    const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
36
+    uint32_t trSize = 1 << log2TrSize;
37
     const uint16_t* scan = codeParams.scan;
38
-    bool lastCG = true;
39
 
40
-    for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
41
+    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
42
+    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
43
+    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
44
+
45
+#if CHECKED_BUILD || _DEBUG
46
+    // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
47
+    memset(coeffNum, 0, sizeof(coeffNum));
48
+    memset(coeffSign, 0, sizeof(coeffNum));
49
+    memset(coeffFlag, 0, sizeof(coeffNum));
50
+#endif
51
+    const int lastScanPos = primitives.scanPosLast(codeParams.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
52
+    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
53
+    unsigned long tmp;
54
+
55
+    // first CG need specially processing
56
+    const uint32_t correctOffset = 0x0F & (lastScanPos ^ 0xF);
57
+    coeffFlag[cgLastScanPos] <<= correctOffset;
58
+
59
+    for (int cg = cgLastScanPos; cg >= 0; cg--)
60
     {
61
         int cgStartPos = cg << LOG2_SCAN_SET_SIZE;
62
         int n;
63
 
64
+#if CHECKED_BUILD || _DEBUG
65
         for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
66
             if (coeff[scan[n + cgStartPos]])
67
                 break;
68
-        if (n < 0)
69
-            continue;
70
+        int lastNZPosInCG0 = n;
71
+#endif
72
 
73
-        int lastNZPosInCG = n;
74
+        if (coeffNum[cg] == 0)
75
+        {
76
+            X265_CHECK(lastNZPosInCG0 < 0, "all zero block check failure\n");
77
+            continue;
78
+        }
79
 
80
+#if CHECKED_BUILD || _DEBUG
81
         for (n = 0;; n++)
82
             if (coeff[scan[n + cgStartPos]])
83
                 break;
84
 
85
-        int firstNZPosInCG = n;
86
+        int firstNZPosInCG0 = n;
87
+#endif
88
+
89
+        CLZ(tmp, coeffFlag[cg]);
90
+        const int firstNZPosInCG = (15 ^ tmp);
91
+
92
+        CTZ(tmp, coeffFlag[cg]);
93
+        const int lastNZPosInCG = (15 ^ tmp);
94
+
95
+        X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n");
96
+        X265_CHECK(lastNZPosInCG0 == lastNZPosInCG, "lastNZPosInCG0 check failure\n");
97
 
98
         if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
99
         {
100
@@ -287,12 +316,17 @@
101
             if (signbit != (absSum & 0x1)) // compare signbit with sum_parity
102
             {
103
                 int minCostInc = MAX_INT,  minPos = -1, curCost = MAX_INT;
104
-                int16_t finalChange = 0, curChange = 0;
105
+                int32_t finalChange = 0, curChange = 0;
106
+                uint32_t cgFlags = coeffFlag[cg];
107
+                if (cg == cgLastScanPos)
108
+                    cgFlags >>= correctOffset;
109
 
110
-                for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
111
+                for (n = (cg == cgLastScanPos ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
112
                 {
113
                     uint32_t blkPos = scan[n + cgStartPos];
114
-                    if (coeff[blkPos])
115
+                    X265_CHECK(!!coeff[blkPos] == !!(cgFlags & 1), "non zero coeff check failure\n");
116
+
117
+                    if (cgFlags & 1)
118
                     {
119
                         if (deltaU[blkPos] > 0)
120
                         {
121
@@ -301,8 +335,11 @@
122
                         }
123
                         else
124
                         {
125
-                            if (n == firstNZPosInCG && abs(coeff[blkPos]) == 1)
126
+                            if ((cgFlags == 1) && (abs(coeff[blkPos]) == 1))
127
+                            {
128
+                                X265_CHECK(n == firstNZPosInCG, "firstNZPosInCG position check failure\n");
129
                                 curCost = MAX_INT;
130
+                            }
131
                             else
132
                             {
133
                                 curCost = deltaU[blkPos];
134
@@ -312,8 +349,9 @@
135
                     }
136
                     else
137
                     {
138
-                        if (n < firstNZPosInCG)
139
+                        if (cgFlags == 0)
140
                         {
141
+                            X265_CHECK(n < firstNZPosInCG, "firstNZPosInCG position check failure\n");
142
                             uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1;
143
                             if (thisSignBit != signbit)
144
                                 curCost = MAX_INT;
145
@@ -336,6 +374,7 @@
146
                         finalChange = curChange;
147
                         minPos = blkPos;
148
                     }
149
+                    cgFlags>>=1;
150
                 }
151
 
152
                 /* do not allow change to violate coeff clamp */
153
@@ -347,14 +386,12 @@
154
                 else if (finalChange == -1 && abs(coeff[minPos]) == 1)
155
                     numSig--;
156
 
157
-                if (m_resiDctCoeff[minPos] >= 0)
158
-                    coeff[minPos] += finalChange;
159
-                else
160
-                    coeff[minPos] -= finalChange;
161
+                {
162
+                    const int16_t sigMask = ((int16_t)m_resiDctCoeff[minPos]) >> 15;
163
+                    coeff[minPos] += ((int16_t)finalChange ^ sigMask) - sigMask;
164
+                }
165
             }
166
         }
167
-
168
-        lastCG = false;
169
     }
170
 
171
     return numSig;
172
@@ -364,7 +401,8 @@
173
                              coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
174
 {
175
     const uint32_t sizeIdx = log2TrSize - 2;
176
-    if (m_tqBypass)
177
+
178
+    if (cu.m_tqBypass[0])
179
     {
180
         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
181
         return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
182
@@ -437,18 +475,19 @@
183
         {
184
             TUEntropyCodingParameters codeParams;
185
             cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma);
186
-            return signBitHidingHDQ(coeff, deltaU, numSig, codeParams);
187
+            return signBitHidingHDQ(coeff, deltaU, numSig, codeParams, log2TrSize);
188
         }
189
         else
190
             return numSig;
191
     }
192
 }
193
 
194
-void Quant::invtransformNxN(int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
195
+void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
196
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
197
 {
198
     const uint32_t sizeIdx = log2TrSize - 2;
199
-    if (m_tqBypass)
200
+
201
x265_1.7.tar.gz/source/common/quant.h -> x265_1.8.tar.gz/source/common/quant.h Changed
84
 
1
@@ -28,7 +28,7 @@
2
 #include "scalinglist.h"
3
 #include "contexts.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class CUData;
10
@@ -41,7 +41,7 @@
11
     int per;
12
     int qp;
13
     int64_t lambda2; /* FIX8 */
14
-    int32_t lambda;  /* FIX8, dynamic range is 18-bits in 8bpp and 20-bits in 16bpp */
15
+    int32_t lambda;  /* FIX8, dynamic range is 18-bits in Main and 20-bits in Main10 */
16
 
17
     QpParam() : qp(MAX_INT) {}
18
 
19
@@ -68,9 +68,9 @@
20
     /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
21
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
22
      * Intra 0..7 - Inter 8..15 */
23
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
24
-    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
25
+    ALIGN_VAR_16(uint32_t, residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
26
     uint32_t count[MAX_NUM_TR_CATEGORIES];
27
+    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
28
 };
29
 
30
 class Quant
31
@@ -94,7 +94,6 @@
32
 
33
     NoiseReduction*    m_nr;
34
     NoiseReduction*    m_frameNr; // Array of NR structures, one for each frameEncoder
35
-    bool               m_tqBypass;
36
 
37
     Quant();
38
     ~Quant();
39
@@ -109,7 +108,7 @@
40
     uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
41
                           uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
42
 
43
-    void invtransformNxN(int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
44
+    void invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
45
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
46
 
47
     /* Pattern decision for context derivation process of significant_coeff_flag */
48
@@ -126,9 +125,9 @@
49
         const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (cgBlkPos + 1)); // just need lowest 7-bits valid
50
 
51
         // TODO: instruction BT is faster, but _bittest64 still generate instruction 'BT m, r' in VS2012
52
-        const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1);
53
-        const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2;
54
-        return sigRight + sigLower;
55
+        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
56
+        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
57
+        return sigRight + sigLower * 2;
58
     }
59
 
60
     /* Context derivation process of coeff_abs_significant_flag */
61
@@ -137,10 +136,10 @@
62
         X265_CHECK(cgBlkPos < 64, "cgBlkPos is too large\n");
63
         // NOTE: unsafe shift operator, see NOTE in calcPatternSigCtx
64
         const uint32_t sigPos = (uint32_t)(cgGroupMask >> (cgBlkPos + 1)); // just need lowest 8-bits valid
65
-        const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
66
-        const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
67
+        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
68
+        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
69
 
70
-        return (sigRight | sigLower) & 1;
71
+        return (sigRight | sigLower);
72
     }
73
 
74
     /* static methods shared with entropy.cpp */
75
@@ -150,7 +149,7 @@
76
 
77
     void setChromaQP(int qpin, TextType ttype, int chFmt);
78
 
79
-    uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
80
+    uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
81
 
82
     uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
83
 };
84
x265_1.7.tar.gz/source/common/scalinglist.cpp -> x265_1.8.tar.gz/source/common/scalinglist.cpp Changed
37
 
1
@@ -80,7 +80,7 @@
2
     },
3
 };
4
 
5
-int quantTSDefault4x4[16] =
6
+static int quantTSDefault4x4[16] =
7
 {
8
     16, 16, 16, 16,
9
     16, 16, 16, 16,
10
@@ -88,7 +88,7 @@
11
     16, 16, 16, 16
12
 };
13
 
14
-int quantIntraDefault8x8[64] =
15
+static int quantIntraDefault8x8[64] =
16
 {
17
     16, 16, 16, 16, 17, 18, 21, 24,
18
     16, 16, 16, 16, 17, 19, 22, 25,
19
@@ -100,7 +100,7 @@
20
     24, 25, 29, 36, 47, 65, 88, 115
21
 };
22
 
23
-int quantInterDefault8x8[64] =
24
+static int quantInterDefault8x8[64] =
25
 {
26
     16, 16, 16, 16, 17, 18, 20, 24,
27
     16, 16, 16, 17, 18, 20, 24, 25,
28
@@ -114,7 +114,7 @@
29
 
30
 }
31
 
32
-namespace x265 {
33
+namespace X265_NS {
34
 // private namespace
35
 
36
 const int     ScalingList::s_numCoefPerSize[NUM_SIZES] = { 16, 64, 256, 1024 };
37
x265_1.7.tar.gz/source/common/scalinglist.h -> x265_1.8.tar.gz/source/common/scalinglist.h Changed
10
 
1
@@ -26,7 +26,7 @@
2
 
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class ScalingList
10
x265_1.7.tar.gz/source/common/shortyuv.cpp -> x265_1.8.tar.gz/source/common/shortyuv.cpp Changed
10
 
1
@@ -28,7 +28,7 @@
2
 
3
 #include "x265.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 ShortYuv::ShortYuv()
9
 {
10
x265_1.7.tar.gz/source/common/shortyuv.h -> x265_1.8.tar.gz/source/common/shortyuv.h Changed
10
 
1
@@ -28,7 +28,7 @@
2
 
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class Yuv;
10
x265_1.7.tar.gz/source/common/slice.cpp -> x265_1.8.tar.gz/source/common/slice.cpp Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include "picyuv.h"
3
 #include "slice.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 void Slice::setRefPicList(PicList& picList)
9
 {
10
x265_1.7.tar.gz/source/common/slice.h -> x265_1.8.tar.gz/source/common/slice.h Changed
18
 
1
@@ -26,7 +26,7 @@
2
 
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class Frame;
10
@@ -111,6 +111,7 @@
11
     bool     frameOnlyConstraintFlag;
12
     bool     profileCompatibilityFlag[32];
13
     bool     intraConstraintFlag;
14
+    bool     onePictureOnlyConstraintFlag;
15
     bool     lowerBitRateConstraintFlag;
16
     int      profileIdc;
17
     int      levelIdc;
18
x265_1.7.tar.gz/source/common/threading.cpp -> x265_1.8.tar.gz/source/common/threading.cpp Changed
98
 
1
@@ -21,21 +21,73 @@
2
  * For more information, contact us at license @ x265.com
3
  *****************************************************************************/
4
 
5
+#include "common.h"
6
 #include "threading.h"
7
+#include "cpu.h"
8
 
9
-namespace x265 {
10
+namespace X265_NS {
11
 // x265 private namespace
12
 
13
 #if X265_ARCH_X86 && !defined(X86_64) && ENABLE_ASSEMBLY && defined(__GNUC__)
14
-extern "C" intptr_t x265_stack_align(void (*func)(), ...);
15
-#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__)
16
+extern "C" intptr_t PFX(stack_align)(void (*func)(), ...);
17
+#define STACK_ALIGN(func, ...) PFX(stack_align)((void (*)())func, __VA_ARGS__)
18
 #else
19
-#define x265_stack_align(func, ...) func(__VA_ARGS__)
20
+#define STACK_ALIGN(func, ...) func(__VA_ARGS__)
21
+#endif
22
+
23
+#if NO_ATOMICS
24
+pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
25
+
26
+int no_atomic_or(int* ptr, int mask)
27
+{ 
28
+    pthread_mutex_lock(&g_mutex);
29
+    int ret = *ptr;
30
+    *ptr |= mask;
31
+    pthread_mutex_unlock(&g_mutex);
32
+    return ret;
33
+}
34
+
35
+int no_atomic_and(int* ptr, int mask)
36
+{
37
+    pthread_mutex_lock(&g_mutex);
38
+    int ret = *ptr;
39
+    *ptr &= mask;
40
+    pthread_mutex_unlock(&g_mutex);
41
+    return ret;
42
+}
43
+
44
+int no_atomic_inc(int* ptr)
45
+{
46
+    pthread_mutex_lock(&g_mutex);
47
+    *ptr += 1;
48
+    int ret = *ptr;
49
+    pthread_mutex_unlock(&g_mutex);
50
+    return ret;
51
+}
52
+
53
+int no_atomic_dec(int* ptr)
54
+{
55
+    pthread_mutex_lock(&g_mutex);
56
+    *ptr -= 1;
57
+    int ret = *ptr;
58
+    pthread_mutex_unlock(&g_mutex);
59
+    return ret;
60
+}
61
+
62
+int no_atomic_add(int* ptr, int val)
63
+{
64
+    pthread_mutex_lock(&g_mutex);
65
+    *ptr += val;
66
+    int ret = *ptr;
67
+    pthread_mutex_unlock(&g_mutex);
68
+    return ret;
69
+}
70
 #endif
71
 
72
 /* C shim for forced stack alignment */
73
 static void stackAlignMain(Thread *instance)
74
 {
75
+    // defer processing to the virtual function implemented in the derived class
76
     instance->threadMain();
77
 }
78
 
79
@@ -43,8 +95,7 @@
80
 
81
 static DWORD WINAPI ThreadShim(Thread *instance)
82
 {
83
-    // defer processing to the virtual function implemented in the derived class
84
-    x265_stack_align(stackAlignMain, instance);
85
+    STACK_ALIGN(stackAlignMain, instance);
86
 
87
     return 0;
88
 }
89
@@ -77,7 +128,7 @@
90
     // defer processing to the virtual function implemented in the derived class
91
     Thread *instance = reinterpret_cast<Thread *>(opaque);
92
 
93
-    x265_stack_align(stackAlignMain, instance);
94
+    STACK_ALIGN(stackAlignMain, instance);
95
 
96
     return NULL;
97
 }
98
x265_1.7.tar.gz/source/common/threading.h -> x265_1.8.tar.gz/source/common/threading.h Changed
50
 
1
@@ -42,7 +42,30 @@
2
 #include <sys/sysctl.h>
3
 #endif
4
 
5
-#ifdef __GNUC__               /* GCCs builtin atomics */
6
+#if NO_ATOMICS
7
+
8
+#include <sys/time.h>
9
+#include <unistd.h>
10
+
11
+namespace X265_NS {
12
+// x265 private namespace
13
+int no_atomic_or(int* ptr, int mask);
14
+int no_atomic_and(int* ptr, int mask);
15
+int no_atomic_inc(int* ptr);
16
+int no_atomic_dec(int* ptr);
17
+int no_atomic_add(int* ptr, int val);
18
+}
19
+
20
+#define CLZ(id, x)            id = (unsigned long)__builtin_clz(x) ^ 31
21
+#define CTZ(id, x)            id = (unsigned long)__builtin_ctz(x)
22
+#define ATOMIC_OR(ptr, mask)  no_atomic_or((int*)ptr, mask)
23
+#define ATOMIC_AND(ptr, mask) no_atomic_and((int*)ptr, mask)
24
+#define ATOMIC_INC(ptr)       no_atomic_inc((int*)ptr)
25
+#define ATOMIC_DEC(ptr)       no_atomic_dec((int*)ptr)
26
+#define ATOMIC_ADD(ptr, val)  no_atomic_add((int*)ptr, val)
27
+#define GIVE_UP_TIME()        usleep(0)
28
+
29
+#elif __GNUC__               /* GCCs builtin atomics */
30
 
31
 #include <sys/time.h>
32
 #include <unistd.h>
33
@@ -71,7 +94,7 @@
34
 
35
 #endif // ifdef __GNUC__
36
 
37
-namespace x265 {
38
+namespace X265_NS {
39
 // x265 private namespace
40
 
41
 #ifdef _WIN32
42
@@ -463,6 +486,6 @@
43
 
44
     void stop();
45
 };
46
-} // end namespace x265
47
+} // end namespace X265_NS
48
 
49
 #endif // ifndef X265_THREADING_H
50
x265_1.7.tar.gz/source/common/threadpool.cpp -> x265_1.8.tar.gz/source/common/threadpool.cpp Changed
25
 
1
@@ -60,7 +60,7 @@
2
 #include <numa.h>
3
 #endif
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 class WorkerThread : public Thread
10
@@ -310,7 +310,7 @@
11
     ThreadPool *pools = new ThreadPool[numPools];
12
     if (pools)
13
     {
14
-        int maxProviders = (p->frameNumThreads + 1 + numPools - 1) / numPools; /* +1 is Lookahead */
15
+        int maxProviders = (p->frameNumThreads + numPools - 1) / numPools + 1; /* +1 is Lookahead, always assigned to threadpool 0 */
16
         int node = 0;
17
         for (int i = 0; i < numPools; i++)
18
         {
19
@@ -480,4 +480,4 @@
20
 #endif
21
 }
22
 
23
-} // end namespace x265
24
+} // end namespace X265_NS
25
x265_1.7.tar.gz/source/common/threadpool.h -> x265_1.8.tar.gz/source/common/threadpool.h Changed
27
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "threading.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 class ThreadPool;
10
@@ -113,7 +113,7 @@
11
  * called. If it returns non-zero then some number of slave worker threads are
12
  * already in the process of calling your processTasks() function. The master
13
  * thread should participate and call processTasks() itself. When
14
- * waitForExit() returns, all bonded peer threads are quarunteed to have
15
+ * waitForExit() returns, all bonded peer threads are guaranteed to have
16
  * exitied processTasks(). Since the thread count is small, it uses explicit
17
  * locking instead of atomic counters and bitmasks */
18
 class BondedTaskGroup
19
@@ -167,6 +167,6 @@
20
     virtual void processTasks(int workerThreadId) = 0;
21
 };
22
 
23
-} // end namespace x265
24
+} // end namespace X265_NS
25
 
26
 #endif // ifndef X265_THREADPOOL_H
27
x265_1.7.tar.gz/source/common/vec/dct-sse3.cpp -> x265_1.8.tar.gz/source/common/vec/dct-sse3.cpp Changed
63
 
1
@@ -33,19 +33,13 @@
2
 #include <xmmintrin.h> // SSE
3
 #include <pmmintrin.h> // SSE3
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
-namespace {
9
 #define SHIFT1  7
10
 #define ADD1    64
11
 
12
-#if HIGH_BIT_DEPTH
13
-#define SHIFT2  10
14
-#define ADD2    512
15
-#else
16
-#define SHIFT2  12
17
-#define ADD2    2048
18
-#endif
19
+#define SHIFT2  (12 - (X265_DEPTH - 8))
20
+#define ADD2    (1 << ((SHIFT2) - 1))
21
 
22
 ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
23
 {
24
@@ -62,7 +56,8 @@
25
     {  83,  36,  83,  36, 83,  36, 83,  36 },
26
     {  36, -83,  36, -83, 36, -83, 36, -83 }
27
 };
28
-void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
29
+
30
+static void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
31
 {
32
     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
33
     __m128i T00, T01, T02, T03, T04, T05, T06, T07;
34
@@ -299,7 +294,7 @@
35
     _mm_storeh_pi((__m64*)&dst[7 * stride +  4], _mm_castsi128_ps(T11));
36
 }
37
 
38
-void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
39
+static void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
40
 {
41
 #define READ_UNPACKHILO(offset)\
42
     const __m128i T_00_00A = _mm_unpacklo_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);\
43
@@ -677,7 +672,7 @@
44
 #undef UNPACKHILO
45
 #undef READ_UNPACKHILO
46
 
47
-void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
48
+static void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
49
 {
50
     //Odd
51
     const __m128i c16_p90_p90   = _mm_set1_epi32(0x005A005A); //column 0
52
@@ -1418,9 +1413,7 @@
53
     }
54
 }
55
 
56
-}
57
-
58
-namespace x265 {
59
+namespace X265_NS {
60
 void setupIntrinsicDCT_sse3(EncoderPrimitives &p)
61
 {
62
     /* Note: We have AVX2 assembly for these functions, but since AVX2 is still
63
x265_1.7.tar.gz/source/common/vec/dct-sse41.cpp -> x265_1.8.tar.gz/source/common/vec/dct-sse41.cpp Changed
25
 
1
@@ -33,10 +33,9 @@
2
 #include <xmmintrin.h> // SSE
3
 #include <smmintrin.h> // SSE4.1
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
-namespace {
9
-void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
10
+static void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
11
 {
12
     X265_CHECK(num <= 32 * 32, "dequant num too large\n");
13
 
14
@@ -100,9 +99,8 @@
15
         }
16
     }
17
 }
18
-}
19
 
20
-namespace x265 {
21
+namespace X265_NS {
22
 void setupIntrinsicDCT_sse41(EncoderPrimitives &p)
23
 {
24
     p.dequant_scaling = dequant_scaling;
25
x265_1.7.tar.gz/source/common/vec/dct-ssse3.cpp -> x265_1.8.tar.gz/source/common/vec/dct-ssse3.cpp Changed
201
 
1
@@ -34,9 +34,20 @@
2
 #include <pmmintrin.h> // SSE3
3
 #include <tmmintrin.h> // SSSE3
4
 
5
-using namespace x265;
6
+#define DCT16_SHIFT1  (3 + X265_DEPTH - 8)
7
+#define DCT16_ADD1    (1 << ((DCT16_SHIFT1) - 1))
8
+
9
+#define DCT16_SHIFT2  10
10
+#define DCT16_ADD2    (1 << ((DCT16_SHIFT2) - 1))
11
+
12
+#define DCT32_SHIFT1  (DCT16_SHIFT1 + 1)
13
+#define DCT32_ADD1    (1 << ((DCT32_SHIFT1) - 1))
14
+
15
+#define DCT32_SHIFT2  (DCT16_SHIFT2 + 1)
16
+#define DCT32_ADD2    (1 << ((DCT32_SHIFT2) - 1))
17
+
18
+using namespace X265_NS;
19
 
20
-namespace {
21
 ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
22
 {
23
     { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },
24
@@ -99,22 +110,11 @@
25
 #undef MAKE_COEF
26
 };
27
 
28
-void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
29
+static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
30
 {
31
-#if HIGH_BIT_DEPTH
32
-#define SHIFT1  5
33
-#define ADD1    16
34
-#else
35
-#define SHIFT1  3
36
-#define ADD1    4
37
-#endif
38
-
39
-#define SHIFT2  10
40
-#define ADD2    512
41
-
42
     // Const
43
-    __m128i c_4     = _mm_set1_epi32(ADD1);
44
-    __m128i c_512   = _mm_set1_epi32(ADD2);
45
+    __m128i c_4     = _mm_set1_epi32(DCT16_ADD1);
46
+    __m128i c_512   = _mm_set1_epi32(DCT16_ADD2);
47
 
48
     int i;
49
 
50
@@ -202,29 +202,29 @@
51
 
52
         T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
53
         T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
54
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
55
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
56
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
57
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
58
         T70  = _mm_packs_epi32(T60, T61);
59
         _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
60
 
61
         T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
62
         T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
63
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
64
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
65
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
66
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
67
         T70  = _mm_packs_epi32(T60, T61);
68
         _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
69
 
70
         T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
71
         T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
72
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
73
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
74
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
75
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
76
         T70  = _mm_packs_epi32(T60, T61);
77
         _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
78
 
79
         T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
80
         T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
81
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
82
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
83
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
84
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
85
         T70  = _mm_packs_epi32(T60, T61);
86
         _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
87
 
88
@@ -234,8 +234,8 @@
89
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
90
         T60  = _mm_hadd_epi32(T60, T61);
91
         T61  = _mm_hadd_epi32(T62, T63);
92
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
93
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
94
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
95
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
96
         T70  = _mm_packs_epi32(T60, T61);
97
         _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
98
 
99
@@ -245,8 +245,8 @@
100
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
101
         T60  = _mm_hadd_epi32(T60, T61);
102
         T61  = _mm_hadd_epi32(T62, T63);
103
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
104
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
105
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
106
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
107
         T70  = _mm_packs_epi32(T60, T61);
108
         _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
109
 
110
@@ -256,8 +256,8 @@
111
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
112
         T60  = _mm_hadd_epi32(T60, T61);
113
         T61  = _mm_hadd_epi32(T62, T63);
114
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
115
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
116
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
117
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
118
         T70  = _mm_packs_epi32(T60, T61);
119
         _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
120
 
121
@@ -267,8 +267,8 @@
122
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
123
         T60  = _mm_hadd_epi32(T60, T61);
124
         T61  = _mm_hadd_epi32(T62, T63);
125
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
126
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
127
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
128
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
129
         T70  = _mm_packs_epi32(T60, T61);
130
         _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
131
 
132
@@ -287,8 +287,8 @@
133
     T63  = _mm_hadd_epi32(T66, T67); \
134
     T60  = _mm_hadd_epi32(T60, T61); \
135
     T61  = _mm_hadd_epi32(T62, T63); \
136
-    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
137
-    T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
138
+    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1); \
139
+    T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1); \
140
     T70  = _mm_packs_epi32(T60, T61); \
141
     _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
142
 
143
@@ -352,8 +352,8 @@
144
 
145
         T40  = _mm_hadd_epi32(T30, T31);
146
         T41  = _mm_hsub_epi32(T30, T31);
147
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
148
-        T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
149
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
150
+        T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), DCT16_SHIFT2);
151
         T40  = _mm_packs_epi32(T40, T40);
152
         T41  = _mm_packs_epi32(T41, T41);
153
         _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
154
@@ -377,7 +377,7 @@
155
         T31  = _mm_hadd_epi32(T32, T33);
156
 
157
         T40  = _mm_hadd_epi32(T30, T31);
158
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
159
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
160
         T40  = _mm_packs_epi32(T40, T40);
161
         _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
162
 
163
@@ -399,7 +399,7 @@
164
         T31  = _mm_hadd_epi32(T32, T33);
165
 
166
         T40  = _mm_hadd_epi32(T30, T31);
167
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
168
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
169
         T40  = _mm_packs_epi32(T40, T40);
170
         _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
171
 
172
@@ -421,7 +421,7 @@
173
         T31  = _mm_hadd_epi32(T32, T33);
174
 
175
         T40  = _mm_hadd_epi32(T30, T31);
176
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
177
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
178
         T40  = _mm_packs_epi32(T40, T40);
179
         _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
180
 
181
@@ -443,7 +443,7 @@
182
         T31  = _mm_hadd_epi32(T32, T33);
183
 
184
         T40  = _mm_hadd_epi32(T30, T31);
185
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
186
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
187
         T40  = _mm_packs_epi32(T40, T40);
188
         _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
189
 
190
@@ -465,7 +465,7 @@
191
         T31  = _mm_hadd_epi32(T32, T33);
192
 
193
         T40  = _mm_hadd_epi32(T30, T31);
194
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
195
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
196
         T40  = _mm_packs_epi32(T40, T40);
197
         _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
198
 
199
@@ -487,7 +487,7 @@
200
         T31  = _mm_hadd_epi32(T32, T33);
201
x265_1.7.tar.gz/source/common/vec/vec-primitives.cpp -> x265_1.8.tar.gz/source/common/vec/vec-primitives.cpp Changed
26
 
1
@@ -32,12 +32,13 @@
2
 #define HAVE_SSE4
3
 #define HAVE_AVX2
4
 #elif defined(__GNUC__)
5
-#if __clang__ || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 3)
6
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
7
+#if __clang__ || GCC_VERSION >= 40300 /* gcc_version >= gcc-4.3.0 */
8
 #define HAVE_SSE3
9
 #define HAVE_SSSE3
10
 #define HAVE_SSE4
11
 #endif
12
-#if __clang__ || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 7)
13
+#if __clang__ || GCC_VERSION >= 40700 /* gcc_version >= gcc-4.7.0 */
14
 #define HAVE_AVX2
15
 #endif
16
 #elif defined(_MSC_VER)
17
@@ -50,7 +51,7 @@
18
 #endif // compiler checks
19
 #endif // if X265_ARCH_X86
20
 
21
-namespace x265 {
22
+namespace X265_NS {
23
 // private x265 namespace
24
 
25
 void setupIntrinsicDCT_sse3(EncoderPrimitives&);
26
x265_1.7.tar.gz/source/common/version.cpp -> x265_1.8.tar.gz/source/common/version.cpp Changed
138
 
1
@@ -23,71 +23,109 @@
2
 
3
 #include "x265.h"
4
 #include "common.h"
5
+#include "primitives.h"
6
 
7
 #define XSTR(x) STR(x)
8
 #define STR(x) #x
9
 
10
 #if defined(__clang__)
11
-#define NVM_COMPILEDBY  "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]"
12
+#define COMPILEDBY  "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]"
13
 #ifdef __IA64__
14
-#define NVM_ONARCH    "[on 64-bit] "
15
+#define ONARCH    "[on 64-bit] "
16
 #else
17
-#define NVM_ONARCH    "[on 32-bit] "
18
+#define ONARCH    "[on 32-bit] "
19
 #endif
20
 #endif
21
 
22
 #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
23
-#define NVM_COMPILEDBY  "[GCC " XSTR(__GNUC__) "." XSTR(__GNUC_MINOR__) "." XSTR(__GNUC_PATCHLEVEL__) "]"
24
+#define COMPILEDBY  "[GCC " XSTR(__GNUC__) "." XSTR(__GNUC_MINOR__) "." XSTR(__GNUC_PATCHLEVEL__) "]"
25
 #ifdef __IA64__
26
-#define NVM_ONARCH    "[on 64-bit] "
27
+#define ONARCH    "[on 64-bit] "
28
 #else
29
-#define NVM_ONARCH    "[on 32-bit] "
30
+#define ONARCH    "[on 32-bit] "
31
 #endif
32
 #endif
33
 
34
 #ifdef __INTEL_COMPILER
35
-#define NVM_COMPILEDBY  "[ICC " XSTR(__INTEL_COMPILER) "]"
36
+#define COMPILEDBY "[ICC " XSTR(__INTEL_COMPILER) "]"
37
 #elif  _MSC_VER
38
-#define NVM_COMPILEDBY  "[MSVC " XSTR(_MSC_VER) "]"
39
+#define COMPILEDBY "[MSVC " XSTR(_MSC_VER) "]"
40
 #endif
41
 
42
-#ifndef NVM_COMPILEDBY
43
-#define NVM_COMPILEDBY "[Unk-CXX]"
44
+#ifndef COMPILEDBY
45
+#define COMPILEDBY "[Unk-CXX]"
46
 #endif
47
 
48
 #ifdef _WIN32
49
-#define NVM_ONOS        "[Windows]"
50
+#define ONOS    "[Windows]"
51
 #elif  __linux
52
-#define NVM_ONOS        "[Linux]"
53
+#define ONOS    "[Linux]"
54
 #elif __OpenBSD__
55
-#define NVM_ONOS        "[OpenBSD]"
56
+#define ONOS    "[OpenBSD]"
57
 #elif  __CYGWIN__
58
-#define NVM_ONOS        "[Cygwin]"
59
+#define ONOS    "[Cygwin]"
60
 #elif __APPLE__
61
-#define NVM_ONOS        "[Mac OS X]"
62
+#define ONOS    "[Mac OS X]"
63
 #else
64
-#define NVM_ONOS "[Unk-OS]"
65
+#define ONOS    "[Unk-OS]"
66
 #endif
67
 
68
 #if X86_64
69
-#define NVM_BITS        "[64 bit]"
70
+#define BITS    "[64 bit]"
71
 #else
72
-#define NVM_BITS        "[32 bit]"
73
+#define BITS    "[32 bit]"
74
+#endif
75
+
76
+#if defined(ENABLE_ASSEMBLY)
77
+#define ASM     ""
78
+#else
79
+#define ASM     "[noasm]"
80
+#endif
81
+ 
82
+#if NO_ATOMICS
83
+#define ATOMICS "[no-atomics]"
84
+#else
85
+#define ATOMICS ""
86
 #endif
87
 
88
 #if CHECKED_BUILD
89
-#define CHECKED         "[CHECKED] "
90
+#define CHECKED "[CHECKED] "
91
 #else
92
-#define CHECKED         " "
93
+#define CHECKED " "
94
 #endif
95
 
96
-#if HIGH_BIT_DEPTH
97
-#define BITDEPTH "16bpp"
98
-const int x265_max_bit_depth = 10;
99
+#if X265_DEPTH == 12
100
+
101
+#define BITDEPTH "12bit"
102
+const int PFX(max_bit_depth) = 12;
103
+
104
+#elif X265_DEPTH == 10
105
+
106
+#define BITDEPTH "10bit"
107
+const int PFX(max_bit_depth) = 10;
108
+
109
+#elif X265_DEPTH == 8
110
+
111
+#define BITDEPTH "8bit"
112
+const int PFX(max_bit_depth) = 8;
113
+
114
+#endif
115
+
116
+#if LINKED_8BIT
117
+#define ADD8 "+8bit"
118
+#else
119
+#define ADD8 ""
120
+#endif
121
+#if LINKED_10BIT
122
+#define ADD10 "+10bit"
123
+#else
124
+#define ADD10 ""
125
+#endif
126
+#if LINKED_12BIT
127
+#define ADD12 "+12bit"
128
 #else
129
-#define BITDEPTH "8bpp"
130
-const int x265_max_bit_depth = 8;
131
+#define ADD12 ""
132
 #endif
133
 
134
-const char *x265_version_str = XSTR(X265_VERSION);
135
-const char *x265_build_info_str = NVM_ONOS NVM_COMPILEDBY NVM_BITS CHECKED BITDEPTH;
136
+const char* PFX(version_str) = XSTR(X265_VERSION);
137
+const char* PFX(build_info_str) = ONOS COMPILEDBY BITS ASM ATOMICS CHECKED BITDEPTH ADD8 ADD10 ADD12;
138
x265_1.7.tar.gz/source/common/wavefront.cpp -> x265_1.8.tar.gz/source/common/wavefront.cpp Changed
10
 
1
@@ -26,7 +26,7 @@
2
 #include "wavefront.h"
3
 #include "common.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 bool WaveFront::init(int numRows)
10
x265_1.7.tar.gz/source/common/wavefront.h -> x265_1.8.tar.gz/source/common/wavefront.h Changed
18
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "threadpool.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 // Generic wave-front scheduler, manages busy-state of CU rows as a priority
10
@@ -92,6 +92,6 @@
11
     // derived classes.
12
     virtual void processRow(int row, int threadId) = 0;
13
 };
14
-} // end namespace x265
15
+} // end namespace X265_NS
16
 
17
 #endif // ifndef X265_WAVEFRONT_H
18
x265_1.7.tar.gz/source/common/winxp.cpp -> x265_1.8.tar.gz/source/common/winxp.cpp Changed
19
 
1
@@ -25,7 +25,7 @@
2
 
3
 #if defined(_WIN32) && (_WIN32_WINNT < 0x0600) // _WIN32_WINNT_VISTA
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 /* Mimic CONDITION_VARIABLE functions only supported on Vista+ */
8
 
9
 int WINAPI cond_init(ConditionVariable *cond)
10
@@ -121,7 +121,7 @@
11
     DeleteCriticalSection(&cond->broadcastMutex);
12
     DeleteCriticalSection(&cond->waiterCountMutex);
13
 }
14
-} // namespace x265
15
+} // namespace X265_NS
16
 
17
 #elif defined(_MSC_VER)
18
 
19
x265_1.7.tar.gz/source/common/winxp.h -> x265_1.8.tar.gz/source/common/winxp.h Changed
32
 
1
@@ -30,7 +30,7 @@
2
 #include <intrin.h> // _InterlockedCompareExchange64
3
 #endif
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 /* non-native condition variable */
8
 typedef struct
9
 {
10
@@ -49,14 +49,14 @@
11
 void cond_destroy(ConditionVariable *cond);
12
 
13
 /* map missing API symbols to our structure and functions */
14
-#define CONDITION_VARIABLE          x265::ConditionVariable
15
-#define InitializeConditionVariable x265::cond_init
16
-#define SleepConditionVariableCS    x265::cond_wait
17
-#define WakeConditionVariable       x265::cond_signal
18
-#define WakeAllConditionVariable    x265::cond_broadcast
19
-#define XP_CONDITION_VAR_FREE       x265::cond_destroy
20
+#define CONDITION_VARIABLE          X265_NS::ConditionVariable
21
+#define InitializeConditionVariable X265_NS::cond_init
22
+#define SleepConditionVariableCS    X265_NS::cond_wait
23
+#define WakeConditionVariable       X265_NS::cond_signal
24
+#define WakeAllConditionVariable    X265_NS::cond_broadcast
25
+#define XP_CONDITION_VAR_FREE       X265_NS::cond_destroy
26
 
27
-} // namespace x265
28
+} // namespace X265_NS
29
 
30
 #else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600)
31
 
32
x265_1.7.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.8.tar.gz/source/common/x86/asm-primitives.cpp Changed
201
 
1
@@ -28,6 +28,83 @@
2
 #include "x265.h"
3
 #include "cpu.h"
4
 
5
+#define FUNCDEF_TU(ret, name, cpu, ...) \
6
+    ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
7
+    ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
8
+    ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
9
+    ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
10
+    ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
11
+
12
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
13
+    ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
14
+    ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
15
+    ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
16
+    ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
17
+    ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
18
+
19
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
20
+    ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
21
+    ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
22
+    ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
23
+    ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
24
+    ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
25
+
26
+#define FUNCDEF_PU(ret, name, cpu, ...) \
27
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
28
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
29
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
30
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
31
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
32
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
33
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
34
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
35
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
36
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
37
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
38
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
39
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
40
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
41
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
42
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
43
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
44
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
45
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
46
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
47
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
48
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
49
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
50
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
51
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
52
+
53
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
54
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
55
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
56
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
57
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
58
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
59
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
60
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
61
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
62
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
63
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
64
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
65
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
66
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
67
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
68
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
69
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
70
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
71
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
72
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
73
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
74
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
75
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
76
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
77
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
78
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
79
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
80
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
81
+
82
 extern "C" {
83
 #include "pixel.h"
84
 #include "pixel-util.h"
85
@@ -40,31 +117,31 @@
86
 }
87
 
88
 #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
89
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
90
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
91
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
92
-    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu
93
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
94
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
95
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
96
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
97
 #define ALL_LUMA_CU_TYPED_S(prim, fncdef, fname, cpu) \
98
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## 8_ ## cpu; \
99
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \
100
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu; \
101
-    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## 64_ ## cpu
102
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## 8_ ## cpu); \
103
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## 16_ ## cpu); \
104
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## 32_ ## cpu); \
105
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## 64_ ## cpu)
106
 #define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
107
-    p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
108
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
109
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
110
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu
111
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
112
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
113
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
114
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
115
 #define ALL_LUMA_TU_TYPED_S(prim, fncdef, fname, cpu) \
116
-    p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## 4_ ## cpu; \
117
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## 8_ ## cpu; \
118
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \
119
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu
120
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## 4_ ## cpu); \
121
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## 8_ ## cpu); \
122
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## 16_ ## cpu); \
123
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## 32_ ## cpu)
124
 #define ALL_LUMA_BLOCKS_TYPED(prim, fncdef, fname, cpu) \
125
-    p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
126
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
127
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
128
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
129
-    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu;
130
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
131
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
132
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
133
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
134
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu);
135
 #define ALL_LUMA_CU(prim, fname, cpu)      ALL_LUMA_CU_TYPED(prim, , fname, cpu)
136
 #define ALL_LUMA_CU_S(prim, fname, cpu)    ALL_LUMA_CU_TYPED_S(prim, , fname, cpu)
137
 #define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
138
@@ -72,30 +149,30 @@
139
 #define ALL_LUMA_TU_S(prim, fname, cpu)    ALL_LUMA_TU_TYPED_S(prim, , fname, cpu)
140
 
141
 #define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
142
-    p.pu[LUMA_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
143
-    p.pu[LUMA_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
144
-    p.pu[LUMA_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
145
-    p.pu[LUMA_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu; \
146
-    p.pu[LUMA_8x4].prim   = fncdef x265_ ## fname ## _8x4_ ## cpu; \
147
-    p.pu[LUMA_4x8].prim   = fncdef x265_ ## fname ## _4x8_ ## cpu; \
148
-    p.pu[LUMA_16x8].prim  = fncdef x265_ ## fname ## _16x8_ ## cpu; \
149
-    p.pu[LUMA_8x16].prim  = fncdef x265_ ## fname ## _8x16_ ## cpu; \
150
-    p.pu[LUMA_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
151
-    p.pu[LUMA_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
152
-    p.pu[LUMA_64x32].prim = fncdef x265_ ## fname ## _64x32_ ## cpu; \
153
-    p.pu[LUMA_32x64].prim = fncdef x265_ ## fname ## _32x64_ ## cpu; \
154
-    p.pu[LUMA_16x12].prim = fncdef x265_ ## fname ## _16x12_ ## cpu; \
155
-    p.pu[LUMA_12x16].prim = fncdef x265_ ## fname ## _12x16_ ## cpu; \
156
-    p.pu[LUMA_16x4].prim  = fncdef x265_ ## fname ## _16x4_ ## cpu; \
157
-    p.pu[LUMA_4x16].prim  = fncdef x265_ ## fname ## _4x16_ ## cpu; \
158
-    p.pu[LUMA_32x24].prim = fncdef x265_ ## fname ## _32x24_ ## cpu; \
159
-    p.pu[LUMA_24x32].prim = fncdef x265_ ## fname ## _24x32_ ## cpu; \
160
-    p.pu[LUMA_32x8].prim  = fncdef x265_ ## fname ## _32x8_ ## cpu; \
161
-    p.pu[LUMA_8x32].prim  = fncdef x265_ ## fname ## _8x32_ ## cpu; \
162
-    p.pu[LUMA_64x48].prim = fncdef x265_ ## fname ## _64x48_ ## cpu; \
163
-    p.pu[LUMA_48x64].prim = fncdef x265_ ## fname ## _48x64_ ## cpu; \
164
-    p.pu[LUMA_64x16].prim = fncdef x265_ ## fname ## _64x16_ ## cpu; \
165
-    p.pu[LUMA_16x64].prim = fncdef x265_ ## fname ## _16x64_ ## cpu
166
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
167
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
168
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
169
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
170
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
171
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
172
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
173
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
174
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
175
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
176
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
177
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
178
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
179
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
180
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
181
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
182
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
183
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
184
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
185
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
186
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
187
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
188
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
189
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
190
 #define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
191
 
192
 #define ALL_LUMA_PU_T(prim, fname) \
193
@@ -125,237 +202,237 @@
194
     p.pu[LUMA_16x64].prim = fname<LUMA_16x64>
195
 
196
 #define ALL_CHROMA_420_CU_TYPED(prim, fncdef, fname, cpu) \
197
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
198
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
199
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
200
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu
201
x265_1.7.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.8.tar.gz/source/common/x86/blockcopy8.asm Changed
175
 
1
@@ -3043,43 +3043,31 @@
2
 ;-----------------------------------------------------------------------------
3
 %macro BLOCKCOPY_PS_W32_H4_avx2 2
4
 INIT_YMM avx2
5
-cglobal blockcopy_ps_%1x%2, 4, 7, 3
6
+cglobal blockcopy_ps_%1x%2, 4, 7, 2
7
     add     r1, r1
8
     mov     r4d, %2/4
9
     lea     r5, [3 * r3]
10
     lea     r6, [3 * r1]
11
-    pxor    m0, m0
12
-
13
 .loop:
14
-    movu          m1, [r2]
15
-    punpcklbw     m2, m1, m0
16
-    punpckhbw     m1, m1, m0
17
-    vperm2i128    m3, m2, m1, 00100000b
18
-    vperm2i128    m2, m2, m1, 00110001b
19
-    movu          [r0], m3
20
-    movu          [r0 + 32], m2
21
-    movu          m1, [r2 + r3]
22
-    punpcklbw     m2, m1, m0
23
-    punpckhbw     m1, m1, m0
24
-    vperm2i128    m3, m2, m1, 00100000b
25
-    vperm2i128    m2, m2, m1, 00110001b
26
-    movu          [r0 + r1], m3
27
-    movu          [r0 + r1 + 32], m2
28
-    movu          m1, [r2 + 2 * r3]
29
-    punpcklbw     m2, m1, m0
30
-    punpckhbw     m1, m1, m0
31
-    vperm2i128    m3, m2, m1, 00100000b
32
-    vperm2i128    m2, m2, m1, 00110001b
33
-    movu          [r0 + 2 * r1], m3
34
-    movu          [r0 + 2 * r1 + 32], m2
35
-    movu          m1, [r2 + r5]
36
-    punpcklbw     m2, m1, m0
37
-    punpckhbw     m1, m1, m0
38
-    vperm2i128    m3, m2, m1, 00100000b
39
-    vperm2i128    m2, m2, m1, 00110001b
40
-    movu          [r0 + r6], m3
41
-    movu          [r0 + r6 + 32], m2
42
-
43
+    pmovzxbw      m0, [r2 +  0]
44
+    pmovzxbw      m1, [r2 + 16]
45
+    movu          [r0 +  0], m0
46
+    movu          [r0 + 32], m1
47
+
48
+    pmovzxbw      m0, [r2 + r3 +  0]
49
+    pmovzxbw      m1, [r2 + r3 + 16]
50
+    movu          [r0 + r1 +  0], m0
51
+    movu          [r0 + r1 + 32], m1
52
+
53
+    pmovzxbw      m0, [r2 + r3 * 2 +  0]
54
+    pmovzxbw      m1, [r2 + r3 * 2 + 16]
55
+    movu          [r0 + r1 * 2 +  0], m0
56
+    movu          [r0 + r1 * 2 + 32], m1
57
+
58
+    pmovzxbw      m0, [r2 + r5 +  0]
59
+    pmovzxbw      m1, [r2 + r5 + 16]
60
+    movu          [r0 + r6 +  0], m0
61
+    movu          [r0 + r6 + 32], m1
62
     lea           r0, [r0 + 4 * r1]
63
     lea           r2, [r2 + 4 * r3]
64
     dec           r4d
65
@@ -3228,71 +3216,49 @@
66
 INIT_YMM avx2
67
 cglobal blockcopy_ps_64x64, 4, 7, 4
68
     add     r1, r1
69
-    mov     r4d, 64/4
70
+    mov     r4d, 64/8
71
     lea     r5, [3 * r3]
72
     lea     r6, [3 * r1]
73
-    pxor    m0, m0
74
-
75
 .loop:
76
-    movu          m1, [r2]
77
-    punpcklbw     m2, m1, m0
78
-    punpckhbw     m1, m1, m0
79
-    vperm2i128    m3, m2, m1, 00100000b
80
-    vperm2i128    m2, m2, m1, 00110001b
81
-    movu          [r0], m3
82
-    movu          [r0 + 32], m2
83
-    movu          m1, [r2 + 32]
84
-    punpcklbw     m2, m1, m0
85
-    punpckhbw     m1, m1, m0
86
-    vperm2i128    m3, m2, m1, 00100000b
87
-    vperm2i128    m2, m2, m1, 00110001b
88
-    movu          [r0 + 64], m3
89
-    movu          [r0 + 96], m2
90
-    movu          m1, [r2 + r3]
91
-    punpcklbw     m2, m1, m0
92
-    punpckhbw     m1, m1, m0
93
-    vperm2i128    m3, m2, m1, 00100000b
94
-    vperm2i128    m2, m2, m1, 00110001b
95
-    movu          [r0 + r1], m3
96
-    movu          [r0 + r1 + 32], m2
97
-    movu          m1, [r2 + r3 + 32]
98
-    punpcklbw     m2, m1, m0
99
-    punpckhbw     m1, m1, m0
100
-    vperm2i128    m3, m2, m1, 00100000b
101
-    vperm2i128    m2, m2, m1, 00110001b
102
-    movu          [r0 + r1 + 64], m3
103
-    movu          [r0 + r1 + 96], m2
104
-    movu          m1, [r2 + 2 * r3]
105
-    punpcklbw     m2, m1, m0
106
-    punpckhbw     m1, m1, m0
107
-    vperm2i128    m3, m2, m1, 00100000b
108
-    vperm2i128    m2, m2, m1, 00110001b
109
-    movu          [r0 + 2 * r1], m3
110
-    movu          [r0 + 2 * r1 + 32], m2
111
-    movu          m1, [r2 + 2 * r3 + 32]
112
-    punpcklbw     m2, m1, m0
113
-    punpckhbw     m1, m1, m0
114
-    vperm2i128    m3, m2, m1, 00100000b
115
-    vperm2i128    m2, m2, m1, 00110001b
116
-    movu          [r0 + 2 * r1 + 64], m3
117
-    movu          [r0 + 2 * r1 + 96], m2
118
-    movu          m1, [r2 + r5]
119
-    punpcklbw     m2, m1, m0
120
-    punpckhbw     m1, m1, m0
121
-    vperm2i128    m3, m2, m1, 00100000b
122
-    vperm2i128    m2, m2, m1, 00110001b
123
-    movu          [r0 + r6], m3
124
-    movu          [r0 + r6 + 32], m2
125
-    movu          m1, [r2 + r5 + 32]
126
-    punpcklbw     m2, m1, m0
127
-    punpckhbw     m1, m1, m0
128
-    vperm2i128    m3, m2, m1, 00100000b
129
-    vperm2i128    m2, m2, m1, 00110001b
130
-    movu          [r0 + r6 + 64], m3
131
-    movu          [r0 + r6 + 96], m2
132
-
133
+%rep 2
134
+    pmovzxbw      m0, [r2 +  0]
135
+    pmovzxbw      m1, [r2 + 16]
136
+    pmovzxbw      m2, [r2 + 32]
137
+    pmovzxbw      m3, [r2 + 48]
138
+    movu          [r0 +  0], m0
139
+    movu          [r0 + 32], m1
140
+    movu          [r0 + 64], m2
141
+    movu          [r0 + 96], m3
142
+
143
+    pmovzxbw      m0, [r2 + r3 +  0]
144
+    pmovzxbw      m1, [r2 + r3 + 16]
145
+    pmovzxbw      m2, [r2 + r3 + 32]
146
+    pmovzxbw      m3, [r2 + r3 + 48]
147
+    movu          [r0 + r1 +  0], m0
148
+    movu          [r0 + r1 + 32], m1
149
+    movu          [r0 + r1 + 64], m2
150
+    movu          [r0 + r1 + 96], m3
151
+
152
+    pmovzxbw      m0, [r2 + r3 * 2 +  0]
153
+    pmovzxbw      m1, [r2 + r3 * 2 + 16]
154
+    pmovzxbw      m2, [r2 + r3 * 2 + 32]
155
+    pmovzxbw      m3, [r2 + r3 * 2 + 48]
156
+    movu          [r0 + r1 * 2 +  0], m0
157
+    movu          [r0 + r1 * 2 + 32], m1
158
+    movu          [r0 + r1 * 2 + 64], m2
159
+    movu          [r0 + r1 * 2 + 96], m3
160
+
161
+    pmovzxbw      m0, [r2 + r5 +  0]
162
+    pmovzxbw      m1, [r2 + r5 + 16]
163
+    pmovzxbw      m2, [r2 + r5 + 32]
164
+    pmovzxbw      m3, [r2 + r5 + 48]
165
+    movu          [r0 + r6 +  0], m0
166
+    movu          [r0 + r6 + 32], m1
167
+    movu          [r0 + r6 + 64], m2
168
+    movu          [r0 + r6 + 96], m3
169
     lea           r0, [r0 + 4 * r1]
170
     lea           r2, [r2 + 4 * r3]
171
+%endrep
172
     dec           r4d
173
     jnz           .loop
174
     RET
175
x265_1.7.tar.gz/source/common/x86/blockcopy8.h -> x265_1.8.tar.gz/source/common/x86/blockcopy8.h Changed
201
 
1
@@ -24,240 +24,40 @@
2
 #ifndef X265_BLOCKCOPY8_H
3
 #define X265_BLOCKCOPY8_H
4
 
5
-void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
6
-void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
7
-void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
8
-void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
9
-void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
10
-void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
11
-void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
12
-void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
13
-void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
14
-void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
15
-void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
16
-void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
17
-void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
18
-void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
19
-void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
20
-void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
21
-void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
22
-void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
23
-void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
24
-void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
25
-void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
26
-void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
27
-void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
28
-void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
29
-void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
30
-void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
31
-void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
32
-void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
33
-void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
34
-void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
35
-uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
36
-uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
37
-uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
38
-uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
39
-uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
40
-uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
41
-uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
42
-uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
43
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
44
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
45
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
46
 
47
-#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
48
-    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
49
-    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \
50
-    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
51
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
52
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
53
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
54
 
55
-#define SETUP_BLOCKCOPY_PS(W, H, cpu) \
56
-    void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
57
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
58
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
59
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
60
 
61
-#define SETUP_BLOCKCOPY_SP(W, H, cpu) \
62
-    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
63
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
64
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
65
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
66
 
67
-#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
68
-    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
69
-    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
70
+FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
71
+FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
72
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
73
 
74
-#define BLOCKCOPY_COMMON(cpu) \
75
-    SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
76
-    SETUP_BLOCKCOPY_FUNC(4, 2, cpu); \
77
-    SETUP_BLOCKCOPY_FUNC(8, 8, cpu); \
78
-    SETUP_BLOCKCOPY_FUNC(8, 4, cpu); \
79
-    SETUP_BLOCKCOPY_FUNC(4, 8, cpu); \
80
-    SETUP_BLOCKCOPY_FUNC(8, 6, cpu); \
81
-    SETUP_BLOCKCOPY_FUNC(8, 2, cpu); \
82
-    SETUP_BLOCKCOPY_FUNC(16, 16, cpu); \
83
-    SETUP_BLOCKCOPY_FUNC(16, 8, cpu); \
84
-    SETUP_BLOCKCOPY_FUNC(8, 16, cpu); \
85
-    SETUP_BLOCKCOPY_FUNC(16, 12, cpu); \
86
-    SETUP_BLOCKCOPY_FUNC(12, 16, cpu); \
87
-    SETUP_BLOCKCOPY_FUNC(16, 4, cpu); \
88
-    SETUP_BLOCKCOPY_FUNC(4, 16, cpu); \
89
-    SETUP_BLOCKCOPY_FUNC(32, 32, cpu); \
90
-    SETUP_BLOCKCOPY_FUNC(32, 16, cpu); \
91
-    SETUP_BLOCKCOPY_FUNC(16, 32, cpu); \
92
-    SETUP_BLOCKCOPY_FUNC(32, 24, cpu); \
93
-    SETUP_BLOCKCOPY_FUNC(24, 32, cpu); \
94
-    SETUP_BLOCKCOPY_FUNC(32, 8, cpu); \
95
-    SETUP_BLOCKCOPY_FUNC(8, 32, cpu); \
96
-    SETUP_BLOCKCOPY_FUNC(64, 64, cpu); \
97
-    SETUP_BLOCKCOPY_FUNC(64, 32, cpu); \
98
-    SETUP_BLOCKCOPY_FUNC(32, 64, cpu); \
99
-    SETUP_BLOCKCOPY_FUNC(64, 48, cpu); \
100
-    SETUP_BLOCKCOPY_FUNC(48, 64, cpu); \
101
-    SETUP_BLOCKCOPY_FUNC(64, 16, cpu); \
102
-    SETUP_BLOCKCOPY_FUNC(16, 64, cpu);
103
+FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
104
+FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
105
 
106
-#define BLOCKCOPY_SP(cpu) \
107
-    SETUP_BLOCKCOPY_SP(2, 4, cpu); \
108
-    SETUP_BLOCKCOPY_SP(2, 8, cpu); \
109
-    SETUP_BLOCKCOPY_SP(6, 8, cpu); \
110
-    \
111
-    SETUP_BLOCKCOPY_SP(2, 16, cpu); \
112
-    SETUP_BLOCKCOPY_SP(4, 32, cpu); \
113
-    SETUP_BLOCKCOPY_SP(6, 16, cpu); \
114
-    SETUP_BLOCKCOPY_SP(8, 12, cpu); \
115
-    SETUP_BLOCKCOPY_SP(8, 64, cpu); \
116
-    SETUP_BLOCKCOPY_SP(12, 32, cpu); \
117
-    SETUP_BLOCKCOPY_SP(16, 24, cpu); \
118
-    SETUP_BLOCKCOPY_SP(24, 64, cpu); \
119
-    SETUP_BLOCKCOPY_SP(32, 48, cpu);
120
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
121
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
122
 
123
-#define BLOCKCOPY_SS_PP(cpu) \
124
-    SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \
125
-    SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \
126
-    SETUP_BLOCKCOPY_SS_PP(6, 8, cpu); \
127
-    \
128
-    SETUP_BLOCKCOPY_SS_PP(2, 16, cpu); \
129
-    SETUP_BLOCKCOPY_SS_PP(4, 32, cpu); \
130
-    SETUP_BLOCKCOPY_SS_PP(6, 16, cpu); \
131
-    SETUP_BLOCKCOPY_SS_PP(8, 12, cpu); \
132
-    SETUP_BLOCKCOPY_SS_PP(8, 64, cpu); \
133
-    SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \
134
-    SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \
135
-    SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \
136
-    SETUP_BLOCKCOPY_SS_PP(32, 48, cpu);
137
-    
138
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
139
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
140
 
141
-#define BLOCKCOPY_PS(cpu) \
142
-    SETUP_BLOCKCOPY_PS(2, 4, cpu); \
143
-    SETUP_BLOCKCOPY_PS(2, 8, cpu); \
144
-    SETUP_BLOCKCOPY_PS(4, 2, cpu); \
145
-    SETUP_BLOCKCOPY_PS(4, 4, cpu); \
146
-    SETUP_BLOCKCOPY_PS(4, 8, cpu); \
147
-    SETUP_BLOCKCOPY_PS(4, 16, cpu); \
148
-    SETUP_BLOCKCOPY_PS(6, 8, cpu); \
149
-    SETUP_BLOCKCOPY_PS(8, 2, cpu); \
150
-    SETUP_BLOCKCOPY_PS(8, 4, cpu); \
151
-    SETUP_BLOCKCOPY_PS(8, 6, cpu); \
152
-    SETUP_BLOCKCOPY_PS(8, 8, cpu); \
153
-    SETUP_BLOCKCOPY_PS(8, 16, cpu); \
154
-    SETUP_BLOCKCOPY_PS(8, 32, cpu); \
155
-    SETUP_BLOCKCOPY_PS(12, 16, cpu); \
156
-    SETUP_BLOCKCOPY_PS(16, 4, cpu); \
157
-    SETUP_BLOCKCOPY_PS(16, 8, cpu); \
158
-    SETUP_BLOCKCOPY_PS(16, 12, cpu); \
159
-    SETUP_BLOCKCOPY_PS(16, 16, cpu); \
160
-    SETUP_BLOCKCOPY_PS(16, 32, cpu); \
161
-    SETUP_BLOCKCOPY_PS(24, 32, cpu); \
162
-    SETUP_BLOCKCOPY_PS(32,  8, cpu); \
163
-    SETUP_BLOCKCOPY_PS(32, 16, cpu); \
164
-    SETUP_BLOCKCOPY_PS(32, 24, cpu); \
165
-    SETUP_BLOCKCOPY_PS(32, 32, cpu); \
166
-    SETUP_BLOCKCOPY_PS(16, 64, cpu); \
167
-    SETUP_BLOCKCOPY_PS(32, 64, cpu); \
168
-    SETUP_BLOCKCOPY_PS(48, 64, cpu); \
169
-    SETUP_BLOCKCOPY_PS(64, 16, cpu); \
170
-    SETUP_BLOCKCOPY_PS(64, 32, cpu); \
171
-    SETUP_BLOCKCOPY_PS(64, 48, cpu); \
172
-    SETUP_BLOCKCOPY_PS(64, 64, cpu); \
173
-    \
174
-    SETUP_BLOCKCOPY_PS(2, 16, cpu); \
175
-    SETUP_BLOCKCOPY_PS(4, 32, cpu); \
176
-    SETUP_BLOCKCOPY_PS(6, 16, cpu); \
177
-    SETUP_BLOCKCOPY_PS(8, 12, cpu); \
178
-    SETUP_BLOCKCOPY_PS(8, 64, cpu); \
179
-    SETUP_BLOCKCOPY_PS(12, 32, cpu); \
180
-    SETUP_BLOCKCOPY_PS(16, 24, cpu); \
181
-    SETUP_BLOCKCOPY_PS(24, 64, cpu); \
182
-    SETUP_BLOCKCOPY_PS(32, 48, cpu);
183
-
184
-BLOCKCOPY_COMMON(_sse2);
185
-BLOCKCOPY_SS_PP(_sse2);
186
-BLOCKCOPY_SP(_sse4);
187
-BLOCKCOPY_PS(_sse4);
188
-
189
-BLOCKCOPY_SP(_sse2);
190
-
191
-void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val);
192
-void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
193
-void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
194
-void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
195
-void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
196
-void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
197
-void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
198
-void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
199
-void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
200
-void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
201
x265_1.7.tar.gz/source/common/x86/const-a.asm -> x265_1.8.tar.gz/source/common/x86/const-a.asm Changed
86
 
1
@@ -41,7 +41,7 @@
2
 const pb_16,                times 32 db 16
3
 const pb_32,                times 32 db 32
4
 const pb_64,                times 32 db 64
5
-const pb_128,               times 16 db 128
6
+const pb_128,               times 32 db 128
7
 const pb_a1,                times 16 db 0xa1
8
 
9
 const pb_01,                times  8 db   0,   1
10
@@ -62,7 +62,9 @@
11
 ;; 16-bit constants
12
 
13
 const pw_1,                 times 16 dw 1
14
-const pw_2,                 times  8 dw 2
15
+const pw_2,                 times 16 dw 2
16
+const pw_3,                 times 16 dw 3
17
+const pw_7,                 times 16 dw 7
18
 const pw_m2,                times  8 dw -2
19
 const pw_4,                 times  8 dw 4
20
 const pw_8,                 times  8 dw 8
21
@@ -75,9 +77,11 @@
22
 const pw_256,               times 16 dw 256
23
 const pw_257,               times 16 dw 257
24
 const pw_512,               times 16 dw 512
25
-const pw_1023,              times  8 dw 1023
26
+const pw_1023,              times 16 dw 1023
27
 const pw_1024,              times 16 dw 1024
28
+const pw_2048,              times 16 dw 2048
29
 const pw_4096,              times 16 dw 4096
30
+const pw_8192,              times  8 dw 8192
31
 const pw_00ff,              times 16 dw 0x00ff
32
 const pw_ff00,              times  8 dw 0xff00
33
 const pw_2000,              times 16 dw 0x2000
34
@@ -90,7 +94,7 @@
35
 const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
36
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
37
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
38
-const pw_pmpmpmpm,          times  1 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
39
+const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
40
 const pw_pmmpzzzz,          times  1 dw   1,  -1,  -1,   1,   0,   0,   0,   0
41
 const multi_2Row,           times  1 dw   1,   2,   3,   4,   1,   2,   3,   4
42
 const multiH,               times  1 dw   9,  10,  11,  12,  13,  14,  15,  16
43
@@ -100,7 +104,9 @@
44
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
45
 const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
46
 const pw_FFFFFFFFFFFFFFF0,           dw 0x00
47
-                            times 7  dw 0xff
48
+                            times  7 dw 0xff
49
+const hmul_16p,             times 16 db   1
50
+                            times  8 db   1,  -1
51
 
52
 
53
 ;; 32-bit constants
54
@@ -109,8 +115,9 @@
55
 const pd_2,                 times  8 dd 2
56
 const pd_4,                 times  4 dd 4
57
 const pd_8,                 times  4 dd 8
58
-const pd_16,                times  4 dd 16
59
-const pd_32,                times  4 dd 32
60
+const pd_16,                times  8 dd 16
61
+const pd_31,                times  4 dd 31
62
+const pd_32,                times  8 dd 32
63
 const pd_64,                times  4 dd 64
64
 const pd_128,               times  4 dd 128
65
 const pd_256,               times  4 dd 256
66
@@ -119,10 +126,11 @@
67
 const pd_2048,              times  4 dd 2048
68
 const pd_ffff,              times  4 dd 0xffff
69
 const pd_32767,             times  4 dd 32767
70
-const pd_n32768,            times  4 dd 0xffff8000
71
+const pd_524416,            times  4 dd 524416
72
+const pd_n32768,            times  8 dd 0xffff8000
73
+const pd_n131072,           times  4 dd 0xfffe0000
74
 
75
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
76
-const deinterleave_shufd,   times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
77
 
78
 const popcnt_table
79
 %assign x 0
80
@@ -131,5 +139,3 @@
81
 db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
82
 %assign x x+1
83
 %endrep
84
-
85
-const sw_64,       dd 64
86
x265_1.7.tar.gz/source/common/x86/dct8.asm -> x265_1.8.tar.gz/source/common/x86/dct8.asm Changed
201
 
1
@@ -157,7 +157,7 @@
2
 
3
 idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7
4
 
5
-idct8_shuf2:    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
6
+const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
7
 
8
 idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
9
 
10
@@ -332,20 +332,48 @@
11
 cextern pd_2048
12
 cextern pw_ppppmmmm
13
 cextern trans8_shuf
14
+
15
+
16
+%if BIT_DEPTH == 12
17
+    %define     DCT4_SHIFT          5
18
+    %define     DCT4_ROUND          16
19
+    %define    IDCT_SHIFT           8
20
+    %define    IDCT_ROUND           128
21
+    %define     DST4_SHIFT          5
22
+    %define     DST4_ROUND          16
23
+    %define     DCT8_SHIFT1         6
24
+    %define     DCT8_ROUND1         32
25
+%elif BIT_DEPTH == 10
26
+    %define     DCT4_SHIFT          3
27
+    %define     DCT4_ROUND          4
28
+    %define    IDCT_SHIFT           10
29
+    %define    IDCT_ROUND           512
30
+    %define     DST4_SHIFT          3
31
+    %define     DST4_ROUND          4
32
+    %define     DCT8_SHIFT1         4
33
+    %define     DCT8_ROUND1         8
34
+%elif BIT_DEPTH == 8
35
+    %define     DCT4_SHIFT          1
36
+    %define     DCT4_ROUND          1
37
+    %define    IDCT_SHIFT           12
38
+    %define    IDCT_ROUND           2048
39
+    %define     DST4_SHIFT          1
40
+    %define     DST4_ROUND          1
41
+    %define     DCT8_SHIFT1         2
42
+    %define     DCT8_ROUND1         2
43
+%else
44
+    %error Unsupported BIT_DEPTH!
45
+%endif
46
+
47
+%define         DCT8_ROUND2         256
48
+%define         DCT8_SHIFT2         9
49
+
50
 ;------------------------------------------------------
51
 ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
52
 ;------------------------------------------------------
53
 INIT_XMM sse2
54
 cglobal dct4, 3, 4, 8
55
-%if BIT_DEPTH == 10
56
-  %define       DCT_SHIFT 3
57
-  mova          m7, [pd_4]
58
-%elif BIT_DEPTH == 8
59
-  %define       DCT_SHIFT 1
60
-  mova          m7, [pd_1]
61
-%else
62
-  %error Unsupported BIT_DEPTH!
63
-%endif
64
+    mova        m7, [pd_ %+ DCT4_ROUND]
65
     add         r2d, r2d
66
     lea         r3, [tab_dct4]
67
 
68
@@ -372,19 +400,19 @@
69
     psubw       m2, m0
70
     pmaddwd     m0, m1, m4
71
     paddd       m0, m7
72
-    psrad       m0, DCT_SHIFT
73
+    psrad       m0, DCT4_SHIFT
74
     pmaddwd     m3, m2, m5
75
     paddd       m3, m7
76
-    psrad       m3, DCT_SHIFT
77
+    psrad       m3, DCT4_SHIFT
78
     packssdw    m0, m3
79
     pshufd      m0, m0, 0xD8
80
     pshufhw     m0, m0, 0xB1
81
     pmaddwd     m1, m6
82
     paddd       m1, m7
83
-    psrad       m1, DCT_SHIFT
84
+    psrad       m1, DCT4_SHIFT
85
     pmaddwd     m2, [r3 + 3 * 16]
86
     paddd       m2, m7
87
-    psrad       m2, DCT_SHIFT
88
+    psrad       m2, DCT4_SHIFT
89
     packssdw    m1, m2
90
     pshufd      m1, m1, 0xD8
91
     pshufhw     m1, m1, 0xB1
92
@@ -431,15 +459,7 @@
93
 ; - r2:     source stride
94
 INIT_YMM avx2
95
 cglobal dct4, 3, 4, 8, src, dst, srcStride
96
-%if BIT_DEPTH == 10
97
-    %define DCT_SHIFT 3
98
-    vbroadcasti128 m7, [pd_4]
99
-%elif BIT_DEPTH == 8
100
-    %define DCT_SHIFT 1
101
-    vbroadcasti128 m7, [pd_1]
102
-%else
103
-    %error Unsupported BIT_DEPTH!
104
-%endif
105
+    vbroadcasti128  m7, [pd_ %+ DCT4_ROUND]
106
     add             r2d, r2d
107
     lea             r3, [avx2_dct4]
108
 
109
@@ -461,11 +481,11 @@
110
 
111
     pmaddwd         m2, m5
112
     paddd           m2, m7
113
-    psrad           m2, DCT_SHIFT
114
+    psrad           m2, DCT4_SHIFT
115
 
116
     pmaddwd         m0, m6
117
     paddd           m0, m7
118
-    psrad           m0, DCT_SHIFT
119
+    psrad           m0, DCT4_SHIFT
120
 
121
     packssdw        m2, m0
122
     pshufb          m2, m4
123
@@ -493,30 +513,19 @@
124
 ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
125
 ;-------------------------------------------------------
126
 INIT_XMM sse2
127
-cglobal idct4, 3, 4, 7
128
-%if BIT_DEPTH == 8
129
-  %define IDCT4_OFFSET  [pd_2048]
130
-  %define IDCT4_SHIFT   12
131
-%elif BIT_DEPTH == 10
132
-  %define IDCT4_OFFSET  [pd_512]
133
-  %define IDCT4_SHIFT   10
134
-%else
135
-  %error Unsupported BIT_DEPTH!
136
-%endif
137
+cglobal idct4, 3, 4, 6
138
     add         r2d, r2d
139
     lea         r3, [tab_dct4]
140
 
141
-    mova        m6, [pd_64]
142
-
143
     movu        m0, [r0 + 0 * 16]
144
     movu        m1, [r0 + 1 * 16]
145
 
146
     punpcklwd   m2, m0, m1
147
     pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
148
-    paddd       m3, m6
149
+    paddd       m3, [pd_64]
150
 
151
     pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
152
-    paddd       m2, m6
153
+    paddd       m2, [pd_64]
154
 
155
     punpckhwd   m0, m1
156
     pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
157
@@ -540,29 +549,27 @@
158
     punpcklwd   m0, m1, m4                  ; m0 = m128iA
159
     punpckhwd   m1, m4                      ; m1 = m128iD
160
 
161
-    mova        m6, IDCT4_OFFSET
162
-
163
     punpcklwd   m2, m0, m1
164
     pmaddwd     m3, m2, [r3 + 0 * 16]
165
-    paddd       m3, m6                      ; m3 = E1
166
+    paddd       m3, [pd_ %+ IDCT_ROUND]     ; m3 = E1
167
 
168
     pmaddwd     m2, [r3 + 2 * 16]
169
-    paddd       m2, m6                      ; m2 = E2
170
+    paddd       m2, [pd_ %+ IDCT_ROUND]     ; m2 = E2
171
 
172
     punpckhwd   m0, m1
173
     pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
174
     pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2
175
 
176
     paddd       m4, m3, m1
177
-    psrad       m4, IDCT4_SHIFT             ; m4 = m128iA
178
+    psrad       m4, IDCT_SHIFT              ; m4 = m128iA
179
     paddd       m5, m2, m0
180
-    psrad       m5, IDCT4_SHIFT
181
+    psrad       m5, IDCT_SHIFT
182
     packssdw    m4, m5                      ; m4 = m128iA
183
 
184
     psubd       m2, m0
185
-    psrad       m2, IDCT4_SHIFT
186
+    psrad       m2, IDCT_SHIFT
187
     psubd       m3, m1
188
-    psrad       m3, IDCT4_SHIFT
189
+    psrad       m3, IDCT_SHIFT
190
     packssdw    m2, m3                      ; m2 = m128iD
191
 
192
     punpcklwd   m1, m4, m2
193
@@ -576,7 +583,139 @@
194
     movlps      [r1 + 2 * r2], m1
195
     lea         r1, [r1 + 2 * r2]
196
     movhps      [r1 + r2], m1
197
+    RET
198
+
199
+;------------------------------------------------------
200
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
201
x265_1.7.tar.gz/source/common/x86/dct8.h -> x265_1.8.tar.gz/source/common/x86/dct8.h Changed
45
 
1
@@ -23,27 +23,23 @@
2
 
3
 #ifndef X265_DCT8_H
4
 #define X265_DCT8_H
5
-void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
6
-void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
7
-void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
8
-void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
9
-void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
10
-void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
11
-void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
12
-void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
13
-void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
14
 
15
-void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
16
-void x265_idst4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
17
-void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
18
-void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
19
-void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
20
-void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
21
-void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
22
-void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
23
-void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
24
+FUNCDEF_TU_S2(void, dct, sse2, const int16_t* src, int16_t* dst, intptr_t srcStride);
25
+FUNCDEF_TU_S2(void, dct, ssse3, const int16_t* src, int16_t* dst, intptr_t srcStride);
26
+FUNCDEF_TU_S2(void, dct, sse4, const int16_t* src, int16_t* dst, intptr_t srcStride);
27
+FUNCDEF_TU_S2(void, dct, avx2, const int16_t* src, int16_t* dst, intptr_t srcStride);
28
 
29
-void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
30
-void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
31
+FUNCDEF_TU_S2(void, idct, sse2, const int16_t* src, int16_t* dst, intptr_t dstStride);
32
+FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
33
+FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
34
+FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
35
+
36
+void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
37
+void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
38
+void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
39
+void PFX(dst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
40
+void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
41
+void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
42
+void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
43
 
44
 #endif // ifndef X265_DCT8_H
45
x265_1.7.tar.gz/source/common/x86/intrapred.h -> x265_1.8.tar.gz/source/common/x86/intrapred.h Changed
201
 
1
@@ -26,262 +26,68 @@
2
 #ifndef X265_INTRAPRED_H
3
 #define X265_INTRAPRED_H
4
 
5
-void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
6
-void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
7
-void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
8
-void x265_intra_pred_dc32_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
9
-void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
10
-void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
11
-void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
12
-void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
13
-void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
14
-
15
-void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
16
-void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
17
-void x265_intra_pred_planar16_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
18
-void x265_intra_pred_planar32_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
19
-void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
20
-void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
21
-void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
22
-void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
23
-void x265_intra_pred_planar16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
24
-void x265_intra_pred_planar32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
25
-
26
 #define DECL_ANG(bsize, mode, cpu) \
27
-    void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
28
+    void PFX(intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
29
+
30
+#define DECL_ANGS(bsize, cpu) \
31
+    DECL_ANG(bsize, 2, cpu); \
32
+    DECL_ANG(bsize, 3, cpu); \
33
+    DECL_ANG(bsize, 4, cpu); \
34
+    DECL_ANG(bsize, 5, cpu); \
35
+    DECL_ANG(bsize, 6, cpu); \
36
+    DECL_ANG(bsize, 7, cpu); \
37
+    DECL_ANG(bsize, 8, cpu); \
38
+    DECL_ANG(bsize, 9, cpu); \
39
+    DECL_ANG(bsize, 10, cpu); \
40
+    DECL_ANG(bsize, 11, cpu); \
41
+    DECL_ANG(bsize, 12, cpu); \
42
+    DECL_ANG(bsize, 13, cpu); \
43
+    DECL_ANG(bsize, 14, cpu); \
44
+    DECL_ANG(bsize, 15, cpu); \
45
+    DECL_ANG(bsize, 16, cpu); \
46
+    DECL_ANG(bsize, 17, cpu); \
47
+    DECL_ANG(bsize, 18, cpu); \
48
+    DECL_ANG(bsize, 19, cpu); \
49
+    DECL_ANG(bsize, 20, cpu); \
50
+    DECL_ANG(bsize, 21, cpu); \
51
+    DECL_ANG(bsize, 22, cpu); \
52
+    DECL_ANG(bsize, 23, cpu); \
53
+    DECL_ANG(bsize, 24, cpu); \
54
+    DECL_ANG(bsize, 25, cpu); \
55
+    DECL_ANG(bsize, 26, cpu); \
56
+    DECL_ANG(bsize, 27, cpu); \
57
+    DECL_ANG(bsize, 28, cpu); \
58
+    DECL_ANG(bsize, 29, cpu); \
59
+    DECL_ANG(bsize, 30, cpu); \
60
+    DECL_ANG(bsize, 31, cpu); \
61
+    DECL_ANG(bsize, 32, cpu); \
62
+    DECL_ANG(bsize, 33, cpu); \
63
+    DECL_ANG(bsize, 34, cpu)
64
 
65
-DECL_ANG(4, 2, sse2);
66
-DECL_ANG(4, 3, sse2);
67
-DECL_ANG(4, 4, sse2);
68
-DECL_ANG(4, 5, sse2);
69
-DECL_ANG(4, 6, sse2);
70
-DECL_ANG(4, 7, sse2);
71
-DECL_ANG(4, 8, sse2);
72
-DECL_ANG(4, 9, sse2);
73
-DECL_ANG(4, 10, sse2);
74
-DECL_ANG(4, 11, sse2);
75
-DECL_ANG(4, 12, sse2);
76
-DECL_ANG(4, 13, sse2);
77
-DECL_ANG(4, 14, sse2);
78
-DECL_ANG(4, 15, sse2);
79
-DECL_ANG(4, 16, sse2);
80
-DECL_ANG(4, 17, sse2);
81
-DECL_ANG(4, 18, sse2);
82
-DECL_ANG(4, 26, sse2);
83
+#define DECL_ALL(cpu) \
84
+    FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
85
+    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
86
+    DECL_ANGS(4, cpu); \
87
+    DECL_ANGS(8, cpu); \
88
+    DECL_ANGS(16, cpu); \
89
+    DECL_ANGS(32, cpu)
90
 
91
-DECL_ANG(4, 2, ssse3);
92
-DECL_ANG(4, 3, sse4);
93
-DECL_ANG(4, 4, sse4);
94
-DECL_ANG(4, 5, sse4);
95
-DECL_ANG(4, 6, sse4);
96
-DECL_ANG(4, 7, sse4);
97
-DECL_ANG(4, 8, sse4);
98
-DECL_ANG(4, 9, sse4);
99
-DECL_ANG(4, 10, sse4);
100
-DECL_ANG(4, 11, sse4);
101
-DECL_ANG(4, 12, sse4);
102
-DECL_ANG(4, 13, sse4);
103
-DECL_ANG(4, 14, sse4);
104
-DECL_ANG(4, 15, sse4);
105
-DECL_ANG(4, 16, sse4);
106
-DECL_ANG(4, 17, sse4);
107
-DECL_ANG(4, 18, sse4);
108
-DECL_ANG(4, 26, sse4);
109
-DECL_ANG(8, 2, ssse3);
110
-DECL_ANG(8, 3, sse4);
111
-DECL_ANG(8, 4, sse4);
112
-DECL_ANG(8, 5, sse4);
113
-DECL_ANG(8, 6, sse4);
114
-DECL_ANG(8, 7, sse4);
115
-DECL_ANG(8, 8, sse4);
116
-DECL_ANG(8, 9, sse4);
117
-DECL_ANG(8, 10, sse4);
118
-DECL_ANG(8, 11, sse4);
119
-DECL_ANG(8, 12, sse4);
120
-DECL_ANG(8, 13, sse4);
121
-DECL_ANG(8, 14, sse4);
122
-DECL_ANG(8, 15, sse4);
123
-DECL_ANG(8, 16, sse4);
124
-DECL_ANG(8, 17, sse4);
125
-DECL_ANG(8, 18, sse4);
126
-DECL_ANG(8, 19, sse4);
127
-DECL_ANG(8, 20, sse4);
128
-DECL_ANG(8, 21, sse4);
129
-DECL_ANG(8, 22, sse4);
130
-DECL_ANG(8, 23, sse4);
131
-DECL_ANG(8, 24, sse4);
132
-DECL_ANG(8, 25, sse4);
133
-DECL_ANG(8, 26, sse4);
134
-DECL_ANG(8, 27, sse4);
135
-DECL_ANG(8, 28, sse4);
136
-DECL_ANG(8, 29, sse4);
137
-DECL_ANG(8, 30, sse4);
138
-DECL_ANG(8, 31, sse4);
139
-DECL_ANG(8, 32, sse4);
140
-DECL_ANG(8, 33, sse4);
141
+FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
142
+FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
143
+FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
144
 
145
-DECL_ANG(16, 2, ssse3);
146
-DECL_ANG(16, 3, sse4);
147
-DECL_ANG(16, 4, sse4);
148
-DECL_ANG(16, 5, sse4);
149
-DECL_ANG(16, 6, sse4);
150
-DECL_ANG(16, 7, sse4);
151
-DECL_ANG(16, 8, sse4);
152
-DECL_ANG(16, 9, sse4);
153
-DECL_ANG(16, 10, sse4);
154
-DECL_ANG(16, 11, sse4);
155
-DECL_ANG(16, 12, sse4);
156
-DECL_ANG(16, 13, sse4);
157
-DECL_ANG(16, 14, sse4);
158
-DECL_ANG(16, 15, sse4);
159
-DECL_ANG(16, 16, sse4);
160
-DECL_ANG(16, 17, sse4);
161
-DECL_ANG(16, 18, sse4);
162
-DECL_ANG(16, 19, sse4);
163
-DECL_ANG(16, 20, sse4);
164
-DECL_ANG(16, 21, sse4);
165
-DECL_ANG(16, 22, sse4);
166
-DECL_ANG(16, 23, sse4);
167
-DECL_ANG(16, 24, sse4);
168
-DECL_ANG(16, 25, sse4);
169
-DECL_ANG(16, 26, sse4);
170
-DECL_ANG(16, 27, sse4);
171
-DECL_ANG(16, 28, sse4);
172
-DECL_ANG(16, 29, sse4);
173
-DECL_ANG(16, 30, sse4);
174
-DECL_ANG(16, 31, sse4);
175
-DECL_ANG(16, 32, sse4);
176
-DECL_ANG(16, 33, sse4);
177
+FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
178
+FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
179
+FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
180
 
181
-DECL_ANG(32, 2, ssse3);
182
-DECL_ANG(32, 3, sse4);
183
-DECL_ANG(32, 4, sse4);
184
-DECL_ANG(32, 5, sse4);
185
-DECL_ANG(32, 6, sse4);
186
-DECL_ANG(32, 7, sse4);
187
-DECL_ANG(32, 8, sse4);
188
-DECL_ANG(32, 9, sse4);
189
-DECL_ANG(32, 10, sse4);
190
-DECL_ANG(32, 11, sse4);
191
-DECL_ANG(32, 12, sse4);
192
-DECL_ANG(32, 13, sse4);
193
-DECL_ANG(32, 14, sse4);
194
-DECL_ANG(32, 15, sse4);
195
-DECL_ANG(32, 16, sse4);
196
-DECL_ANG(32, 17, sse4);
197
-DECL_ANG(32, 18, sse4);
198
-DECL_ANG(32, 19, sse4);
199
-DECL_ANG(32, 20, sse4);
200
-DECL_ANG(32, 21, sse4);
201
x265_1.7.tar.gz/source/common/x86/intrapred16.asm -> x265_1.8.tar.gz/source/common/x86/intrapred16.asm Changed
201
 
1
@@ -35,39 +35,52 @@
2
 %assign x x+1
3
 %endrep
4
 
5
-const shuf_mode_13_23,      db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
6
-const shuf_mode_14_22,      db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
7
-const shuf_mode_15_21,      db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
8
-const shuf_mode_16_20,      db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
9
-const shuf_mode_17_19,      db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
10
-const shuf_mode32_18,       db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
11
-const pw_punpcklwd,         db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
12
-const c_mode32_10_0,        db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
13
-
14
-const pw_unpackwdq, times 8 db 0,1
15
-const pw_ang8_12,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
16
-const pw_ang8_13,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
17
-const pw_ang8_14,   db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
18
-const pw_ang8_15,   db 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 8, 9, 4, 5, 0, 1
19
-const pw_ang8_16,   db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1
20
-const pw_ang8_17,   db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1
21
-const pw_swap16,    db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
22
+const ang_table_avx2
23
+%assign x 0
24
+%rep 32
25
+    times 8 dw (32-x), x
26
+%assign x x+1
27
+%endrep
28
 
29
-const pw_ang16_13,   db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
30
-const pw_ang16_16,   db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
31
+const pw_ang16_12_24,               db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 14, 15,  0,  1,  0,  1
32
+const pw_ang16_13_23,               db  2,  3,  2,  3, 14, 15, 14, 15,  6,  7,  6,  7,  0,  1,  0,  1
33
+const pw_ang16_14_22,               db  2,  3,  2,  3, 10, 11, 10, 11,  6,  7,  6,  7,  0,  1,  0,  1
34
+const pw_ang16_15_21,               db 12, 13, 12, 13,  8,  9,  8,  9,  4,  5,  4,  5,  0,  1,  0,  1
35
+const pw_ang16_16_20,               db  8,  9,  8,  9,  6,  7,  6,  7,  2,  3,  2,  3,  0,  1,  0,  1
36
+
37
+const pw_ang32_12_24,               db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
38
+const pw_ang32_13_23,               db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  6,  7,  0,  1
39
+const pw_ang32_14_22,               db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 11,  6,  7,  0,  1
40
+const pw_ang32_15_21,               db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
41
+const pw_ang32_16_20,               db  0,  0,  0,  0,  0,  0,  0,  0,  8,  9,  6,  7,  2,  3,  0,  1
42
+const pw_ang32_17_19_0,             db  0,  0,  0,  0, 12, 13, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
43
+
44
+const shuf_mode_13_23,              db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
45
+const shuf_mode_14_22,              db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
46
+const shuf_mode_15_21,              db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
47
+const shuf_mode_16_20,              db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
48
+const shuf_mode_17_19,              db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
49
+const shuf_mode32_18,               db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
50
+const pw_punpcklwd,                 db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
51
+const c_mode32_10_0,                db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
52
+
53
+const pw_ang8_12,                   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  0,  1
54
+const pw_ang8_13,                   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  8,  9,  0,  1
55
+const pw_ang8_14,                   db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 10, 11,  4,  5,  0,  1
56
+const pw_ang8_15,                   db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
57
+const pw_ang8_16,                   db  0,  0,  0,  0,  0,  0, 12, 13, 10, 11,  6,  7,  4,  5,  0,  1
58
+const pw_ang8_17,                   db  0,  0, 14, 15, 12, 13, 10, 11,  8,  9,  4,  5,  2,  3,  0,  1
59
+const pw_swap16,            times 2 db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
60
+
61
+const pw_ang16_13,                  db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
62
+const pw_ang16_16,                  db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
63
+
64
+intra_filter4_shuf0:                db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
65
+intra_filter4_shuf1:                db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
66
+intra_filter4_shuf2:        times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
67
 
68
 ;; (blkSize - 1 - x)
69
-pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
70
-pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
71
-pw_planar8_0:         dw 7,  6,  5,  4,  3,  2,  1,  0
72
-pw_planar8_1:         dw 7,  7,  7,  7,  7,  7,  7,  7
73
-pw_planar16_0:        dw 15, 14, 13, 12, 11, 10,  9, 8
74
-pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
75
-pd_planar32_1:        dd 31, 31, 31, 31
76
-
77
-pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
78
-pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
79
-pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
80
+pw_planar4_0:                       dw  3,  2,  1,  0,  3,  2,  1,  0
81
 
82
 const planar32_table
83
 %assign x 31
84
@@ -85,16 +98,22 @@
85
 
86
 SECTION .text
87
 
88
+cextern pb_01
89
 cextern pw_1
90
 cextern pw_2
91
+cextern pw_3
92
+cextern pw_7
93
 cextern pw_4
94
 cextern pw_8
95
+cextern pw_15
96
 cextern pw_16
97
+cextern pw_31
98
 cextern pw_32
99
-cextern pw_1023
100
 cextern pd_16
101
+cextern pd_31
102
 cextern pd_32
103
 cextern pw_4096
104
+cextern pw_pixel_max
105
 cextern multiL
106
 cextern multiH
107
 cextern multiH2
108
@@ -103,6 +122,8 @@
109
 cextern pw_swap
110
 cextern pb_unpackwq1
111
 cextern pb_unpackwq2
112
+cextern pw_planar16_mul
113
+cextern pw_planar32_mul
114
 
115
 ;-----------------------------------------------------------------------------------
116
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
117
@@ -121,7 +142,7 @@
118
     test        r4d,            r4d
119
 
120
     paddw       m0,             [pw_4]
121
-    psraw       m0,             3
122
+    psrlw       m0,             3
123
 
124
     ; store DC 4x4
125
     movh        [r0],           m0
126
@@ -140,7 +161,7 @@
127
     ; filter top
128
     movh        m1,             [r2 + 2]
129
     paddw       m1,             m0
130
-    psraw       m1,             2
131
+    psrlw       m1,             2
132
     movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
133
 
134
     ; filter top-left
135
@@ -155,7 +176,7 @@
136
     ; filter left
137
     movu        m1,             [r2 + 20]
138
     paddw       m1,             m0
139
-    psraw       m1,             2
140
+    psrlw       m1,             2
141
     movd        r3d,            m1
142
     mov         [r0 + r1 * 2],  r3w
143
     shr         r3d,            16
144
@@ -181,7 +202,7 @@
145
     pmaddwd         m0,            [pw_1]
146
 
147
     paddw           m0,            [pw_8]
148
-    psraw           m0,            4              ; sum = sum / 16
149
+    psrlw           m0,            4              ; sum = sum / 16
150
     pshuflw         m0,            m0, 0
151
     pshufd          m0,            m0, 0          ; m0 = word [dc_val ...]
152
 
153
@@ -214,7 +235,7 @@
154
     ; filter top
155
     movu            m0,            [r2 + 2]
156
     paddw           m0,            m1
157
-    psraw           m0,            2
158
+    psrlw           m0,            2
159
     movu            [r0],          m0
160
 
161
     ; filter top-left
162
@@ -229,7 +250,7 @@
163
     ; filter left
164
     movu            m0,            [r2 + 36]
165
     paddw           m0,            m1
166
-    psraw           m0,            2
167
+    psrlw           m0,            2
168
     movh            r3,            m0
169
     mov             [r0 + r1 * 2], r3w
170
     shr             r3,            16
171
@@ -263,14 +284,10 @@
172
     paddw           m0,                  m1
173
     paddw           m2,                  m3
174
     paddw           m0,                  m2
175
-    movhlps         m1,                  m0
176
-    paddw           m0,                  m1
177
-    pshuflw         m1,                  m0, 0x6E
178
-    paddw           m0,                  m1
179
-    pmaddwd         m0,                  [pw_1]
180
+    HADDUW          m0,                  m1
181
+    paddd           m0,                  [pd_16]
182
+    psrld           m0,                  5
183
 
184
-    paddw           m0,                  [pw_16]
185
-    psraw           m0,                  5
186
     movd            r5d,                 m0
187
     pshuflw         m0,                  m0, 0 ; m0 = word [dc_val ...]
188
     pshufd          m0,                  m0, 0
189
@@ -326,11 +343,11 @@
190
     ; filter top
191
     movu            m2,                  [r2 + 2]
192
     paddw           m2,                  m1
193
-    psraw           m2,                  2
194
+    psrlw           m2,                  2
195
     movu            [r0],                m2
196
     movu            m3,                  [r2 + 18]
197
     paddw           m3,                  m1
198
-    psraw           m3,                  2
199
+    psrlw           m3,                  2
200
     movu            [r0 + 16],           m3
201
x265_1.7.tar.gz/source/common/x86/intrapred8.asm -> x265_1.8.tar.gz/source/common/x86/intrapred8.asm Changed
201
 
1
@@ -30,6 +30,10 @@
2
 intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
3
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
4
 
5
+intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
6
+intra_filter4_shuf1:  times 2 db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
7
+intra_filter4_shuf2:  times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
8
+
9
 pb_0_8        times 8 db  0,  8
10
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
11
 pb_swap8:     times 2 db  7,  6,  5,  4,  3,  2,  1,  0
12
@@ -191,16 +195,6 @@
13
 intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
14
 
15
 ALIGN 32
16
-c_ang16_mode_8:       db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
17
-                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
18
-                      db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
19
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
20
-                      db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
21
-                      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
22
-                      db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
23
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
24
-
25
-ALIGN 32
26
 c_ang16_mode_29:     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9,  14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
27
                      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
28
                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
29
@@ -212,16 +206,6 @@
30
                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
31
 
32
 ALIGN 32
33
-c_ang16_mode_7:      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
34
-                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
35
-                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
36
-                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
37
-                     db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
38
-                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
39
-                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
40
-                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
41
-
42
-ALIGN 32
43
 c_ang16_mode_30:      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
44
                       db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
45
                       db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
46
@@ -232,18 +216,6 @@
47
                       db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
48
                       db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
49
 
50
-
51
-
52
-ALIGN 32
53
-c_ang16_mode_6:       db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
54
-                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
55
-                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
56
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
57
-                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
58
-                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
59
-                      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
60
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
61
-
62
 ALIGN 32
63
 c_ang16_mode_31:      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
64
                       db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
65
@@ -255,66 +227,6 @@
66
                       db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
67
                       db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
68
 
69
-
70
-ALIGN 32
71
-c_ang16_mode_5:       db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
72
-                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
73
-                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
74
-                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
75
-                      db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
76
-                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
77
-                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
78
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
79
-
80
-ALIGN 32
81
-c_ang16_mode_32:      db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
82
-                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
83
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
84
-                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
85
-                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
86
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
87
-                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
88
-                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
89
-                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
90
-                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
91
-                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
92
-
93
-ALIGN 32
94
-c_ang16_mode_4:       db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
95
-                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
96
-                      db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
97
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
98
-                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
99
-                      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
100
-                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
101
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
102
-
103
-ALIGN 32
104
-c_ang16_mode_33:     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
105
-                     db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
106
-                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
107
-                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
108
-                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
109
-                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
110
-                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
111
-                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
112
-                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
113
-                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
114
-                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
115
-                     db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
116
-                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
117
-                     db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
118
-
119
-ALIGN 32
120
-c_ang16_mode_3:      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
121
-                     db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
122
-                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
123
-                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
124
-                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
125
-                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
126
-                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
127
-                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
128
-
129
 ALIGN 32
130
 c_ang16_mode_24:     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
131
                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
132
@@ -476,38 +388,6 @@
133
                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
134
                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
135
 
136
-
137
-ALIGN 32
138
-c_ang32_mode_33:   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
139
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
140
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
141
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
142
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
143
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
144
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
145
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
146
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
147
-                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
148
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
149
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
150
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
151
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
152
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
153
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
154
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
155
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
156
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
157
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
158
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
159
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
160
-                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
161
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
162
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
163
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
164
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
165
-
166
-
167
-
168
 ALIGN 32
169
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
170
                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
171
@@ -526,8 +406,6 @@
172
                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
173
                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
174
 
175
-
176
-
177
 ALIGN 32
178
 c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
179
                    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
180
@@ -664,15 +542,6 @@
181
 ALIGN 32
182
 ;; (blkSize - 1 - x)
183
 pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
184
-pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
185
-pw_planar8_0:         dw 7,  6,  5,  4,  3,  2,  1,  0
186
-pw_planar8_1:         dw 7,  7,  7,  7,  7,  7,  7,  7
187
-pw_planar16_0:        dw 15, 14, 13, 12, 11, 10, 9,  8
188
-pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
189
-pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
190
-pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
191
-pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
192
-
193
 ALIGN 32
194
 c_ang8_mode_13:       db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
195
                       db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
196
@@ -704,6 +573,13 @@
197
 %assign x x+1
198
 %endrep
199
 
200
+const ang_table_avx2
201
x265_1.7.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.8.tar.gz/source/common/x86/ipfilter16.asm Changed
201
 
1
@@ -3,6 +3,7 @@
2
 ;*
3
 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
4
 ;*          Murugan Vairavel <murugan@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -25,10 +26,28 @@
10
 %include "x86inc.asm"
11
 %include "x86util.asm"
12
 
13
+
14
+%define INTERP_OFFSET_PP        pd_32
15
+%define INTERP_SHIFT_PP         6
16
+
17
+%if BIT_DEPTH == 10
18
+    %define INTERP_SHIFT_PS         2
19
+    %define INTERP_OFFSET_PS        pd_n32768
20
+    %define INTERP_SHIFT_SP         10
21
+    %define INTERP_OFFSET_SP        pd_524800
22
+%elif BIT_DEPTH == 12
23
+    %define INTERP_SHIFT_PS         4
24
+    %define INTERP_OFFSET_PS        pd_n131072
25
+    %define INTERP_SHIFT_SP         8
26
+    %define INTERP_OFFSET_SP        pd_524416
27
+%else
28
+    %error Unsupport bit depth!
29
+%endif
30
+
31
+
32
 SECTION_RODATA 32
33
 
34
-tab_c_32:         times 4 dd 32
35
-tab_c_n32768:     times 4 dd -32768
36
+tab_c_32:         times 8 dd 32
37
 tab_c_524800:     times 4 dd 524800
38
 tab_c_n8192:      times 8 dw -8192
39
 pd_524800:        times 8 dd 524800
40
@@ -44,29 +63,53 @@
41
                   dw -2, 16, 54, -4
42
                   dw -2, 10, 58, -2
43
 
44
-tab_ChromaCoeffV: times 4 dw 0, 64
45
-                  times 4 dw 0, 0
46
+const tab_ChromaCoeffV,  times 8 dw 0, 64
47
+                         times 8 dw 0, 0
48
+
49
+                         times 8 dw -2, 58
50
+                         times 8 dw 10, -2
51
+
52
+                         times 8 dw -4, 54
53
+                         times 8 dw 16, -2
54
+
55
+                         times 8 dw -6, 46
56
+                         times 8 dw 28, -4
57
+
58
+                         times 8 dw -4, 36
59
+                         times 8 dw 36, -4
60
 
61
-                  times 4 dw -2, 58
62
-                  times 4 dw 10, -2
63
+                         times 8 dw -4, 28
64
+                         times 8 dw 46, -6
65
 
66
-                  times 4 dw -4, 54
67
-                  times 4 dw 16, -2
68
+                         times 8 dw -2, 16
69
+                         times 8 dw 54, -4
70
 
71
-                  times 4 dw -6, 46 
72
-                  times 4 dw 28, -4
73
+                         times 8 dw -2, 10
74
+                         times 8 dw 58, -2
75
 
76
-                  times 4 dw -4, 36
77
-                  times 4 dw 36, -4
78
+tab_ChromaCoeffVer: times 8 dw 0, 64
79
+                    times 8 dw 0, 0
80
 
81
-                  times 4 dw -4, 28
82
-                  times 4 dw 46, -6
83
+                    times 8 dw -2, 58
84
+                    times 8 dw 10, -2
85
 
86
-                  times 4 dw -2, 16
87
-                  times 4 dw 54, -4
88
+                    times 8 dw -4, 54
89
+                    times 8 dw 16, -2
90
 
91
-                  times 4 dw -2, 10
92
-                  times 4 dw 58, -2
93
+                    times 8 dw -6, 46
94
+                    times 8 dw 28, -4
95
+
96
+                    times 8 dw -4, 36
97
+                    times 8 dw 36, -4
98
+
99
+                    times 8 dw -4, 28
100
+                    times 8 dw 46, -6
101
+
102
+                    times 8 dw -2, 16
103
+                    times 8 dw 54, -4
104
+
105
+                    times 8 dw -2, 10
106
+                    times 8 dw 58, -2
107
 
108
 tab_LumaCoeff:    dw   0, 0,  0,  64,  0,   0,  0,  0
109
                   dw  -1, 4, -10, 58,  17, -5,  1,  0
110
@@ -115,11 +158,1024 @@
111
 
112
 const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7
113
 
114
+const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
115
+                            db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
116
+
117
+const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
118
+                db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
119
+
120
+
121
 SECTION .text
122
+cextern pd_8
123
 cextern pd_32
124
 cextern pw_pixel_max
125
+cextern pd_524416
126
 cextern pd_n32768
127
+cextern pd_n131072
128
 cextern pw_2000
129
+cextern idct8_shuf2
130
+
131
+%macro FILTER_LUMA_HOR_4_sse2 1
132
+    movu        m4,     [r0 + %1]       ; m4 = src[0-7]
133
+    movu        m5,     [r0 + %1 + 2]   ; m5 = src[1-8]
134
+    pmaddwd     m4,     m0
135
+    pmaddwd     m5,     m0
136
+    pshufd      m2,     m4,     q2301
137
+    paddd       m4,     m2
138
+    pshufd      m2,     m5,     q2301
139
+    paddd       m5,     m2
140
+    pshufd      m4,     m4,     q3120
141
+    pshufd      m5,     m5,     q3120
142
+    punpcklqdq  m4,     m5
143
+
144
+    movu        m5,     [r0 + %1 + 4]   ; m5 = src[2-9]
145
+    movu        m3,     [r0 + %1 + 6]   ; m3 = src[3-10]
146
+    pmaddwd     m5,     m0
147
+    pmaddwd     m3,     m0
148
+    pshufd      m2,     m5,     q2301
149
+    paddd       m5,     m2
150
+    pshufd      m2,     m3,     q2301
151
+    paddd       m3,     m2
152
+    pshufd      m5,     m5,     q3120
153
+    pshufd      m3,     m3,     q3120
154
+    punpcklqdq  m5,     m3
155
+
156
+    pshufd      m2,     m4,     q2301
157
+    paddd       m4,     m2
158
+    pshufd      m2,     m5,     q2301
159
+    paddd       m5,     m2
160
+    pshufd      m4,     m4,     q3120
161
+    pshufd      m5,     m5,     q3120
162
+    punpcklqdq  m4,     m5
163
+    paddd       m4,     m1
164
+%endmacro
165
+
166
+%macro FILTER_LUMA_HOR_8_sse2 1
167
+    movu        m4,     [r0 + %1]       ; m4 = src[0-7]
168
+    movu        m5,     [r0 + %1 + 2]   ; m5 = src[1-8]
169
+    pmaddwd     m4,     m0
170
+    pmaddwd     m5,     m0
171
+    pshufd      m2,     m4,     q2301
172
+    paddd       m4,     m2
173
+    pshufd      m2,     m5,     q2301
174
+    paddd       m5,     m2
175
+    pshufd      m4,     m4,     q3120
176
+    pshufd      m5,     m5,     q3120
177
+    punpcklqdq  m4,     m5
178
+
179
+    movu        m5,     [r0 + %1 + 4]   ; m5 = src[2-9]
180
+    movu        m3,     [r0 + %1 + 6]   ; m3 = src[3-10]
181
+    pmaddwd     m5,     m0
182
+    pmaddwd     m3,     m0
183
+    pshufd      m2,     m5,     q2301
184
+    paddd       m5,     m2
185
+    pshufd      m2,     m3,     q2301
186
+    paddd       m3,     m2
187
+    pshufd      m5,     m5,     q3120
188
+    pshufd      m3,     m3,     q3120
189
+    punpcklqdq  m5,     m3
190
+
191
+    pshufd      m2,     m4,     q2301
192
+    paddd       m4,     m2
193
+    pshufd      m2,     m5,     q2301
194
+    paddd       m5,     m2
195
+    pshufd      m4,     m4,     q3120
196
+    pshufd      m5,     m5,     q3120
197
+    punpcklqdq  m4,     m5
198
+    paddd       m4,     m1
199
+
200
+    movu        m5,     [r0 + %1 + 8]   ; m5 = src[4-11]
201
x265_1.7.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.8.tar.gz/source/common/x86/ipfilter8.asm Changed
201
 
1
@@ -301,6 +301,7 @@
2
 cextern pw_32
3
 cextern pw_512
4
 cextern pw_2000
5
+cextern pw_8192
6
 
7
 %macro FILTER_H4_w2_2_sse2 0
8
     pxor        m3, m3
9
@@ -330,80 +331,38 @@
10
 %endmacro
11
 
12
 ;-----------------------------------------------------------------------------
13
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
14
-;-----------------------------------------------------------------------------
15
-INIT_XMM sse3
16
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
17
-    mov         r4d,        r4m
18
-    mova        m5,         [pw_32]
19
-
20
-%ifdef PIC
21
-    lea         r5,          [tabw_ChromaCoeff]
22
-    movddup     m4,         [r5 + r4 * 8]
23
-%else
24
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
25
-%endif
26
-
27
-    FILTER_H4_w2_2_sse2
28
-    lea         srcq,       [srcq + srcstrideq * 2]
29
-    lea         dstq,       [dstq + dststrideq * 2]
30
-    FILTER_H4_w2_2_sse2
31
-
32
-    RET
33
-
34
-;-----------------------------------------------------------------------------
35
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
36
+; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
37
 ;-----------------------------------------------------------------------------
38
+%macro FILTER_H4_W2xN_sse3 1
39
 INIT_XMM sse3
40
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
41
-    mov         r4d,        r4m
42
-    mova        m5,         [pw_32]
43
+cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride
44
+    mov         r4d,    r4m
45
+    mova        m5,     [pw_32]
46
 
47
 %ifdef PIC
48
-    lea         r5,          [tabw_ChromaCoeff]
49
-    movddup     m4,         [r5 + r4 * 8]
50
+    lea         r5,     [tabw_ChromaCoeff]
51
+    movddup     m4,     [r5 + r4 * 8]
52
 %else
53
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
54
+    movddup     m4,     [tabw_ChromaCoeff + r4 * 8]
55
 %endif
56
 
57
 %assign x 1
58
-%rep 4
59
+%rep %1/2
60
     FILTER_H4_w2_2_sse2
61
-%if x < 4
62
-    lea         srcq,       [srcq + srcstrideq * 2]
63
-    lea         dstq,       [dstq + dststrideq * 2]
64
+%if x < %1/2
65
+    lea         srcq,   [srcq + srcstrideq * 2]
66
+    lea         dstq,   [dstq + dststrideq * 2]
67
 %endif
68
 %assign x x+1
69
 %endrep
70
 
71
     RET
72
 
73
-;-----------------------------------------------------------------------------
74
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
75
-;-----------------------------------------------------------------------------
76
-INIT_XMM sse3
77
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6, src, srcstride, dst, dststride
78
-    mov         r4d,        r4m
79
-    mova        m5,         [pw_32]
80
-
81
-%ifdef PIC
82
-    lea         r5,         [tabw_ChromaCoeff]
83
-    movddup     m4,         [r5 + r4 * 8]
84
-%else
85
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
86
-%endif
87
-
88
-%assign x 1
89
-%rep 8
90
-    FILTER_H4_w2_2_sse2
91
-%if x < 8
92
-    lea         srcq,       [srcq + srcstrideq * 2]
93
-    lea         dstq,       [dstq + dststrideq * 2]
94
-%endif
95
-%assign x x+1
96
-%endrep
97
+%endmacro
98
 
99
-    RET
100
+    FILTER_H4_W2xN_sse3 4
101
+    FILTER_H4_W2xN_sse3 8
102
+    FILTER_H4_W2xN_sse3 16
103
 
104
 %macro FILTER_H4_w4_2_sse2 0
105
     pxor        m5, m5
106
@@ -447,143 +406,41 @@
107
 %endmacro
108
 
109
 ;-----------------------------------------------------------------------------
110
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
111
-;-----------------------------------------------------------------------------
112
-INIT_XMM sse3
113
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 8, src, srcstride, dst, dststride
114
-    mov         r4d,        r4m
115
-    mova        m7,         [pw_32]
116
-
117
-%ifdef PIC
118
-    lea         r5,         [tabw_ChromaCoeff]
119
-    movddup     m4,         [r5 + r4 * 8]
120
-%else
121
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
122
-%endif
123
-
124
-    FILTER_H4_w4_2_sse2
125
-
126
-    RET
127
-
128
-;-----------------------------------------------------------------------------
129
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
130
-;-----------------------------------------------------------------------------
131
-INIT_XMM sse3
132
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 8, src, srcstride, dst, dststride
133
-    mov         r4d,        r4m
134
-    mova        m7,         [pw_32]
135
-
136
-%ifdef PIC
137
-    lea         r5,         [tabw_ChromaCoeff]
138
-    movddup     m4,         [r5 + r4 * 8]
139
-%else
140
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
141
-%endif
142
-
143
-    FILTER_H4_w4_2_sse2
144
-    lea         srcq,       [srcq + srcstrideq * 2]
145
-    lea         dstq,       [dstq + dststrideq * 2]
146
-    FILTER_H4_w4_2_sse2
147
-
148
-    RET
149
-
150
-;-----------------------------------------------------------------------------
151
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
152
-;-----------------------------------------------------------------------------
153
-INIT_XMM sse3
154
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 8, src, srcstride, dst, dststride
155
-    mov         r4d,        r4m
156
-    mova        m7,         [pw_32]
157
-
158
-%ifdef PIC
159
-    lea         r5,         [tabw_ChromaCoeff]
160
-    movddup     m4,         [r5 + r4 * 8]
161
-%else
162
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
163
-%endif
164
-
165
-%assign x 1
166
-%rep 4
167
-    FILTER_H4_w4_2_sse2
168
-%if x < 4
169
-    lea         srcq,       [srcq + srcstrideq * 2]
170
-    lea         dstq,       [dstq + dststrideq * 2]
171
-%endif
172
-%assign x x+1
173
-%endrep
174
-
175
-    RET
176
-
177
-;-----------------------------------------------------------------------------
178
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
179
-;-----------------------------------------------------------------------------
180
-INIT_XMM sse3
181
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 8, src, srcstride, dst, dststride
182
-    mov         r4d,        r4m
183
-    mova        m7,         [pw_32]
184
-
185
-%ifdef PIC
186
-    lea         r5,         [tabw_ChromaCoeff]
187
-    movddup     m4,         [r5 + r4 * 8]
188
-%else
189
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
190
-%endif
191
-
192
-%assign x 1
193
-%rep 8
194
-    FILTER_H4_w4_2_sse2
195
-%if x < 8
196
-    lea         srcq,       [srcq + srcstrideq * 2]
197
-    lea         dstq,       [dstq + dststrideq * 2]
198
-%endif
199
-%assign x x+1
200
-%endrep
201
x265_1.7.tar.gz/source/common/x86/ipfilter8.h -> x265_1.8.tar.gz/source/common/x86/ipfilter8.h Changed
201
 
1
@@ -24,912 +24,26 @@
2
 #ifndef X265_IPFILTER8_H
3
 #define X265_IPFILTER8_H
4
 
5
-#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
6
-    void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
7
-    void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
8
-    void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
9
-    void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
10
-
11
-#define LUMA_FILTERS(cpu) \
12
-    SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
13
-    SETUP_LUMA_FUNC_DEF(8,   8, cpu); \
14
-    SETUP_LUMA_FUNC_DEF(8,   4, cpu); \
15
-    SETUP_LUMA_FUNC_DEF(4,   8, cpu); \
16
-    SETUP_LUMA_FUNC_DEF(16, 16, cpu); \
17
-    SETUP_LUMA_FUNC_DEF(16,  8, cpu); \
18
-    SETUP_LUMA_FUNC_DEF(8,  16, cpu); \
19
-    SETUP_LUMA_FUNC_DEF(16, 12, cpu); \
20
-    SETUP_LUMA_FUNC_DEF(12, 16, cpu); \
21
-    SETUP_LUMA_FUNC_DEF(16,  4, cpu); \
22
-    SETUP_LUMA_FUNC_DEF(4,  16, cpu); \
23
-    SETUP_LUMA_FUNC_DEF(32, 32, cpu); \
24
-    SETUP_LUMA_FUNC_DEF(32, 16, cpu); \
25
-    SETUP_LUMA_FUNC_DEF(16, 32, cpu); \
26
-    SETUP_LUMA_FUNC_DEF(32, 24, cpu); \
27
-    SETUP_LUMA_FUNC_DEF(24, 32, cpu); \
28
-    SETUP_LUMA_FUNC_DEF(32,  8, cpu); \
29
-    SETUP_LUMA_FUNC_DEF(8,  32, cpu); \
30
-    SETUP_LUMA_FUNC_DEF(64, 64, cpu); \
31
-    SETUP_LUMA_FUNC_DEF(64, 32, cpu); \
32
-    SETUP_LUMA_FUNC_DEF(32, 64, cpu); \
33
-    SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
34
-    SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
35
-    SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
36
-    SETUP_LUMA_FUNC_DEF(16, 64, cpu)
37
-
38
-#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
39
-    void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
40
-
41
-#define LUMA_SP_FILTERS(cpu) \
42
-    SETUP_LUMA_SP_FUNC_DEF(4,   4, cpu); \
43
-    SETUP_LUMA_SP_FUNC_DEF(8,   8, cpu); \
44
-    SETUP_LUMA_SP_FUNC_DEF(8,   4, cpu); \
45
-    SETUP_LUMA_SP_FUNC_DEF(4,   8, cpu); \
46
-    SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \
47
-    SETUP_LUMA_SP_FUNC_DEF(16,  8, cpu); \
48
-    SETUP_LUMA_SP_FUNC_DEF(8,  16, cpu); \
49
-    SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \
50
-    SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \
51
-    SETUP_LUMA_SP_FUNC_DEF(16,  4, cpu); \
52
-    SETUP_LUMA_SP_FUNC_DEF(4,  16, cpu); \
53
-    SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \
54
-    SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \
55
-    SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \
56
-    SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \
57
-    SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \
58
-    SETUP_LUMA_SP_FUNC_DEF(32,  8, cpu); \
59
-    SETUP_LUMA_SP_FUNC_DEF(8,  32, cpu); \
60
-    SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \
61
-    SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \
62
-    SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \
63
-    SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \
64
-    SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \
65
-    SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \
66
-    SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
67
-
68
-#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
69
-    void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
70
-
71
-#define LUMA_SS_FILTERS(cpu) \
72
-    SETUP_LUMA_SS_FUNC_DEF(4,   4, cpu); \
73
-    SETUP_LUMA_SS_FUNC_DEF(8,   8, cpu); \
74
-    SETUP_LUMA_SS_FUNC_DEF(8,   4, cpu); \
75
-    SETUP_LUMA_SS_FUNC_DEF(4,   8, cpu); \
76
-    SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \
77
-    SETUP_LUMA_SS_FUNC_DEF(16,  8, cpu); \
78
-    SETUP_LUMA_SS_FUNC_DEF(8,  16, cpu); \
79
-    SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \
80
-    SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \
81
-    SETUP_LUMA_SS_FUNC_DEF(16,  4, cpu); \
82
-    SETUP_LUMA_SS_FUNC_DEF(4,  16, cpu); \
83
-    SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \
84
-    SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \
85
-    SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \
86
-    SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \
87
-    SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \
88
-    SETUP_LUMA_SS_FUNC_DEF(32,  8, cpu); \
89
-    SETUP_LUMA_SS_FUNC_DEF(8,  32, cpu); \
90
-    SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \
91
-    SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \
92
-    SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \
93
-    SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \
94
-    SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \
95
-    SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
96
-    SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
97
-
98
-#if HIGH_BIT_DEPTH
99
-
100
-#define SETUP_CHROMA_420_VERT_FUNC_DEF(W, H, cpu) \
101
-    void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
102
-    void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
103
-    void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
104
-    void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
105
-
106
-#define CHROMA_420_VERT_FILTERS(cpu) \
107
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 4, cpu); \
108
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
109
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
110
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
111
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 6, cpu); \
112
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 2, cpu); \
113
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
114
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
115
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
116
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 12, cpu); \
117
-    SETUP_CHROMA_420_VERT_FUNC_DEF(12, 16, cpu); \
118
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 4, cpu); \
119
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
120
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
121
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
122
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
123
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 24, cpu); \
124
-    SETUP_CHROMA_420_VERT_FUNC_DEF(24, 32, cpu); \
125
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 8, cpu); \
126
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu)
127
-
128
-#define CHROMA_420_VERT_FILTERS_SSE4(cpu) \
129
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 4, cpu); \
130
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
131
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 2, cpu); \
132
-    SETUP_CHROMA_420_VERT_FUNC_DEF(6, 8, cpu);
133
-
134
-#define CHROMA_422_VERT_FILTERS(cpu) \
135
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
136
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
137
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
138
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
139
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 12, cpu); \
140
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
141
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
142
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
143
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu); \
144
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 24, cpu); \
145
-    SETUP_CHROMA_420_VERT_FUNC_DEF(12, 32, cpu); \
146
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
147
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 32, cpu); \
148
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 64, cpu); \
149
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
150
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 64, cpu); \
151
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 48, cpu); \
152
-    SETUP_CHROMA_420_VERT_FUNC_DEF(24, 64, cpu); \
153
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
154
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 64, cpu);
155
-
156
-#define CHROMA_422_VERT_FILTERS_SSE4(cpu) \
157
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
158
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 16, cpu); \
159
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 4, cpu); \
160
-    SETUP_CHROMA_420_VERT_FUNC_DEF(6, 16, cpu);
161
-
162
-#define CHROMA_444_VERT_FILTERS(cpu) \
163
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
164
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
165
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
166
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
167
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
168
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
169
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 12, cpu); \
170
-    SETUP_CHROMA_420_VERT_FUNC_DEF(12, 16, cpu); \
171
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 4, cpu); \
172
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
173
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
174
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
175
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
176
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 24, cpu); \
177
-    SETUP_CHROMA_420_VERT_FUNC_DEF(24, 32, cpu); \
178
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 8, cpu); \
179
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu); \
180
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 64, cpu); \
181
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 32, cpu); \
182
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 64, cpu); \
183
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 48, cpu); \
184
-    SETUP_CHROMA_420_VERT_FUNC_DEF(48, 64, cpu); \
185
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 16, cpu); \
186
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 64, cpu)
187
-
188
-#define SETUP_CHROMA_420_HORIZ_FUNC_DEF(W, H, cpu) \
189
-    void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
190
-    void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
191
-
192
-#define CHROMA_420_HORIZ_FILTERS(cpu) \
193
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 4, cpu); \
194
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 2, cpu); \
195
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(2, 4, cpu); \
196
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 8, cpu); \
197
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 4, cpu); \
198
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 8, cpu); \
199
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 6, cpu); \
200
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(6, 8, cpu); \
201
x265_1.7.tar.gz/source/common/x86/loopfilter.asm -> x265_1.8.tar.gz/source/common/x86/loopfilter.asm Changed
201
 
1
@@ -29,6 +29,7 @@
2
 
3
 SECTION_RODATA 32
4
 pb_31:      times 32 db 31
5
+pb_124:     times 32 db 124
6
 pb_15:      times 32 db 15
7
 pb_movemask_32:  times 32 db 0x00
8
                  times 32 db 0xFF
9
@@ -38,13 +39,118 @@
10
 cextern pb_128
11
 cextern pb_2
12
 cextern pw_2
13
+cextern pw_pixel_max
14
 cextern pb_movemask
15
+cextern pw_1
16
+cextern hmul_16p
17
+cextern pb_4
18
 
19
 
20
 ;============================================================================================================
21
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
22
 ;============================================================================================================
23
 INIT_XMM sse4
24
+%if HIGH_BIT_DEPTH
25
+cglobal saoCuOrgE0, 4,5,9
26
+    mov         r4d, r4m
27
+    movh        m6,  [r1]
28
+    movzx       r1d, byte [r3]
29
+    pxor        m5, m5
30
+    neg         r1b
31
+    movd        m0, r1d
32
+    lea         r1, [r0 + r4 * 2]
33
+    mov         r4d, r2d
34
+
35
+.loop:
36
+    movu        m7, [r0]
37
+    movu        m8, [r0 + 16]
38
+    movu        m2, [r0 + 2]
39
+    movu        m1, [r0 + 18]
40
+
41
+    pcmpgtw     m3, m7, m2
42
+    pcmpgtw     m2, m7
43
+    pcmpgtw     m4, m8, m1
44
+    pcmpgtw     m1, m8 
45
+
46
+    packsswb    m3, m4
47
+    packsswb    m2, m1
48
+
49
+    pand        m3, [pb_1]
50
+    por         m3, m2
51
+
52
+    palignr     m2, m3, m5, 15
53
+    por         m2, m0
54
+
55
+    mova        m4, [pw_pixel_max]
56
+    psignb      m2, [pb_128]                ; m2 = signLeft
57
+    pxor        m0, m0
58
+    palignr     m0, m3, 15
59
+    paddb       m3, m2
60
+    paddb       m3, [pb_2]                  ; m2 = uiEdgeType
61
+    pshufb      m2, m6, m3
62
+    pmovsxbw    m3, m2                      ; offsetEo
63
+    punpckhbw   m2, m2
64
+    psraw       m2, 8
65
+    paddw       m7, m3
66
+    paddw       m8, m2
67
+    pmaxsw      m7, m5
68
+    pmaxsw      m8, m5
69
+    pminsw      m7, m4
70
+    pminsw      m8, m4
71
+    movu        [r0], m7
72
+    movu        [r0 + 16], m8
73
+
74
+    add         r0q, 32
75
+    sub         r2d, 16
76
+    jnz        .loop
77
+
78
+    movzx       r3d, byte [r3 + 1]
79
+    neg         r3b
80
+    movd        m0, r3d
81
+.loopH:
82
+    movu        m7, [r1]
83
+    movu        m8, [r1 + 16]
84
+    movu        m2, [r1 + 2]
85
+    movu        m1, [r1 + 18]
86
+
87
+    pcmpgtw     m3, m7, m2
88
+    pcmpgtw     m2, m7
89
+    pcmpgtw     m4, m8, m1
90
+    pcmpgtw     m1, m8 
91
+
92
+    packsswb    m3, m4
93
+    packsswb    m2, m1
94
+
95
+    pand        m3, [pb_1]
96
+    por         m3, m2
97
+
98
+    palignr     m2, m3, m5, 15
99
+    por         m2, m0
100
+
101
+    mova        m4, [pw_pixel_max]
102
+    psignb      m2, [pb_128]                ; m2 = signLeft
103
+    pxor        m0, m0
104
+    palignr     m0, m3, 15
105
+    paddb       m3, m2
106
+    paddb       m3, [pb_2]                  ; m2 = uiEdgeType
107
+    pshufb      m2, m6, m3
108
+    pmovsxbw    m3, m2                      ; offsetEo
109
+    punpckhbw   m2, m2
110
+    psraw       m2, 8
111
+    paddw       m7, m3
112
+    paddw       m8, m2
113
+    pmaxsw      m7, m5
114
+    pmaxsw      m8, m5
115
+    pminsw      m7, m4
116
+    pminsw      m8, m4
117
+    movu        [r1], m7
118
+    movu        [r1 + 16], m8
119
+
120
+    add         r1q, 32
121
+    sub         r4d, 16
122
+    jnz        .loopH
123
+    RET
124
+%else ; HIGH_BIT_DEPTH
125
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
126
 
127
     mov         r4d, r4m
128
@@ -130,8 +236,70 @@
129
     sub         r4d, 16
130
     jnz        .loopH
131
     RET
132
+%endif
133
 
134
 INIT_YMM avx2
135
+%if HIGH_BIT_DEPTH
136
+cglobal saoCuOrgE0, 4,4,9
137
+    vbroadcasti128  m6, [r1]
138
+    movzx           r1d, byte [r3]
139
+    neg             r1b
140
+    movd            xm0, r1d
141
+    movzx           r1d, byte [r3 + 1]
142
+    neg             r1b
143
+    movd            xm1, r1d
144
+    vinserti128     m0, m0, xm1, 1
145
+    mova            m5, [pw_pixel_max]
146
+    mov             r1d, r4m
147
+    add             r1d, r1d
148
+    shr             r2d, 4
149
+
150
+.loop:
151
+    movu            m7, [r0]
152
+    movu            m8, [r0 + r1]
153
+    movu            m2, [r0 + 2]
154
+    movu            m1, [r0 + r1 + 2]
155
+
156
+    pcmpgtw         m3, m7, m2
157
+    pcmpgtw         m2, m7
158
+    pcmpgtw         m4, m8, m1
159
+    pcmpgtw         m1, m8
160
+
161
+    packsswb        m3, m4
162
+    packsswb        m2, m1
163
+    vpermq          m3, m3, 11011000b
164
+    vpermq          m2, m2, 11011000b
165
+
166
+    pand            m3, [pb_1]
167
+    por             m3, m2
168
+
169
+    pslldq          m2, m3, 1
170
+    por             m2, m0
171
+
172
+    psignb          m2, [pb_128]                ; m2 = signLeft
173
+    pxor            m0, m0
174
+    palignr         m0, m3, 15
175
+    paddb           m3, m2
176
+    paddb           m3, [pb_2]                  ; m3 = uiEdgeType
177
+    pshufb          m2, m6, m3
178
+    pmovsxbw        m3, xm2                     ; offsetEo
179
+    vextracti128    xm2, m2, 1
180
+    pmovsxbw        m2, xm2
181
+    pxor            m4, m4
182
+    paddw           m7, m3
183
+    paddw           m8, m2
184
+    pmaxsw          m7, m4
185
+    pmaxsw          m8, m4
186
+    pminsw          m7, m5
187
+    pminsw          m8, m5
188
+    movu            [r0], m7
189
+    movu            [r0 + r1], m8
190
+
191
+    add             r0q, 32
192
+    dec             r2d
193
+    jnz             .loop
194
+    RET
195
+%else ; HIGH_BIT_DEPTH
196
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
197
 
198
     mov                 r4d,        r4m
199
@@ -184,11 +352,68 @@
200
     sub                 r2d,        16
201
x265_1.7.tar.gz/source/common/x86/loopfilter.h -> x265_1.8.tar.gz/source/common/x86/loopfilter.h Changed
42
 
1
@@ -25,21 +25,24 @@
2
 #ifndef X265_LOOPFILTER_H
3
 #define X265_LOOPFILTER_H
4
 
5
-void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
6
-void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
7
-void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
8
-void x265_saoCuOrgE1_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
9
-void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
10
-void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
11
-void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
12
-void x265_saoCuOrgE2_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
13
-void x265_saoCuOrgE2_32_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
14
-void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
15
-void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
16
-void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
17
-void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
18
-void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
19
-void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
20
-void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
21
+#define DECL_SAO(cpu) \
22
+    void PFX(saoCuOrgE0_ ## cpu)(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride); \
23
+    void PFX(saoCuOrgE1_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
24
+    void PFX(saoCuOrgE1_2Rows_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
25
+    void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
26
+    void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
27
+    void PFX(saoCuOrgE2_32_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
28
+    void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
29
+    void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
30
+    void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
31
+    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
32
+    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
33
+    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
34
+    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
35
+    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
36
+    void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
37
+
38
+DECL_SAO(sse4);
39
+DECL_SAO(avx2);
40
 
41
 #endif // ifndef X265_LOOPFILTER_H
42
x265_1.7.tar.gz/source/common/x86/mc-a.asm -> x265_1.8.tar.gz/source/common/x86/mc-a.asm Changed
201
 
1
@@ -32,6 +32,19 @@
2
 %include "x86inc.asm"
3
 %include "x86util.asm"
4
 
5
+%if BIT_DEPTH==8
6
+    %define ADDAVG_FACTOR       256
7
+    %define ADDAVG_ROUND        128
8
+%elif BIT_DEPTH==10
9
+    %define ADDAVG_FACTOR       1024
10
+    %define ADDAVG_ROUND        512
11
+%elif BIT_DEPTH==12
12
+    %define ADDAVG_FACTOR       4096
13
+    %define ADDAVG_ROUND        2048
14
+%else
15
+    %error Unsupport bit depth!
16
+%endif
17
+
18
 SECTION_RODATA 32
19
 
20
 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
21
@@ -54,11 +67,12 @@
22
 cextern pw_512
23
 cextern pw_1023
24
 cextern pw_1024
25
+cextern pw_2048
26
+cextern pw_4096
27
 cextern pw_00ff
28
 cextern pw_pixel_max
29
-cextern sw_64
30
 cextern pd_32
31
-cextern deinterleave_shufd
32
+cextern pd_64
33
 
34
 ;====================================================================================================================
35
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
36
@@ -93,23 +107,24 @@
37
     punpcklqdq    m1,          m2
38
     punpcklqdq    m3,          m5
39
     paddw         m1,          m3
40
-    pmulhrsw      m1,          [pw_1024]
41
-    paddw         m1,          [pw_512]
42
+    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
43
+    paddw         m1,          [pw_ %+ ADDAVG_ROUND]
44
 
45
     pxor          m0,          m0
46
     pmaxsw        m1,          m0
47
-    pminsw        m1,          [pw_1023]
48
+    pminsw        m1,          [pw_pixel_max]
49
     movd          [r2],        m1
50
     pextrd        [r2 + r5],   m1, 1
51
     lea           r2,          [r2 + 2 * r5]
52
     pextrd        [r2],        m1, 2
53
     pextrd        [r2 + r5],   m1, 3
54
-
55
     RET
56
+
57
+
58
 ;-----------------------------------------------------------------------------
59
 INIT_XMM sse4
60
 cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
61
-    mova          m0,          [pw_512]
62
+    mova          m0,          [pw_ %+ ADDAVG_ROUND]
63
     pxor          m7,          m7
64
     add           r3,          r3
65
     add           r4,          r4
66
@@ -137,11 +152,11 @@
67
     punpcklqdq    m1,          m2
68
     punpcklqdq    m3,          m5
69
     paddw         m1,          m3
70
-    pmulhrsw      m1,          [pw_1024]
71
+    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
72
     paddw         m1,          m0
73
 
74
     pmaxsw        m1,          m7
75
-    pminsw        m1,          [pw_1023]
76
+    pminsw        m1,          [pw_pixel_max]
77
     movd          [r2],        m1
78
     pextrd        [r2 + r5],   m1, 1
79
     lea           r2,          [r2 + 2 * r5]
80
@@ -157,8 +172,8 @@
81
 ;-----------------------------------------------------------------------------
82
 INIT_XMM sse4
83
 cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
84
-    mova        m6,         [pw_1023]
85
-    mova        m7,         [pw_1024]
86
+    mova        m6,         [pw_pixel_max]
87
+    mova        m7,         [pw_ %+ ADDAVG_FACTOR]
88
     mov         r6d,        16/4
89
     add         r3,         r3
90
     add         r4,         r4
91
@@ -184,7 +199,7 @@
92
     punpcklqdq  m3,         m5
93
     paddw       m1,         m3
94
     pmulhrsw    m1,         m7
95
-    paddw       m1,         [pw_512]
96
+    paddw       m1,         [pw_ %+ ADDAVG_ROUND]
97
     pxor        m0,         m0
98
     pmaxsw      m1,         m0
99
     pminsw      m1,         m6
100
@@ -214,21 +229,21 @@
101
     punpcklqdq     m0,          m1
102
     punpcklqdq     m2,          m3
103
     paddw          m0,          m2
104
-    pmulhrsw       m0,          [pw_1024]
105
-    paddw          m0,          [pw_512]
106
+    pmulhrsw       m0,          [pw_ %+ ADDAVG_FACTOR]
107
+    paddw          m0,          [pw_ %+ ADDAVG_ROUND]
108
 
109
     pxor           m6,          m6
110
     pmaxsw         m0,          m6
111
-    pminsw         m0,          [pw_1023]
112
+    pminsw         m0,          [pw_pixel_max]
113
     movh           [r2],        m0
114
     movhps         [r2 + r5],   m0
115
     RET
116
 ;-----------------------------------------------------------------------------
117
 INIT_XMM sse4
118
 cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
119
-    mova        m4,             [pw_512]
120
-    mova        m5,             [pw_1023]
121
-    mova        m7,             [pw_1024]
122
+    mova        m4,             [pw_ %+ ADDAVG_ROUND]
123
+    mova        m5,             [pw_pixel_max]
124
+    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
125
     pxor        m6,             m6
126
     add         r3,             r3
127
     add         r4,             r4
128
@@ -265,9 +280,9 @@
129
 ;-----------------------------------------------------------------------------
130
 INIT_XMM sse4
131
 cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
132
-    mova        m4,             [pw_512]
133
-    mova        m5,             [pw_1023]
134
-    mova        m7,             [pw_1024]
135
+    mova        m4,             [pw_ %+ ADDAVG_ROUND]
136
+    mova        m5,             [pw_pixel_max]
137
+    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
138
     pxor        m6,             m6
139
     mov         r6d,            16/2
140
     add         r3,             r3
141
@@ -301,9 +316,9 @@
142
 ;-----------------------------------------------------------------------------
143
 INIT_XMM sse4
144
 cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
145
-    mova        m4,          [pw_512]
146
-    mova        m5,          [pw_1023]
147
-    mova        m7,          [pw_1024]
148
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
149
+    mova        m5,          [pw_pixel_max]
150
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
151
     pxor        m6,          m6
152
     add         r3,          r3
153
     add         r4,          r4
154
@@ -332,9 +347,9 @@
155
 ;-----------------------------------------------------------------------------
156
 INIT_XMM sse4
157
 cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
158
-    mova        m4,          [pw_512]
159
-    mova        m5,          [pw_1023]
160
-    mova        m7,          [pw_1024]
161
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
162
+    mova        m5,          [pw_pixel_max]
163
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
164
     pxor        m6,          m6
165
     add         r3,          r3
166
     add         r4,          r4
167
@@ -371,9 +386,9 @@
168
 %macro ADDAVG_W4_H4 1
169
 INIT_XMM sse4
170
 cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
171
-    mova           m4,          [pw_512]
172
-    mova           m5,          [pw_1023]
173
-    mova           m7,          [pw_1024]
174
+    mova           m4,          [pw_ %+ ADDAVG_ROUND]
175
+    mova           m5,          [pw_pixel_max]
176
+    mova           m7,          [pw_ %+ ADDAVG_FACTOR]
177
     pxor           m6,          m6
178
     add            r3,          r3
179
     add            r4,          r4
180
@@ -421,9 +436,9 @@
181
 %macro ADDAVG_W8_H4 1
182
 INIT_XMM sse4
183
 cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
184
-    mova        m4,          [pw_512]
185
-    mova        m5,          [pw_1023]
186
-    mova        m7,          [pw_1024]
187
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
188
+    mova        m5,          [pw_pixel_max]
189
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
190
     pxor        m6,          m6
191
     add         r3,          r3
192
     add         r4,          r4
193
@@ -471,9 +486,9 @@
194
 %macro ADDAVG_W12_H4 1
195
 INIT_XMM sse4
196
 cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
197
-    mova           m4,             [pw_512]
198
-    mova           m5,             [pw_1023]
199
-    mova           m7,             [pw_1024]
200
+    mova           m4,             [pw_ %+ ADDAVG_ROUND]
201
x265_1.7.tar.gz/source/common/x86/mc-a2.asm -> x265_1.8.tar.gz/source/common/x86/mc-a2.asm Changed
82
 
1
@@ -692,7 +692,7 @@
2
 %endmacro
3
 
4
 %macro FILT32x4U 4
5
-    mova      m1, [r0+r5]
6
+    movu      m1, [r0+r5]
7
     pavgb     m0, m1, [r0]
8
     movu      m3, [r0+r5+1]
9
     pavgb     m2, m3, [r0+1]
10
@@ -701,7 +701,7 @@
11
     pavgb     m0, m2
12
     pavgb     m1, m3
13
 
14
-    mova      m3, [r0+r5+mmsize]
15
+    movu      m3, [r0+r5+mmsize]
16
     pavgb     m2, m3, [r0+mmsize]
17
     movu      m5, [r0+r5+1+mmsize]
18
     pavgb     m4, m5, [r0+1+mmsize]
19
@@ -722,10 +722,10 @@
20
     vpermq    m1, m4, q3120
21
     vpermq    m2, m2, q3120
22
     vpermq    m3, m5, q3120
23
-    mova    [%1], m0
24
-    mova    [%2], m1
25
-    mova    [%3], m2
26
-    mova    [%4], m3
27
+    movu    [%1], m0
28
+    movu    [%2], m1
29
+    movu    [%3], m2
30
+    movu    [%4], m3
31
 %endmacro
32
 
33
 %macro FILT16x2 4
34
@@ -796,8 +796,8 @@
35
 %endmacro
36
 
37
 %macro FILT8xA 4
38
-    mova      m3, [r0+%4+mmsize]
39
-    mova      m2, [r0+%4]
40
+    movu      m3, [r0+%4+mmsize]
41
+    movu      m2, [r0+%4]
42
     pavgw     m3, [r0+%4+r5+mmsize]
43
     pavgw     m2, [r0+%4+r5]
44
     PALIGNR   %1, m3, 2, m6
45
@@ -815,9 +815,13 @@
46
     packssdw  m3, %1
47
     packssdw  m5, m4
48
 %endif
49
-    mova    [%2], m3
50
-    mova    [%3], m5
51
-    mova      %1, m2
52
+%if cpuflag(avx2)
53
+    vpermq     m3, m3, q3120
54
+    vpermq     m5, m5, q3120
55
+%endif
56
+    movu    [%2], m3
57
+    movu    [%3], m5
58
+    movu      %1, m2
59
 %endmacro
60
 
61
 ;-----------------------------------------------------------------------------
62
@@ -871,8 +875,8 @@
63
 .vloop:
64
     mov      r6d, r7m
65
 %ifnidn cpuname, mmx2
66
-    mova      m0, [r0]
67
-    mova      m1, [r0+r5]
68
+    movu      m0, [r0]
69
+    movu      m1, [r0+r5]
70
     pavgw     m0, m1
71
     pavgw     m1, [r0+r5*2]
72
 %endif
73
@@ -977,7 +981,7 @@
74
 FRAME_INIT_LOWRES
75
 INIT_XMM xop
76
 FRAME_INIT_LOWRES
77
-%if HIGH_BIT_DEPTH==0
78
+%if ARCH_X86_64 == 1
79
 INIT_YMM avx2
80
 FRAME_INIT_LOWRES
81
 %endif
82
x265_1.7.tar.gz/source/common/x86/mc.h -> x265_1.8.tar.gz/source/common/x86/mc.h Changed
49
 
1
@@ -25,45 +25,15 @@
2
 #define X265_MC_H
3
 
4
 #define LOWRES(cpu) \
5
-    void x265_frame_init_lowres_core_ ## cpu(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
6
+    void PFX(frame_init_lowres_core_ ## cpu)(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
7
                                              intptr_t src_stride, intptr_t dst_stride, int width, int height);
8
 LOWRES(mmx2)
9
 LOWRES(sse2)
10
 LOWRES(ssse3)
11
 LOWRES(avx)
12
+LOWRES(avx2)
13
 LOWRES(xop)
14
 
15
-#define DECL_SUF(func, args) \
16
-    void func ## _mmx2 args; \
17
-    void func ## _sse2 args; \
18
-    void func ## _ssse3 args;
19
-DECL_SUF(x265_pixel_avg_64x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
20
-DECL_SUF(x265_pixel_avg_64x48, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
21
-DECL_SUF(x265_pixel_avg_64x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
22
-DECL_SUF(x265_pixel_avg_64x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
23
-DECL_SUF(x265_pixel_avg_48x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
24
-DECL_SUF(x265_pixel_avg_32x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
25
-DECL_SUF(x265_pixel_avg_32x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
26
-DECL_SUF(x265_pixel_avg_32x24, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
27
-DECL_SUF(x265_pixel_avg_32x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
28
-DECL_SUF(x265_pixel_avg_32x8,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
29
-DECL_SUF(x265_pixel_avg_24x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
30
-DECL_SUF(x265_pixel_avg_16x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
31
-DECL_SUF(x265_pixel_avg_16x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
32
-DECL_SUF(x265_pixel_avg_16x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
33
-DECL_SUF(x265_pixel_avg_16x12, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
34
-DECL_SUF(x265_pixel_avg_16x8,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
35
-DECL_SUF(x265_pixel_avg_16x4,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
36
-DECL_SUF(x265_pixel_avg_12x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
37
-DECL_SUF(x265_pixel_avg_8x32,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
38
-DECL_SUF(x265_pixel_avg_8x16,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
39
-DECL_SUF(x265_pixel_avg_8x8,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
40
-DECL_SUF(x265_pixel_avg_8x4,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
41
-DECL_SUF(x265_pixel_avg_4x16,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
42
-DECL_SUF(x265_pixel_avg_4x8,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
43
-DECL_SUF(x265_pixel_avg_4x4,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
44
-
45
 #undef LOWRES
46
-#undef DECL_SUF
47
 
48
 #endif // ifndef X265_MC_H
49
x265_1.7.tar.gz/source/common/x86/pixel-a.asm -> x265_1.8.tar.gz/source/common/x86/pixel-a.asm Changed
201
 
1
@@ -9,6 +9,7 @@
2
 ;*          Alex Izvorski <aizvorksi@gmail.com>
3
 ;*          Fiona Glaser <fiona@x264.com>
4
 ;*          Oskar Arvidsson <oskar@irock.se>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -32,8 +33,6 @@
10
 %include "x86util.asm"
11
 
12
 SECTION_RODATA 32
13
-hmul_16p:  times 16 db 1
14
-           times 8 db 1, -1
15
 hmul_8p:   times 8 db 1
16
            times 4 db 1, -1
17
            times 8 db 1
18
@@ -45,8 +44,7 @@
19
            times 2 dw 1, -1
20
            times 4 dw 1
21
            times 2 dw 1, -1
22
-ALIGN 32
23
-hmul_w:    times 2 dw 1, -1, 1, -1, 1, -1, 1, -1
24
+
25
 ALIGN 32
26
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
27
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
28
@@ -54,8 +52,6 @@
29
 sw_f0:     dq 0xfff0, 0
30
 pd_f0:     times 4 dd 0xffff0000
31
 
32
-pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
33
-
34
 SECTION .text
35
 
36
 cextern pb_0
37
@@ -72,6 +68,9 @@
38
 cextern pd_1
39
 cextern popcnt_table
40
 cextern pd_2
41
+cextern hmul_16p
42
+cextern pb_movemask
43
+cextern pw_pixel_max
44
 
45
 ;=============================================================================
46
 ; SATD
47
@@ -242,6 +241,12 @@
48
 %endif
49
     HADAMARD4_2D 4, 5, 6, 7, 3, %%n
50
     paddw m4, m6
51
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
52
+;    pxor m5, m5
53
+;    punpcklwd m6, m4, m5
54
+;    punpckhwd m4, m5
55
+;    paddd m4, m6
56
+;%endif
57
     SWAP %%n, 4
58
 %endmacro
59
 
60
@@ -257,15 +262,45 @@
61
     HADAMARD 1, max, %2, %4, %6, %7
62
 %endif
63
 %ifnidn %9, swap
64
+  %if (BIT_DEPTH == 12)
65
+    pxor m%6, m%6
66
+    punpcklwd m%7, m%2, m%6
67
+    punpckhwd m%2, m%6
68
+    paddd m%8, m%7
69
+    paddd m%8, m%2
70
+  %else
71
     paddw m%8, m%2
72
+  %endif
73
 %else
74
     SWAP %8, %2
75
+  %if (BIT_DEPTH == 12)
76
+    pxor m%6, m%6
77
+    punpcklwd m%7, m%8, m%6
78
+    punpckhwd m%8, m%6
79
+    paddd m%8, m%7
80
+  %endif
81
 %endif
82
 %if %1
83
+  %if (BIT_DEPTH == 12)
84
+    pxor m%6, m%6
85
+    punpcklwd m%7, m%4, m%6
86
+    punpckhwd m%4, m%6
87
+    paddd m%8, m%7
88
+    paddd m%8, m%4
89
+  %else
90
     paddw m%8, m%4
91
+  %endif
92
 %else
93
     HADAMARD 1, max, %3, %5, %6, %7
94
+  %if (BIT_DEPTH == 12)
95
+    pxor m%6, m%6
96
+    punpcklwd m%7, m%3, m%6
97
+    punpckhwd m%3, m%6
98
+    paddd m%8, m%7
99
+    paddd m%8, m%3
100
+  %else
101
     paddw m%8, m%3
102
+  %endif
103
 %endif
104
 %endmacro
105
 
106
@@ -281,29 +316,23 @@
107
 %endif
108
 
109
     pxor m%10, m%10
110
-    mova m%9, m%2
111
-    punpcklwd m%9, m%10
112
+    punpcklwd m%9, m%2, m%10
113
     paddd m%8, m%9
114
-    mova m%9, m%2
115
-    punpckhwd m%9, m%10
116
+    punpckhwd m%9, m%2, m%10
117
     paddd m%8, m%9
118
 
119
 %if %1
120
     pxor m%10, m%10
121
-    mova m%9, m%4
122
-    punpcklwd m%9, m%10
123
+    punpcklwd m%9, m%4, m%10
124
     paddd m%8, m%9
125
-    mova m%9, m%4
126
-    punpckhwd m%9, m%10
127
+    punpckhwd m%9, m%4, m%10
128
     paddd m%8, m%9
129
 %else
130
     HADAMARD 1, max, %3, %5, %6, %7
131
     pxor m%10, m%10
132
-    mova m%9, m%3
133
-    punpcklwd m%9, m%10
134
+    punpcklwd m%9, m%3, m%10
135
     paddd m%8, m%9
136
-    mova m%9, m%3
137
-    punpckhwd m%9, m%10
138
+    punpckhwd m%9, m%3, m%10
139
     paddd m%8, m%9
140
 %endif
141
 %endmacro
142
@@ -326,6 +355,7 @@
143
     movd       eax, m0
144
     and        eax, 0xffff
145
 %endif ; HIGH_BIT_DEPTH
146
+    EMMS
147
     RET
148
 %endmacro
149
 
150
@@ -336,136 +366,10 @@
151
 ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
152
 ;-----------------------------------------------------------------------------
153
 INIT_MMX mmx2
154
-cglobal pixel_satd_16x4_internal
155
-    SATD_4x4_MMX m2,  0, 0
156
-    SATD_4x4_MMX m1,  4, 0
157
-    paddw        m0, m2
158
-    SATD_4x4_MMX m2,  8, 0
159
-    paddw        m0, m1
160
-    SATD_4x4_MMX m1, 12, 0
161
-    paddw        m0, m2
162
-    paddw        m0, m1
163
-    ret
164
-
165
-cglobal pixel_satd_8x8_internal
166
-    SATD_4x4_MMX m2,  0, 0
167
-    SATD_4x4_MMX m1,  4, 1
168
-    paddw        m0, m2
169
-    paddw        m0, m1
170
-pixel_satd_8x4_internal_mmx2:
171
-    SATD_4x4_MMX m2,  0, 0
172
-    SATD_4x4_MMX m1,  4, 0
173
-    paddw        m0, m2
174
-    paddw        m0, m1
175
-    ret
176
-
177
-%if HIGH_BIT_DEPTH
178
-%macro SATD_MxN_MMX 3
179
-cglobal pixel_satd_%1x%2, 4,7
180
-    SATD_START_MMX
181
-    pxor   m0, m0
182
-    call pixel_satd_%1x%3_internal_mmx2
183
-    HADDUW m0, m1
184
-    movd  r6d, m0
185
-%rep %2/%3-1
186
-    pxor   m0, m0
187
-    lea    r0, [r0+4*r1]
188
-    lea    r2, [r2+4*r3]
189
-    call pixel_satd_%1x%3_internal_mmx2
190
-    movd   m2, r4
191
-    HADDUW m0, m1
192
-    movd   r4, m0
193
-    add    r6, r4
194
-    movd   r4, m2
195
-%endrep
196
-    movifnidn eax, r6d
197
-    RET
198
-%endmacro
199
-
200
-SATD_MxN_MMX 16, 16, 4
201
x265_1.7.tar.gz/source/common/x86/pixel-util.h -> x265_1.8.tar.gz/source/common/x86/pixel-util.h Changed
150
 
1
@@ -24,117 +24,36 @@
2
 #ifndef X265_PIXEL_UTIL_H
3
 #define X265_PIXEL_UTIL_H
4
 
5
-void x265_getResidual4_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
6
-void x265_getResidual8_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
7
-void x265_getResidual16_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
8
-void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
9
-void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
10
-void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
11
-void x265_getResidual16_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
12
-void x265_getResidual32_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
13
-
14
-void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
15
-void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
16
-void x265_transpose16_sse2(pixel* dest, const pixel* src, intptr_t stride);
17
-void x265_transpose32_sse2(pixel* dest, const pixel* src, intptr_t stride);
18
-void x265_transpose64_sse2(pixel* dest, const pixel* src, intptr_t stride);
19
-
20
-void x265_transpose8_avx2(pixel* dest, const pixel* src, intptr_t stride);
21
-void x265_transpose16_avx2(pixel* dest, const pixel* src, intptr_t stride);
22
-void x265_transpose32_avx2(pixel* dest, const pixel* src, intptr_t stride);
23
-void x265_transpose64_avx2(pixel* dest, const pixel* src, intptr_t stride);
24
-
25
-uint32_t x265_quant_sse4(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
26
-uint32_t x265_quant_avx2(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
27
-uint32_t x265_nquant_sse4(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
28
-uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
29
-void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
30
-void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
31
-
32
-int x265_count_nonzero_4x4_ssse3(const int16_t* quantCoeff);
33
-int x265_count_nonzero_8x8_ssse3(const int16_t* quantCoeff);
34
-int x265_count_nonzero_16x16_ssse3(const int16_t* quantCoeff);
35
-int x265_count_nonzero_32x32_ssse3(const int16_t* quantCoeff);
36
-int x265_count_nonzero_4x4_avx2(const int16_t* quantCoeff);
37
-int x265_count_nonzero_8x8_avx2(const int16_t* quantCoeff);
38
-int x265_count_nonzero_16x16_avx2(const int16_t* quantCoeff);
39
-int x265_count_nonzero_32x32_avx2(const int16_t* quantCoeff);
40
-
41
-void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
42
-void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
43
-void x265_weight_sp_sse4(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
44
-
45
-void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t* pix1, intptr_t stride1,
46
-                                     const uint8_t* pix2, intptr_t stride2, int sums[2][4]);
47
-void x265_pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1,
48
-                                     const pixel* pix2, intptr_t stride2, int sums[2][4]);
49
-void x265_pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1,
50
-                                    const pixel* pix2, intptr_t stride2, int sums[2][4]);
51
-float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width);
52
-float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width);
53
-
54
-void x265_scale1D_128to64_ssse3(pixel*, const pixel*);
55
-void x265_scale1D_128to64_avx2(pixel*, const pixel*);
56
-void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
57
-void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
58
-
59
-int x265_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
60
-int x265_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
61
-uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
62
-
63
-#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
64
-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
65
-    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  src1, intptr_t srcStride0, intptr_t srcStride1);
66
-
67
-#define CHROMA_420_PIXELSUB_DEF(cpu) \
68
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
69
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
70
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
71
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu);
72
-
73
-#define CHROMA_422_PIXELSUB_DEF(cpu) \
74
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
75
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
76
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
77
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
78
-
79
-#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
80
-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
81
-    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  src1, intptr_t srcStride0, intptr_t srcStride1);
82
-
83
-#define LUMA_PIXELSUB_DEF(cpu) \
84
-    SETUP_LUMA_PIXELSUB_PS_FUNC(8,   8, cpu); \
85
-    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
86
-    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
87
-    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu);
88
-
89
-LUMA_PIXELSUB_DEF(_sse2);
90
-CHROMA_420_PIXELSUB_DEF(_sse2);
91
-CHROMA_422_PIXELSUB_DEF(_sse2);
92
-
93
-LUMA_PIXELSUB_DEF(_sse4);
94
-CHROMA_420_PIXELSUB_DEF(_sse4);
95
-CHROMA_422_PIXELSUB_DEF(_sse4);
96
-
97
-#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
98
-    uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(const pixel* pix, intptr_t pixstride);
99
-
100
-#define LUMA_PIXELVAR_DEF(cpu) \
101
-    SETUP_LUMA_PIXELVAR_FUNC(8,   8, cpu); \
102
-    SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu); \
103
-    SETUP_LUMA_PIXELVAR_FUNC(32, 32, cpu); \
104
-    SETUP_LUMA_PIXELVAR_FUNC(64, 64, cpu);
105
-
106
-LUMA_PIXELVAR_DEF(_sse2);
107
-LUMA_PIXELVAR_DEF(_xop);
108
-LUMA_PIXELVAR_DEF(_avx);
109
-
110
-#undef CHROMA_420_PIXELSUB_DEF
111
-#undef CHROMA_422_PIXELSUB_DEF
112
-#undef LUMA_PIXELSUB_DEF
113
-#undef LUMA_PIXELVAR_DEF
114
-#undef SETUP_CHROMA_PIXELSUB_PS_FUNC
115
-#undef SETUP_LUMA_PIXELSUB_PS_FUNC
116
-#undef SETUP_LUMA_PIXELVAR_FUNC
117
+#define DEFINE_UTILS(cpu) \
118
+    FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
119
+    FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
120
+    FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
121
+    uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
122
+    uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
123
+    void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
124
+    void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
125
+    void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
126
+    void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
127
+    void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
128
+    void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
129
+    uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
130
+    uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
131
+
132
+DEFINE_UTILS(sse2);
133
+DEFINE_UTILS(ssse3);
134
+DEFINE_UTILS(sse4);
135
+DEFINE_UTILS(avx2);
136
+
137
+#undef DEFINE_UTILS
138
+
139
+void PFX(pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
140
+void PFX(pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
141
+float PFX(pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width));
142
+float PFX(pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width));
143
+
144
+int PFX(scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
145
+int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
146
+uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
147
+uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
148
 
149
 #endif // ifndef X265_PIXEL_UTIL_H
150
x265_1.7.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.8.tar.gz/source/common/x86/pixel-util8.asm Changed
201
 
1
@@ -28,7 +28,12 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-%if BIT_DEPTH == 10
6
+%if BIT_DEPTH == 12
7
+ssim_c1:   times 4 dd 107321.76    ; .01*.01*4095*4095*64
8
+ssim_c2:   times 4 dd 60851437.92  ; .03*.03*4095*4095*64*63
9
+pf_64:     times 4 dd 64.0
10
+pf_128:    times 4 dd 128.0
11
+%elif BIT_DEPTH == 10
12
 ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
13
 ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
14
 pf_64:     times 4 dd 64.0
15
@@ -45,18 +50,15 @@
16
                         times 16 db 0
17
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
18
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
19
-hmul_16p:               times 16 db 1
20
-                        times  8 db 1, -1
21
 hmulw_16p:              times  8 dw 1
22
                         times  4 dw 1, -1
23
 
24
-trans8_shuf:            dd 0, 4, 1, 5, 2, 6, 3, 7
25
-
26
 SECTION .text
27
 
28
 cextern pw_1
29
 cextern pw_0_15
30
 cextern pb_1
31
+cextern pb_128
32
 cextern pw_00ff
33
 cextern pw_1023
34
 cextern pw_3fff
35
@@ -72,6 +74,10 @@
36
 cextern pb_16
37
 cextern pb_32
38
 cextern pb_64
39
+cextern hmul_16p
40
+cextern trans8_shuf
41
+cextern_naked private_prefix %+ _entropyStateBits
42
+cextern pb_movemask
43
 
44
 ;-----------------------------------------------------------------------------
45
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
46
@@ -627,7 +633,12 @@
47
     movd            xm6, r4d            ; m6 = qbits8
48
 
49
     ; fill offset
50
+%if UNIX64 == 0
51
     vpbroadcastd    m5, r5m             ; m5 = add
52
+%else ; Mac
53
+    movd           xm5, r5m
54
+    vpbroadcastd    m5, xm5             ; m5 = add
55
+%endif
56
 
57
     lea             r5, [pw_1]
58
 
59
@@ -699,7 +710,12 @@
60
     movd            xm6, r4d        ; m6 = qbits8
61
 
62
     ; fill offset
63
-    vpbroadcastd    m5, r5m         ; m5 = ad
64
+%if UNIX64 == 0
65
+    vpbroadcastd    m5, r5m         ; m5 = add
66
+%else ; Mac
67
+    movd           xm5, r5m
68
+    vpbroadcastd    m5, xm5         ; m5 = add
69
+%endif
70
 
71
     lea             r5, [pd_1]
72
 
73
@@ -817,7 +833,12 @@
74
 
75
 INIT_YMM avx2
76
 cglobal nquant, 3,5,7
77
+%if UNIX64 == 0
78
     vpbroadcastd m4, r4m
79
+%else ; Mac
80
+    movd         xm4, r4m
81
+    vpbroadcastd m4, xm4
82
+%endif
83
     vpbroadcastd m6, [pw_1]
84
     mov         r4d, r5m
85
     pxor        m5, m5              ; m7 = numZero
86
@@ -873,8 +894,8 @@
87
 %if HIGH_BIT_DEPTH
88
     cmp         r3d, 32767
89
     jle         .skip
90
-    shr         r3d, 2
91
-    sub         r4d, 2
92
+    shr         r3d, (BIT_DEPTH - 8)
93
+    sub         r4d, (BIT_DEPTH - 8)
94
 .skip:
95
 %endif
96
     movd        m0, r4d             ; m0 = shift
97
@@ -903,6 +924,136 @@
98
     jnz        .loop
99
     RET
100
 
101
+;----------------------------------------------------------------------------------------------------------------------
102
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
103
+;----------------------------------------------------------------------------------------------------------------------
104
+INIT_XMM sse4
105
+cglobal dequant_scaling, 6,6,6
106
+    add         r5d, 4
107
+    shr         r3d, 3          ; num/8
108
+    cmp         r5d, r4d
109
+    jle         .skip
110
+    sub         r5d, r4d
111
+    mova        m0, [pd_1]
112
+    movd        m1, r5d         ; shift - per
113
+    dec         r5d
114
+    movd        m2, r5d         ; shift - per - 1
115
+    pslld       m0, m2          ; 1 << shift - per - 1
116
+
117
+.part0:
118
+    pmovsxwd    m2, [r0]
119
+    pmovsxwd    m4, [r0 + 8]
120
+    movu        m3, [r1]
121
+    movu        m5, [r1 + 16]
122
+    pmulld      m2, m3
123
+    pmulld      m4, m5
124
+    paddd       m2, m0
125
+    paddd       m4, m0
126
+    psrad       m2, m1
127
+    psrad       m4, m1
128
+    packssdw    m2, m4
129
+    movu        [r2], m2
130
+
131
+    add         r0, 16
132
+    add         r1, 32
133
+    add         r2, 16
134
+    dec         r3d
135
+    jnz         .part0
136
+    jmp         .end
137
+
138
+.skip:
139
+    sub         r4d, r5d        ; per - shift
140
+    movd        m0, r4d
141
+
142
+.part1:
143
+    pmovsxwd    m2, [r0]
144
+    pmovsxwd    m4, [r0 + 8]
145
+    movu        m3, [r1]
146
+    movu        m5, [r1 + 16]
147
+    pmulld      m2, m3
148
+    pmulld      m4, m5
149
+    packssdw    m2, m4
150
+    pmovsxwd    m1, m2
151
+    psrldq      m2, 8
152
+    pmovsxwd    m2, m2
153
+    pslld       m1, m0
154
+    pslld       m2, m0
155
+    packssdw    m1, m2
156
+    movu        [r2], m1
157
+
158
+    add         r0, 16
159
+    add         r1, 32
160
+    add         r2, 16
161
+    dec         r3d
162
+    jnz         .part1
163
+.end:
164
+    RET
165
+
166
+;----------------------------------------------------------------------------------------------------------------------
167
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
168
+;----------------------------------------------------------------------------------------------------------------------
169
+INIT_YMM avx2
170
+cglobal dequant_scaling, 6,6,6
171
+    add         r5d, 4
172
+    shr         r3d, 4          ; num/16
173
+    cmp         r5d, r4d
174
+    jle         .skip
175
+    sub         r5d, r4d
176
+    mova        m0, [pd_1]
177
+    movd        xm1, r5d         ; shift - per
178
+    dec         r5d
179
+    movd        xm2, r5d         ; shift - per - 1
180
+    pslld       m0, xm2          ; 1 << shift - per - 1
181
+
182
+.part0:
183
+    pmovsxwd    m2, [r0]
184
+    pmovsxwd    m4, [r0 + 16]
185
+    movu        m3, [r1]
186
+    movu        m5, [r1 + 32]
187
+    pmulld      m2, m3
188
+    pmulld      m4, m5
189
+    paddd       m2, m0
190
+    paddd       m4, m0
191
+    psrad       m2, xm1
192
+    psrad       m4, xm1
193
+    packssdw    m2, m4
194
+    vpermq      m2, m2, 11011000b
195
+    movu        [r2], m2
196
+
197
+    add         r0, 32
198
+    add         r1, 64
199
+    add         r2, 32
200
+    dec         r3d
201
x265_1.7.tar.gz/source/common/x86/pixel.h -> x265_1.8.tar.gz/source/common/x86/pixel.h Changed
201
 
1
@@ -28,260 +28,41 @@
2
 #ifndef X265_I386_PIXEL_H
3
 #define X265_I386_PIXEL_H
4
 
5
-#define DECL_PIXELS(ret, name, suffix, args) \
6
-    ret x265_pixel_ ## name ## _16x64_ ## suffix args; \
7
-    ret x265_pixel_ ## name ## _16x32_ ## suffix args; \
8
-    ret x265_pixel_ ## name ## _16x16_ ## suffix args; \
9
-    ret x265_pixel_ ## name ## _16x12_ ## suffix args; \
10
-    ret x265_pixel_ ## name ## _16x8_ ## suffix args; \
11
-    ret x265_pixel_ ## name ## _16x4_ ## suffix args; \
12
-    ret x265_pixel_ ## name ## _8x32_ ## suffix args; \
13
-    ret x265_pixel_ ## name ## _8x16_ ## suffix args; \
14
-    ret x265_pixel_ ## name ## _8x8_ ## suffix args; \
15
-    ret x265_pixel_ ## name ## _8x4_ ## suffix args; \
16
-    ret x265_pixel_ ## name ## _4x16_ ## suffix args; \
17
-    ret x265_pixel_ ## name ## _4x8_ ## suffix args; \
18
-    ret x265_pixel_ ## name ## _4x4_ ## suffix args; \
19
-    ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
20
-    ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
21
-    ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
22
-    ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
23
-    ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
24
-    ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
25
-    ret x265_pixel_ ## name ## _64x16_ ## suffix args; \
26
-    ret x265_pixel_ ## name ## _64x32_ ## suffix args; \
27
-    ret x265_pixel_ ## name ## _64x48_ ## suffix args; \
28
-    ret x265_pixel_ ## name ## _64x64_ ## suffix args; \
29
-    ret x265_pixel_ ## name ## _48x64_ ## suffix args; \
30
-    ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
31
-    ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
32
-
33
-#define DECL_X1(name, suffix) \
34
-    DECL_PIXELS(int, name, suffix, (const pixel*, intptr_t, const pixel*, intptr_t))
35
-
36
-#define DECL_X1_SS(name, suffix) \
37
-    DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const int16_t*, intptr_t))
38
-
39
-#define DECL_X1_SP(name, suffix) \
40
-    DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const pixel*, intptr_t))
41
-
42
-#define DECL_X4(name, suffix) \
43
-    DECL_PIXELS(void, name ## _x3, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) \
44
-    DECL_PIXELS(void, name ## _x4, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*))
45
-
46
-/* sad-a.asm */
47
-DECL_X1(sad, mmx2)
48
-DECL_X1(sad, sse2)
49
-DECL_X4(sad, sse2_misalign)
50
-DECL_X1(sad, sse3)
51
-DECL_X1(sad, sse2_aligned)
52
-DECL_X1(sad, ssse3)
53
-DECL_X1(sad, ssse3_aligned)
54
-DECL_X1(sad, avx2)
55
-DECL_X1(sad, avx2_aligned)
56
-DECL_X4(sad, mmx2)
57
-DECL_X4(sad, sse2)
58
-DECL_X4(sad, sse3)
59
-DECL_X4(sad, ssse3)
60
-DECL_X4(sad, avx)
61
-DECL_X4(sad, avx2)
62
-DECL_X1(sad, cache32_mmx2);
63
-DECL_X1(sad, cache64_mmx2);
64
-DECL_X1(sad, cache64_sse2);
65
-DECL_X1(sad, cache64_ssse3);
66
-DECL_X4(sad, cache32_mmx2);
67
-DECL_X4(sad, cache64_mmx2);
68
-DECL_X4(sad, cache64_sse2);
69
-DECL_X4(sad, cache64_ssse3);
70
-
71
-/* pixel-a.asm */
72
-DECL_X1(satd, mmx2)
73
-DECL_X1(satd, sse2)
74
-DECL_X1(satd, ssse3)
75
-DECL_X1(satd, ssse3_atom)
76
-DECL_X1(satd, sse4)
77
-DECL_X1(satd, avx)
78
-DECL_X1(satd, xop)
79
-DECL_X1(satd, avx2)
80
-int x265_pixel_satd_16x24_avx(const pixel*, intptr_t, const pixel*, intptr_t);
81
-int x265_pixel_satd_32x48_avx(const pixel*, intptr_t, const pixel*, intptr_t);
82
-int x265_pixel_satd_24x64_avx(const pixel*, intptr_t, const pixel*, intptr_t);
83
-int x265_pixel_satd_8x64_avx(const pixel*, intptr_t, const pixel*, intptr_t);
84
-int x265_pixel_satd_8x12_avx(const pixel*, intptr_t, const pixel*, intptr_t);
85
-int x265_pixel_satd_12x32_avx(const pixel*, intptr_t, const pixel*, intptr_t);
86
-int x265_pixel_satd_4x32_avx(const pixel*, intptr_t, const pixel*, intptr_t);
87
-int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
88
-int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
89
-int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
90
-int x265_pixel_satd_16x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
91
-int x265_pixel_satd_16x64_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
92
-
93
-DECL_X1(sa8d, mmx2)
94
-DECL_X1(sa8d, sse2)
95
-DECL_X1(sa8d, ssse3)
96
-DECL_X1(sa8d, ssse3_atom)
97
-DECL_X1(sa8d, sse4)
98
-DECL_X1(sa8d, avx)
99
-DECL_X1(sa8d, xop)
100
-DECL_X1(sa8d, avx2)
101
-
102
-/* ssd-a.asm */
103
-DECL_X1(ssd, mmx)
104
-DECL_X1(ssd, mmx2)
105
-DECL_X1(ssd, sse2slow)
106
-DECL_X1(ssd, sse2)
107
-DECL_X1(ssd, ssse3)
108
-DECL_X1(ssd, avx)
109
-DECL_X1(ssd, xop)
110
-DECL_X1(ssd, avx2)
111
-DECL_X1_SS(ssd_ss, mmx)
112
-DECL_X1_SS(ssd_ss, mmx2)
113
-DECL_X1_SS(ssd_ss, sse2slow)
114
-DECL_X1_SS(ssd_ss, sse2)
115
-DECL_X1_SS(ssd_ss, ssse3)
116
-DECL_X1_SS(ssd_ss, sse4)
117
-DECL_X1_SS(ssd_ss, avx)
118
-DECL_X1_SS(ssd_ss, xop)
119
-DECL_X1_SS(ssd_ss, avx2)
120
-DECL_X1_SP(ssd_sp, sse4)
121
-#define DECL_HEVC_SSD(suffix) \
122
-    int x265_pixel_ssd_32x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
123
-    int x265_pixel_ssd_16x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
124
-    int x265_pixel_ssd_32x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
125
-    int x265_pixel_ssd_32x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
126
-    int x265_pixel_ssd_16x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
127
-    int x265_pixel_ssd_32x24_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
128
-    int x265_pixel_ssd_24x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
129
-    int x265_pixel_ssd_32x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
130
-    int x265_pixel_ssd_8x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
131
-    int x265_pixel_ssd_16x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
132
-    int x265_pixel_ssd_16x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
133
-    int x265_pixel_ssd_8x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
134
-    int x265_pixel_ssd_16x12_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
135
-    int x265_pixel_ssd_16x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
136
-    int x265_pixel_ssd_8x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
137
-    int x265_pixel_ssd_8x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t);
138
-DECL_HEVC_SSD(sse2)
139
-DECL_HEVC_SSD(ssse3)
140
-DECL_HEVC_SSD(avx)
141
-
142
-int x265_pixel_ssd_12x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
143
-int x265_pixel_ssd_24x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
144
-int x265_pixel_ssd_48x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
145
-int x265_pixel_ssd_64x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
146
-int x265_pixel_ssd_64x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
147
-int x265_pixel_ssd_64x48_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
148
-int x265_pixel_ssd_64x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
149
-
150
-int x265_pixel_ssd_s_4_sse2(const int16_t*, intptr_t);
151
-int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t);
152
-int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t);
153
-int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t);
154
-int x265_pixel_ssd_s_16_avx2(const int16_t*, intptr_t);
155
-int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
156
-
157
-#define ADDAVG(func)  \
158
-    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
159
-    void x265_ ## func ## _avx2(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
160
-ADDAVG(addAvg_2x4)
161
-ADDAVG(addAvg_2x8)
162
-ADDAVG(addAvg_4x2);
163
-ADDAVG(addAvg_4x4)
164
-ADDAVG(addAvg_4x8)
165
-ADDAVG(addAvg_4x16)
166
-ADDAVG(addAvg_6x8)
167
-ADDAVG(addAvg_8x2)
168
-ADDAVG(addAvg_8x4)
169
-ADDAVG(addAvg_8x6)
170
-ADDAVG(addAvg_8x8)
171
-ADDAVG(addAvg_8x16)
172
-ADDAVG(addAvg_8x32)
173
-ADDAVG(addAvg_12x16)
174
-ADDAVG(addAvg_16x4)
175
-ADDAVG(addAvg_16x8)
176
-ADDAVG(addAvg_16x12)
177
-ADDAVG(addAvg_16x16)
178
-ADDAVG(addAvg_16x32)
179
-ADDAVG(addAvg_16x64)
180
-ADDAVG(addAvg_24x32)
181
-ADDAVG(addAvg_32x8)
182
-ADDAVG(addAvg_32x16)
183
-ADDAVG(addAvg_32x24)
184
-ADDAVG(addAvg_32x32)
185
-ADDAVG(addAvg_32x64)
186
-ADDAVG(addAvg_48x64)
187
-ADDAVG(addAvg_64x16)
188
-ADDAVG(addAvg_64x32)
189
-ADDAVG(addAvg_64x48)
190
-ADDAVG(addAvg_64x64)
191
-
192
-ADDAVG(addAvg_2x16)
193
-ADDAVG(addAvg_4x32)
194
-ADDAVG(addAvg_6x16)
195
-ADDAVG(addAvg_8x12)
196
-ADDAVG(addAvg_8x64)
197
-ADDAVG(addAvg_12x32)
198
-ADDAVG(addAvg_16x24)
199
-ADDAVG(addAvg_24x64)
200
-ADDAVG(addAvg_32x48)
201
x265_1.7.tar.gz/source/common/x86/sad-a.asm -> x265_1.8.tar.gz/source/common/x86/sad-a.asm Changed
163
 
1
@@ -7,6 +7,7 @@
2
 ;*          Fiona Glaser <fiona@x264.com>
3
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
4
 ;*          Alex Izvorski <aizvorksi@gmail.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -32,15 +33,13 @@
10
 SECTION_RODATA 32
11
 
12
 MSK:                  db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
13
-pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
14
-hpred_shuf:           db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
15
 
16
 SECTION .text
17
 
18
 cextern pb_3
19
 cextern pb_shuf8x8c
20
 cextern pw_8
21
-cextern sw_64
22
+cextern pd_64
23
 
24
 ;=============================================================================
25
 ; SAD MMX
26
@@ -2784,6 +2783,83 @@
27
 %endif
28
 %endmacro
29
 
30
+%macro SAD_X4_START_2x32P_AVX2 0
31
+    mova        m4, [r0]
32
+    movu        m0, [r1]
33
+    movu        m2, [r2]
34
+    movu        m1, [r3]
35
+    movu        m3, [r4]
36
+    psadbw      m0, m4
37
+    psadbw      m2, m4
38
+    psadbw      m1, m4
39
+    psadbw      m3, m4
40
+    packusdw    m0, m2
41
+    packusdw    m1, m3
42
+
43
+    mova        m6, [r0+FENC_STRIDE]
44
+    movu        m2, [r1+r5]
45
+    movu        m4, [r2+r5]
46
+    movu        m3, [r3+r5]
47
+    movu        m5, [r4+r5]
48
+    psadbw      m2, m6
49
+    psadbw      m4, m6
50
+    psadbw      m3, m6
51
+    psadbw      m5, m6
52
+    packusdw    m2, m4
53
+    packusdw    m3, m5
54
+    paddd       m0, m2
55
+    paddd       m1, m3
56
+%endmacro
57
+
58
+%macro SAD_X4_2x32P_AVX2 4
59
+    mova        m6, [r0+%1]
60
+    movu        m2, [r1+%2]
61
+    movu        m4, [r2+%2]
62
+    movu        m3, [r3+%2]
63
+    movu        m5, [r4+%2]
64
+    psadbw      m2, m6
65
+    psadbw      m4, m6
66
+    psadbw      m3, m6
67
+    psadbw      m5, m6
68
+    packusdw    m2, m4
69
+    packusdw    m3, m5
70
+    paddd       m0, m2
71
+    paddd       m1, m3
72
+
73
+    mova        m6, [r0+%3]
74
+    movu        m2, [r1+%4]
75
+    movu        m4, [r2+%4]
76
+    movu        m3, [r3+%4]
77
+    movu        m5, [r4+%4]
78
+    psadbw      m2, m6
79
+    psadbw      m4, m6
80
+    psadbw      m3, m6
81
+    psadbw      m5, m6
82
+    packusdw    m2, m4
83
+    packusdw    m3, m5
84
+    paddd       m0, m2
85
+    paddd       m1, m3
86
+%endmacro
87
+
88
+%macro SAD_X4_4x32P_AVX2 2
89
+%if %1==0
90
+    lea  r6, [r5*3]
91
+    SAD_X4_START_2x32P_AVX2
92
+%else
93
+    SAD_X4_2x32P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
94
+%endif
95
+    SAD_X4_2x32P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
96
+%if %1 != %2-1
97
+%if (%1&1) != 0
98
+    add  r0, 8*FENC_STRIDE
99
+%endif
100
+    lea  r1, [r1+4*r5]
101
+    lea  r2, [r2+4*r5]
102
+    lea  r3, [r3+4*r5]
103
+    lea  r4, [r4+4*r5]
104
+%endif
105
+%endmacro
106
+
107
 %macro SAD_X3_END_AVX2 0
108
     movifnidn r5, r5mp
109
     packssdw  m0, m1        ; 0 0 1 1 0 0 1 1
110
@@ -2808,6 +2884,17 @@
111
     RET
112
 %endmacro
113
 
114
+%macro SAD_X4_32P_END_AVX2 0
115
+    mov          r0, r6mp
116
+    vextracti128 xm2, m0, 1
117
+    vextracti128 xm3, m1, 1
118
+    paddd        xm0, xm2
119
+    paddd        xm1, xm3
120
+    phaddd       xm0, xm1
121
+    mova         [r0], xm0
122
+    RET
123
+%endmacro
124
+
125
 ;-----------------------------------------------------------------------------
126
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
127
 ;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
128
@@ -3320,7 +3407,12 @@
129
     SAD_X%1_4x%2P_AVX2 x, %3/4
130
 %assign x x+1
131
 %endrep
132
+
133
+  %if (%1==4) && (%2==32)
134
+    SAD_X%1_32P_END_AVX2
135
+  %else
136
     SAD_X%1_END_AVX2
137
+  %endif
138
 %endmacro
139
 
140
 INIT_YMM avx2
141
@@ -3333,6 +3425,12 @@
142
 SAD_X_AVX2 4, 16, 12, 8
143
 SAD_X_AVX2 4, 16,  8, 8
144
 
145
+SAD_X_AVX2 4, 32,  8, 8
146
+SAD_X_AVX2 4, 32, 16, 8
147
+SAD_X_AVX2 4, 32, 24, 8
148
+SAD_X_AVX2 4, 32, 32, 8
149
+SAD_X_AVX2 4, 32, 64, 8
150
+
151
 ;=============================================================================
152
 ; SAD cacheline split
153
 ;=============================================================================
154
@@ -3440,7 +3538,7 @@
155
     jle pixel_sad_%1x%2_mmx2
156
     and    eax, 7
157
     shl    eax, 3
158
-    movd   mm6, [sw_64]
159
+    movd   mm6, [pd_64]
160
     movd   mm7, eax
161
     psubw  mm6, mm7
162
     PROLOGUE 4,5
163
x265_1.7.tar.gz/source/common/x86/sad16-a.asm -> x265_1.8.tar.gz/source/common/x86/sad16-a.asm Changed
201
 
1
@@ -6,6 +6,7 @@
2
 ;* Authors: Oskar Arvidsson <oskar@irock.se>
3
 ;*          Henrik Gramner <henrik@gramner.com>
4
 ;*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -51,8 +52,14 @@
10
     lea     r2, [r2+2*r3]
11
     paddw   m1, m2
12
     paddw   m3, m4
13
+  %if BIT_DEPTH <= 10
14
     paddw   m0, m1
15
     paddw   m0, m3
16
+  %else
17
+    paddw   m1, m3
18
+    pmaddwd m1, [pw_1]
19
+    paddd   m0, m1
20
+  %endif
21
 %endmacro
22
 
23
 %macro SAD_INC_2x8P_MMX 0
24
@@ -70,8 +77,14 @@
25
     lea     r2, [r2+4*r3]
26
     paddw   m1, m2
27
     paddw   m3, m4
28
+  %if BIT_DEPTH <= 10
29
     paddw   m0, m1
30
     paddw   m0, m3
31
+  %else
32
+    paddw   m1, m3
33
+    pmaddwd m1, [pw_1]
34
+    paddd   m0, m1
35
+  %endif
36
 %endmacro
37
 
38
 %macro SAD_INC_2x4P_MMX 0
39
@@ -82,8 +95,14 @@
40
     ABSW2   m1, m2, m1, m2, m3, m4
41
     lea     r0, [r0+4*r1]
42
     lea     r2, [r2+4*r3]
43
+  %if BIT_DEPTH <= 10
44
     paddw   m0, m1
45
     paddw   m0, m2
46
+  %else
47
+    paddw   m1, m2
48
+    pmaddwd m1, [pw_1]
49
+    paddd   m0, m1
50
+  %endif
51
 %endmacro
52
 
53
 ;-----------------------------------------------------------------------------
54
@@ -103,9 +122,17 @@
55
     jg .loop
56
 %endif
57
 %if %1*%2 == 256
58
+  %if BIT_DEPTH <= 10
59
     HADDUW  m0, m1
60
+  %else
61
+    HADDD  m0, m1
62
+  %endif
63
 %else
64
+  %if BIT_DEPTH <= 10
65
     HADDW   m0, m1
66
+  %else
67
+    HADDD  m0, m1
68
+  %endif
69
 %endif
70
     movd   eax, m0
71
     RET
72
@@ -276,8 +303,9 @@
73
     ABSW2   m3, m4, m3, m4, m7, m5
74
     paddw   m1, m2
75
     paddw   m3, m4
76
-    paddw   m0, m1
77
-    paddw   m0, m3
78
+    paddw   m1, m3
79
+    pmaddwd m1, [pw_1]
80
+    paddd   m0, m1
81
 %else
82
     movu    m1, [r2]
83
     movu    m2, [r2+2*r3]
84
@@ -286,8 +314,9 @@
85
     ABSW2   m1, m2, m1, m2, m3, m4
86
     lea     r0, [r0+4*r1]
87
     lea     r2, [r2+4*r3]
88
-    paddw   m0, m1
89
-    paddw   m0, m2
90
+    paddw   m1, m2
91
+    pmaddwd m1, [pw_1]
92
+    paddd   m0, m1
93
 %endif
94
 %endmacro
95
 
96
@@ -307,8 +336,9 @@
97
     ABSW2   m3, m4, m3, m4, m7, m5
98
     paddw   m1, m2
99
     paddw   m3, m4
100
-    paddw   m0, m1
101
-    paddw   m8, m3
102
+    paddw   m1, m3
103
+    pmaddwd m1, [pw_1]
104
+    paddd   m0, m1
105
 %else
106
     movu    m1, [r2]
107
     movu    m2, [r2 + 2 * r3]
108
@@ -317,8 +347,9 @@
109
     ABSW2   m1, m2, m1, m2, m3, m4
110
     lea     r0, [r0 + 4 * r1]
111
     lea     r2, [r2 + 4 * r3]
112
-    paddw   m0, m1
113
-    paddw   m8, m2
114
+    paddw   m1, m2
115
+    pmaddwd m1, [pw_1]
116
+    paddd   m0, m1
117
 %endif
118
 %endmacro
119
 
120
@@ -326,7 +357,7 @@
121
 ; int pixel_sad_NxM(uint16_t *, intptr_t, uint16_t *, intptr_t)
122
 ; ---------------------------------------------------------------------------- -
123
 %macro SAD 2
124
-cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
125
+cglobal pixel_sad_%1x%2, 4,5,8
126
     pxor    m0, m0
127
 %if %2 == 4
128
     SAD_INC_2ROW %1
129
@@ -338,12 +369,7 @@
130
     dec    r4d
131
     jg .loop
132
 %endif
133
-%if %2 == 32
134
-    HADDUWD m0, m1
135
     HADDD   m0, m1
136
-%else
137
-    HADDW   m0, m1
138
-%endif
139
     movd    eax, xm0
140
     RET
141
 %endmacro
142
@@ -352,21 +378,15 @@
143
 ; int pixel_sad_Nx64(uint16_t *, intptr_t, uint16_t *, intptr_t)
144
 ; ---------------------------------------------------------------------------- -
145
 %macro SAD_Nx64 1
146
-cglobal pixel_sad_%1x64, 4,5-(64&4/4), 9
147
+cglobal pixel_sad_%1x64, 4,5, 8
148
     pxor    m0, m0
149
-    pxor    m8, m8
150
     mov     r4d, 64 / 2
151
 .loop:
152
     SAD_INC_2ROW_Nx64 %1
153
     dec    r4d
154
     jg .loop
155
 
156
-    HADDUWD m0, m1
157
-    HADDUWD m8, m1
158
     HADDD   m0, m1
159
-    HADDD   m8, m1
160
-    paddd   m0, m8
161
-
162
     movd    eax, xm0
163
     RET
164
 %endmacro
165
@@ -392,6 +412,654 @@
166
 SAD  16, 16
167
 SAD  16, 32
168
 
169
+INIT_YMM avx2
170
+cglobal pixel_sad_16x64, 4,7,4
171
+    pxor    m0, m0
172
+    pxor    m3, m3
173
+    mov     r4d, 64 / 8
174
+    add     r3d, r3d
175
+    add     r1d, r1d
176
+    lea     r5,     [r1 * 3]
177
+    lea     r6,     [r3 * 3]
178
+.loop:
179
+    movu    m1, [r2]
180
+    movu    m2, [r2 + r3]
181
+    psubw   m1, [r0]
182
+    psubw   m2, [r0 + r1]
183
+    pabsw   m1, m1
184
+    pabsw   m2, m2
185
+    paddw   m0, m1
186
+    paddw   m3, m2
187
+
188
+    movu    m1, [r2 + 2 * r3]
189
+    movu    m2, [r2 + r6]
190
+    psubw   m1, [r0 + 2 * r1]
191
+    psubw   m2, [r0 + r5]
192
+    pabsw   m1, m1
193
+    pabsw   m2, m2
194
+    paddw   m0, m1
195
+    paddw   m3, m2
196
+
197
+    lea     r0, [r0 + 4 * r1]
198
+    lea     r2, [r2 + 4 * r3]
199
+
200
+    movu    m1, [r2]
201
x265_1.7.tar.gz/source/common/x86/ssd-a.asm -> x265_1.8.tar.gz/source/common/x86/ssd-a.asm Changed
201
 
1
@@ -113,6 +113,62 @@
2
     RET
3
 %endmacro
4
 
5
+; Function to find ssd for 32x16 block, sse2, 12 bit depth
6
+; Defined separately to be called from SSD_ONE_32 macro
7
+INIT_XMM sse2
8
+cglobal ssd_ss_32x16
9
+    pxor        m8, m8
10
+    mov         r4d, 16
11
+.loop:
12
+    movu        m0, [r0]
13
+    movu        m1, [r0+mmsize]
14
+    movu        m2, [r0+2*mmsize]
15
+    movu        m3, [r0+3*mmsize]
16
+    movu        m4, [r2]
17
+    movu        m5, [r2+mmsize]
18
+    movu        m6, [r2+2*mmsize]
19
+    movu        m7, [r2+3*mmsize]
20
+    psubw       m0, m4
21
+    psubw       m1, m5
22
+    psubw       m2, m6
23
+    psubw       m3, m7
24
+    add         r0, r1
25
+    add         r2, r3
26
+    pmaddwd     m0, m0
27
+    pmaddwd     m1, m1
28
+    pmaddwd     m2, m2
29
+    pmaddwd     m3, m3
30
+    paddd       m2, m3
31
+    paddd       m0, m1
32
+    paddd       m0, m2
33
+    paddd       m8, m0
34
+    dec         r4d
35
+    jnz         .loop
36
+
37
+    mova        m4, m8
38
+    pxor        m5, m5
39
+    punpckldq   m8, m5
40
+    punpckhdq   m4, m5
41
+    paddq       m4, m8
42
+    movhlps     m5, m4
43
+    paddq       m4, m5
44
+    paddq       m9, m4
45
+    ret
46
+
47
+%macro SSD_ONE_32 0
48
+cglobal pixel_ssd_ss_32x64, 4,7,10
49
+    add         r1d, r1d
50
+    add         r3d, r3d
51
+    pxor        m9, m9
52
+    xor         r4, r4
53
+    call        ssd_ss_32x16
54
+    call        ssd_ss_32x16
55
+    call        ssd_ss_32x16
56
+    call        ssd_ss_32x16
57
+    movq        rax, m9
58
+    RET
59
+%endmacro
60
+
61
 %macro SSD_TWO 2
62
 cglobal pixel_ssd_ss_%1x%2, 4,7,8
63
     FIX_STRIDES r1, r3
64
@@ -312,6 +368,124 @@
65
     movd   eax, xm0
66
     RET
67
 %endmacro
68
+
69
+INIT_YMM avx2
70
+cglobal pixel_ssd_16x16, 4,7,8
71
+    FIX_STRIDES r1, r3
72
+    lea     r5, [3 * r1]
73
+    lea     r6, [3 * r3]
74
+    mov    r4d, 4
75
+    pxor    m0, m0
76
+.loop:
77
+    movu    m1, [r0]
78
+    movu    m2, [r0 + r1]
79
+    movu    m3, [r0 + r1 * 2]
80
+    movu    m4, [r0 + r5]
81
+    movu    m6, [r2]
82
+    movu    m7, [r2 + r3]
83
+    psubw   m1, m6
84
+    psubw   m2, m7
85
+    movu    m6, [r2 + r3 * 2]
86
+    movu    m7, [r2 + r6]
87
+    psubw   m3, m6
88
+    psubw   m4, m7
89
+
90
+    lea     r0, [r0 + r1 * 4]
91
+    lea     r2, [r2 + r3 * 4]
92
+
93
+    pmaddwd m1, m1
94
+    pmaddwd m2, m2
95
+    pmaddwd m3, m3
96
+    pmaddwd m4, m4
97
+    paddd   m1, m2
98
+    paddd   m3, m4
99
+    paddd   m0, m1
100
+    paddd   m0, m3
101
+
102
+    dec    r4d
103
+    jg .loop
104
+
105
+    HADDD   m0, m5
106
+    movd   eax, xm0
107
+    RET
108
+
109
+INIT_YMM avx2
110
+cglobal pixel_ssd_32x32, 4,7,8
111
+    add     r1, r1
112
+    add     r3, r3
113
+    mov     r4d, 16
114
+    pxor    m0, m0
115
+.loop:
116
+    movu    m1, [r0]
117
+    movu    m2, [r0 + 32]
118
+    movu    m3, [r0 + r1]
119
+    movu    m4, [r0 + r1 + 32]
120
+    movu    m6, [r2]
121
+    movu    m7, [r2 + 32]
122
+    psubw   m1, m6
123
+    psubw   m2, m7
124
+    movu    m6, [r2 + r3]
125
+    movu    m7, [r2 + r3 + 32]
126
+    psubw   m3, m6
127
+    psubw   m4, m7
128
+
129
+    lea     r0, [r0 + r1 * 2]
130
+    lea     r2, [r2 + r3 * 2]
131
+
132
+    pmaddwd m1, m1
133
+    pmaddwd m2, m2
134
+    pmaddwd m3, m3
135
+    pmaddwd m4, m4
136
+    paddd   m1, m2
137
+    paddd   m3, m4
138
+    paddd   m0, m1
139
+    paddd   m0, m3
140
+
141
+    dec    r4d
142
+    jg .loop
143
+
144
+    HADDD   m0, m5
145
+    movd   eax, xm0
146
+    RET
147
+
148
+INIT_YMM avx2
149
+cglobal pixel_ssd_64x64, 4,7,8
150
+    FIX_STRIDES r1, r3
151
+    mov    r4d, 64
152
+    pxor    m0, m0
153
+.loop:
154
+    movu    m1, [r0]
155
+    movu    m2, [r0+32]
156
+    movu    m3, [r0+32*2]
157
+    movu    m4, [r0+32*3]
158
+    movu    m6, [r2]
159
+    movu    m7, [r2+32]
160
+    psubw   m1, m6
161
+    psubw   m2, m7
162
+    movu    m6, [r2+32*2]
163
+    movu    m7, [r2+32*3]
164
+    psubw   m3, m6
165
+    psubw   m4, m7
166
+
167
+    lea     r0, [r0+r1]
168
+    lea     r2, [r2+r3]
169
+
170
+    pmaddwd m1, m1
171
+    pmaddwd m2, m2
172
+    pmaddwd m3, m3
173
+    pmaddwd m4, m4
174
+    paddd   m1, m2
175
+    paddd   m3, m4
176
+    paddd   m0, m1
177
+    paddd   m0, m3
178
+
179
+    dec    r4d
180
+    jg .loop
181
+
182
+    HADDD   m0, m5
183
+    movd   eax, xm0
184
+    RET
185
+
186
 INIT_MMX mmx2
187
 SSD_ONE     4,  4
188
 SSD_ONE     4,  8
189
@@ -338,7 +512,13 @@
190
 SSD_ONE    32, 16
191
 SSD_ONE    32, 24
192
 SSD_ONE    32, 32
193
-SSD_ONE    32, 64
194
+
195
+%if BIT_DEPTH <= 10
196
+    SSD_ONE    32, 64
197
+%else
198
+    SSD_ONE_32
199
+%endif
200
+
201
x265_1.7.tar.gz/source/common/x86/x86inc.asm -> x265_1.8.tar.gz/source/common/x86/x86inc.asm Changed
24
 
1
@@ -37,7 +37,7 @@
2
 ; to x264-devel@videolan.org .
3
 
4
 %ifndef private_prefix
5
-    %define private_prefix x265
6
+    %define private_prefix X265_NS
7
 %endif
8
 
9
 %ifndef public_prefix
10
@@ -1483,13 +1483,3 @@
11
 %endif
12
 %endmacro
13
 %endif
14
-
15
-; workaround: vpbroadcastd with register, the yasm will generate wrong code
16
-%macro vpbroadcastd 2
17
-  %ifid %2
18
-    movd         %1 %+ xmm, %2
19
-    vpbroadcastd %1, %1 %+ xmm
20
-  %else
21
-    vpbroadcastd %1, %2
22
-  %endif
23
-%endmacro
24
x265_1.7.tar.gz/source/common/x86/x86util.asm -> x265_1.8.tar.gz/source/common/x86/x86util.asm Changed
17
 
1
@@ -358,11 +358,11 @@
2
 %if sizeof%1==32
3
                                  ; %3 = abcdefgh ijklmnop (lower address)
4
                                  ; %2 = ABCDEFGH IJKLMNOP (higher address)
5
-;   vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
6
-%if %4 < 16
7
-    palignr    %1, %5, %3, %4    ; %1 = bcdefghi jklmnopA
8
+    vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH
9
+%if %3 < 16
10
+    palignr    %1, %4, %2, %3    ; %1 = bcdefghi jklmnopA
11
 %else
12
-    palignr    %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO
13
+    palignr    %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO
14
 %endif
15
 %elif cpuflag(ssse3)
16
     %if %0==5
17
x265_1.7.tar.gz/source/common/yuv.cpp -> x265_1.8.tar.gz/source/common/yuv.cpp Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "picyuv.h"
3
 #include "primitives.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 Yuv::Yuv()
9
 {
10
x265_1.7.tar.gz/source/common/yuv.h -> x265_1.8.tar.gz/source/common/yuv.h Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "primitives.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class ShortYuv;
10
x265_1.7.tar.gz/source/compat/getopt/getopt.h -> x265_1.8.tar.gz/source/compat/getopt/getopt.h Changed
35
 
1
@@ -144,23 +144,23 @@
2
 /* Many other libraries have conflicting prototypes for getopt, with
3
    differences in the consts, in stdlib.h.  To avoid compilation
4
    errors, only prototype getopt for the GNU C library.  */
5
-extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
6
+extern int getopt (int argc, char *const *argv, const char *shortopts);
7
 # else /* not __GNU_LIBRARY__ */
8
 extern int getopt ();
9
 # endif /* __GNU_LIBRARY__ */
10
 
11
 # ifndef __need_getopt
12
-extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
13
-               const struct option *__longopts, int32_t *__longind);
14
-extern int getopt_long_only (int __argc, char *const *__argv,
15
-                const char *__shortopts,
16
-                    const struct option *__longopts, int32_t *__longind);
17
+extern int getopt_long (int argc, char *const *argv, const char *shortopts,
18
+               const struct option *longopts, int32_t *longind);
19
+extern int getopt_long_only (int argc, char *const *argv,
20
+                const char *shortopts,
21
+                    const struct option *longopts, int32_t *longind);
22
 
23
 /* Internal only.  Users should not call this directly.  */
24
-extern int _getopt_internal (int __argc, char *const *__argv,
25
-                const char *__shortopts,
26
-                    const struct option *__longopts, int32_t *__longind,
27
-                int __long_only);
28
+extern int _getopt_internal (int argc, char *const *argv,
29
+                const char *shortopts,
30
+                    const struct option *longopts, int32_t *longind,
31
+                int longonly);
32
 # endif
33
 #else /* not __STDC__ */
34
 extern int getopt ();
35
x265_1.7.tar.gz/source/compat/msvc/stdint.h -> x265_1.8.tar.gz/source/compat/msvc/stdint.h Changed
9
 
1
@@ -8,6 +8,7 @@
2
 #if !defined(UINT64_MAX)
3
 #include <limits.h>
4
 #define UINT64_MAX _UI64_MAX
5
+#define INT16_MAX  _I16_MAX
6
 #endif
7
 
8
 /* a minimal set of C99 types for use with MSVC (VC9) */
9
x265_1.7.tar.gz/source/encoder/CMakeLists.txt -> x265_1.8.tar.gz/source/encoder/CMakeLists.txt Changed
22
 
1
@@ -11,6 +11,20 @@
2
    add_definitions(/wd4701) # potentially uninitialized local variable 'foo' used
3
 endif()
4
 
5
+if(EXTRA_LIB)
6
+    if(LINKED_8BIT)
7
+        list(APPEND APIFLAGS "-DLINKED_8BIT=1")
8
+    endif(LINKED_8BIT)
9
+    if(LINKED_10BIT)
10
+        list(APPEND APIFLAGS "-DLINKED_10BIT=1")
11
+    endif(LINKED_10BIT)
12
+    if(LINKED_12BIT)
13
+        list(APPEND APIFLAGS "-DLINKED_12BIT=1")
14
+    endif(LINKED_12BIT)
15
+    string(REPLACE ";" " " APIFLAGSTR "${APIFLAGS}")
16
+    set_source_files_properties(api.cpp PROPERTIES COMPILE_FLAGS ${APIFLAGSTR})
17
+endif(EXTRA_LIB)
18
+
19
 add_library(encoder OBJECT ../x265.h
20
     analysis.cpp analysis.h
21
     search.cpp search.h
22
x265_1.7.tar.gz/source/encoder/analysis.cpp -> x265_1.8.tar.gz/source/encoder/analysis.cpp Changed
201
 
1
@@ -33,7 +33,7 @@
2
 #include "rdcost.h"
3
 #include "encoder.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 /* An explanation of rate distortion levels (--rd-level)
9
  * 
10
@@ -209,24 +209,20 @@
11
         return;
12
     else if (md.bestMode->cu.isIntra(0))
13
     {
14
-        m_quant.m_tqBypass = true;
15
         md.pred[PRED_LOSSLESS].initCosts();
16
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
17
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
18
         uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
19
         checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
20
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
21
-        m_quant.m_tqBypass = false;
22
     }
23
     else
24
     {
25
-        m_quant.m_tqBypass = true;
26
         md.pred[PRED_LOSSLESS].initCosts();
27
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
28
         md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
29
         encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
30
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
31
-        m_quant.m_tqBypass = false;
32
     }
33
 }
34
 
35
@@ -385,6 +381,8 @@
36
     /* perform Mode task, repeat until no more work is available */
37
     do
38
     {
39
+        uint32_t refMasks[2] = { 0, 0 };
40
+
41
         if (m_param->rdLevel <= 4)
42
         {
43
             switch (pmode.modes[task])
44
@@ -396,33 +394,33 @@
45
                 break;
46
 
47
             case PRED_2Nx2N:
48
-                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N);
49
+                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
50
                 if (m_slice->m_sliceType == B_SLICE)
51
                     slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
52
                 break;
53
 
54
             case PRED_Nx2N:
55
-                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N);
56
+                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
57
                 break;
58
 
59
             case PRED_2NxN:
60
-                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN);
61
+                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
62
                 break;
63
 
64
             case PRED_2NxnU:
65
-                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU);
66
+                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
67
                 break;
68
 
69
             case PRED_2NxnD:
70
-                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD);
71
+                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
72
                 break;
73
 
74
             case PRED_nLx2N:
75
-                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N);
76
+                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
77
                 break;
78
 
79
             case PRED_nRx2N:
80
-                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N);
81
+                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
82
                 break;
83
 
84
             default:
85
@@ -441,7 +439,7 @@
86
                 break;
87
 
88
             case PRED_2Nx2N:
89
-                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N);
90
+                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
91
                 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
92
                 if (m_slice->m_sliceType == B_SLICE)
93
                 {
94
@@ -452,27 +450,27 @@
95
                 break;
96
 
97
             case PRED_Nx2N:
98
-                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N);
99
+                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
100
                 break;
101
 
102
             case PRED_2NxN:
103
-                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN);
104
+                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
105
                 break;
106
 
107
             case PRED_2NxnU:
108
-                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU);
109
+                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
110
                 break;
111
 
112
             case PRED_2NxnD:
113
-                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD);
114
+                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
115
                 break;
116
 
117
             case PRED_nLx2N:
118
-                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N);
119
+                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
120
                 break;
121
 
122
             case PRED_nRx2N:
123
-                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N);
124
+                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
125
                 break;
126
 
127
             default:
128
@@ -581,7 +579,8 @@
129
                 /* RD selection between merge, inter, bidir and intra */
130
                 if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
131
                 {
132
-                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
133
+                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
134
+                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
135
                     {
136
                         PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
137
                         motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
138
@@ -617,7 +616,8 @@
139
                 else if (!md.bestMode->cu.m_mergeFlag[0])
140
                 {
141
                     /* finally code the best mode selected from SA8D costs */
142
-                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
143
+                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
144
+                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
145
                     {
146
                         PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
147
                         motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
148
@@ -746,7 +746,7 @@
149
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
150
 }
151
 
152
-void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
153
+uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
154
 {
155
     uint32_t depth = cuGeom.depth;
156
     uint32_t cuAddr = parentCTU.m_cuAddr;
157
@@ -756,24 +756,104 @@
158
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
159
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
160
     uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
161
-
162
+    bool earlyskip = false;
163
+    bool splitIntra = true;
164
+    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
165
+    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
166
     if (mightNotSplit && depth >= minDepth)
167
     {
168
-        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
169
-
170
         /* Compute Merge Cost */
171
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
172
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
173
         checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
174
-
175
-        bool earlyskip = false;
176
         if (m_param->rdLevel)
177
             earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
178
+    }
179
+
180
+    bool bNoSplit = false;
181
+    if (md.bestMode)
182
+    {
183
+        bNoSplit = md.bestMode->cu.isSkipped(0);
184
+        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
185
+            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
186
+    }
187
+
188
+    /* Step 2. Evaluate each of the 4 split sub-blocks in series */
189
+    if (mightSplit && !bNoSplit)
190
+    {
191
+        Mode* splitPred = &md.pred[PRED_SPLIT];
192
+        splitPred->initCosts();
193
+        CUData* splitCU = &splitPred->cu;
194
+        splitCU->initSubCU(parentCTU, cuGeom, qp);
195
+
196
+        uint32_t nextDepth = depth + 1;
197
+        ModeDepth& nd = m_modeDepth[nextDepth];
198
+        invalidateContexts(nextDepth);
199
+        Entropy* nextContext = &m_rqt[depth].cur;
200
+        int nextQP = qp;
201
x265_1.7.tar.gz/source/encoder/analysis.h -> x265_1.8.tar.gz/source/encoder/analysis.h Changed
32
 
1
@@ -35,7 +35,7 @@
2
 #include "entropy.h"
3
 #include "search.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class Entropy;
10
@@ -113,16 +113,16 @@
11
 
12
     /* full analysis for a P or B slice CU */
13
     void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
14
-    void compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
15
-    void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
16
+    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
17
+    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
18
 
19
     /* measure merge and skip */
20
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
21
-    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isSkipMode);
22
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);
23
 
24
     /* measure inter options */
25
-    void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
26
-    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
27
+    void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
28
+    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
29
 
30
     void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
31
 
32
x265_1.7.tar.gz/source/encoder/api.cpp -> x265_1.8.tar.gz/source/encoder/api.cpp Changed
201
 
1
@@ -31,25 +31,69 @@
2
 #include "nal.h"
3
 #include "bitcost.h"
4
 
5
-using namespace x265;
6
+/* multilib namespace reflectors */
7
+#if LINKED_8BIT
8
+namespace x265_8bit {
9
+const x265_api* x265_api_get(int bitDepth);
10
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
11
+}
12
+#endif
13
+
14
+#if LINKED_10BIT
15
+namespace x265_10bit {
16
+const x265_api* x265_api_get(int bitDepth);
17
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
18
+}
19
+#endif
20
+
21
+#if LINKED_12BIT
22
+namespace x265_12bit {
23
+const x265_api* x265_api_get(int bitDepth);
24
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
25
+}
26
+#endif
27
+
28
+#if EXPORT_C_API
29
+/* these functions are exported as C functions (default) */
30
+using namespace X265_NS;
31
+extern "C" {
32
+#else
33
+/* these functions exist within private namespace (multilib) */
34
+namespace X265_NS {
35
+#endif
36
 
37
-extern "C"
38
 x265_encoder *x265_encoder_open(x265_param *p)
39
 {
40
     if (!p)
41
         return NULL;
42
 
43
+#if _MSC_VER
44
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
45
+#endif
46
+
47
+#if HIGH_BIT_DEPTH
48
+    if (X265_DEPTH == 12)
49
+        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
50
+    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
51
+#else
52
+    if (X265_DEPTH != 8)
53
+#endif
54
+    {
55
+        x265_log(p, X265_LOG_ERROR, "Build error, internal bit depth mismatch\n");
56
+        return NULL;
57
+    }
58
+
59
     Encoder* encoder = NULL;
60
-    x265_param* param = x265_param_alloc();
61
-    x265_param* latestParam = x265_param_alloc();
62
+    x265_param* param = PARAM_NS::x265_param_alloc();
63
+    x265_param* latestParam = PARAM_NS::x265_param_alloc();
64
     if (!param || !latestParam)
65
         goto fail;
66
 
67
     memcpy(param, p, sizeof(x265_param));
68
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str);
69
-    x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str);
70
+    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", PFX(version_str));
71
+    x265_log(param, X265_LOG_INFO, "build info %s\n", PFX(build_info_str));
72
 
73
-    x265_setup_primitives(param, param->cpuid);
74
+    x265_setup_primitives(param);
75
 
76
     if (x265_check_params(param))
77
         goto fail;
78
@@ -59,7 +103,7 @@
79
 
80
     encoder = new Encoder;
81
     if (!param->rc.bEnableSlowFirstPass)
82
-        x265_param_apply_fastfirstpass(param);
83
+        PARAM_NS::x265_param_apply_fastfirstpass(param);
84
 
85
     // may change params for auto-detect, etc
86
     encoder->configure(param);
87
@@ -87,12 +131,11 @@
88
 
89
 fail:
90
     delete encoder;
91
-    x265_param_free(param);
92
-    x265_param_free(latestParam);
93
+    PARAM_NS::x265_param_free(param);
94
+    PARAM_NS::x265_param_free(latestParam);
95
     return NULL;
96
 }
97
 
98
-extern "C"
99
 int x265_encoder_headers(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal)
100
 {
101
     if (pp_nal && enc)
102
@@ -109,7 +152,6 @@
103
     return -1;
104
 }
105
 
106
-extern "C"
107
 void x265_encoder_parameters(x265_encoder *enc, x265_param *out)
108
 {
109
     if (enc && out)
110
@@ -119,7 +161,6 @@
111
     }
112
 }
113
 
114
-extern "C"
115
 int x265_encoder_reconfig(x265_encoder* enc, x265_param* param_in)
116
 {
117
     if (!enc || !param_in)
118
@@ -140,7 +181,6 @@
119
     return ret;
120
 }
121
 
122
-extern "C"
123
 int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out)
124
 {
125
     if (!enc)
126
@@ -175,7 +215,6 @@
127
     return numEncoded;
128
 }
129
 
130
-extern "C"
131
 void x265_encoder_get_stats(x265_encoder *enc, x265_stats *outputStats, uint32_t statsSizeBytes)
132
 {
133
     if (enc && outputStats)
134
@@ -185,17 +224,15 @@
135
     }
136
 }
137
 
138
-extern "C"
139
-void x265_encoder_log(x265_encoder* enc, int argc, char **argv)
140
+void x265_encoder_log(x265_encoder* enc, int, char **)
141
 {
142
     if (enc)
143
     {
144
         Encoder *encoder = static_cast<Encoder*>(enc);
145
-        encoder->writeLog(argc, argv);
146
+        x265_log(encoder->m_param, X265_LOG_WARNING, "x265_encoder_log is now deprecated\n");
147
     }
148
 }
149
 
150
-extern "C"
151
 void x265_encoder_close(x265_encoder *enc)
152
 {
153
     if (enc)
154
@@ -210,7 +247,6 @@
155
     }
156
 }
157
 
158
-extern "C"
159
 void x265_cleanup(void)
160
 {
161
     if (!g_ctuSizeConfigured)
162
@@ -220,13 +256,11 @@
163
     }
164
 }
165
 
166
-extern "C"
167
 x265_picture *x265_picture_alloc()
168
 {
169
     return (x265_picture*)x265_malloc(sizeof(x265_picture));
170
 }
171
 
172
-extern "C"
173
 void x265_picture_init(x265_param *param, x265_picture *pic)
174
 {
175
     memset(pic, 0, sizeof(x265_picture));
176
@@ -245,7 +279,6 @@
177
     }
178
 }
179
 
180
-extern "C"
181
 void x265_picture_free(x265_picture *p)
182
 {
183
     return x265_free(p);
184
@@ -253,12 +286,24 @@
185
 
186
 static const x265_api libapi =
187
 {
188
-    &x265_param_alloc,
189
-    &x265_param_free,
190
-    &x265_param_default,
191
-    &x265_param_parse,
192
-    &x265_param_apply_profile,
193
-    &x265_param_default_preset,
194
+    X265_MAJOR_VERSION,
195
+    X265_BUILD,
196
+    sizeof(x265_param),
197
+    sizeof(x265_picture),
198
+    sizeof(x265_analysis_data),
199
+    sizeof(x265_zone),
200
+    sizeof(x265_stats),
201
x265_1.7.tar.gz/source/encoder/bitcost.cpp -> x265_1.8.tar.gz/source/encoder/bitcost.cpp Changed
19
 
1
@@ -25,7 +25,7 @@
2
 #include "primitives.h"
3
 #include "bitcost.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 void BitCost::setQP(unsigned int qp)
9
 {
10
@@ -45,7 +45,7 @@
11
 
12
             // estimate same cost for negative and positive MVD
13
             for (int i = 0; i <= 2 * BC_MAX_MV; i++)
14
-                s_costs[qp][i] = s_costs[qp][-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 16) - 1);
15
+                s_costs[qp][i] = s_costs[qp][-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 15) - 1);
16
         }
17
     }
18
 
19
x265_1.7.tar.gz/source/encoder/bitcost.h -> x265_1.8.tar.gz/source/encoder/bitcost.h Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "threading.h"
3
 #include "mv.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class BitCost
10
x265_1.7.tar.gz/source/encoder/dpb.cpp -> x265_1.8.tar.gz/source/encoder/dpb.cpp Changed
10
 
1
@@ -29,7 +29,7 @@
2
 
3
 #include "dpb.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 DPB::~DPB()
9
 {
10
x265_1.7.tar.gz/source/encoder/dpb.h -> x265_1.8.tar.gz/source/encoder/dpb.h Changed
10
 
1
@@ -26,7 +26,7 @@
2
 
3
 #include "piclist.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace for x265
8
 
9
 class Frame;
10
x265_1.7.tar.gz/source/encoder/encoder.cpp -> x265_1.8.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -39,21 +39,13 @@
2
 
3
 #include "x265.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
8
 }
9
 
10
-static const char* summaryCSVHeader =
11
-    "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
12
-    "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
13
-    "I count, I ave-QP, I kpbs, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
14
-    "P count, P ave-QP, P kpbs, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
15
-    "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
16
-    "Version\n";
17
-
18
 static const char* defaultAnalysisFileName = "x265_analysis.dat";
19
 
20
-using namespace x265;
21
+using namespace X265_NS;
22
 
23
 Encoder::Encoder()
24
 {
25
@@ -72,7 +64,6 @@
26
     m_exportedPic = NULL;
27
     m_numDelayedPic = 0;
28
     m_outputCount = 0;
29
-    m_csvfpt = NULL;
30
     m_param = NULL;
31
     m_latestParam = NULL;
32
     m_cuOffsetY = NULL;
33
@@ -103,7 +94,10 @@
34
 
35
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
36
     if (rows == 1 || cols < 3)
37
+    {
38
+        x265_log(p, X265_LOG_WARNING, "Too few rows/columns, --wpp disabled\n");
39
         p->bEnableWavefront = 0;
40
+    }
41
 
42
     bool allowPools = !p->numaPools || strcmp(p->numaPools, "none");
43
 
44
@@ -149,6 +143,12 @@
45
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = p->lookaheadSlices = 0;
46
     }
47
 
48
+    if (!p->bEnableWavefront && p->rc.vbvBufferSize)
49
+    {
50
+        x265_log(p, X265_LOG_ERROR, "VBV requires wavefront parallelism\n");
51
+        m_aborted = true;
52
+    }
53
+
54
     char buf[128];
55
     int len = 0;
56
     if (p->bEnableWavefront)
57
@@ -214,43 +214,6 @@
58
     initSPS(&m_sps);
59
     initPPS(&m_pps);
60
 
61
-    /* Try to open CSV file handle */
62
-    if (m_param->csvfn)
63
-    {
64
-        m_csvfpt = fopen(m_param->csvfn, "r");
65
-        if (m_csvfpt)
66
-        {
67
-            /* file already exists, re-open for append */
68
-            fclose(m_csvfpt);
69
-            m_csvfpt = fopen(m_param->csvfn, "ab");
70
-        }
71
-        else
72
-        {
73
-            /* new CSV file, write header */
74
-            m_csvfpt = fopen(m_param->csvfn, "wb");
75
-            if (m_csvfpt)
76
-            {
77
-                if (m_param->logLevel >= X265_LOG_FRAME)
78
-                {
79
-                    fprintf(m_csvfpt, "Encode Order, Type, POC, QP, Bits, ");
80
-                    if (m_param->rc.rateControlMode == X265_RC_CRF)
81
-                        fprintf(m_csvfpt, "RateFactor, ");
82
-                    fprintf(m_csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
83
-                    /* detailed performance statistics */
84
-                    fprintf(m_csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks\n");
85
-                }
86
-                else
87
-                    fputs(summaryCSVHeader, m_csvfpt);
88
-            }
89
-        }
90
-
91
-        if (!m_csvfpt)
92
-        {
93
-            x265_log(m_param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", m_param->csvfn);
94
-            m_aborted = true;
95
-        }
96
-    }
97
-
98
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
99
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
100
     for (int i = 0; i < m_param->frameNumThreads; i++)
101
@@ -362,8 +325,6 @@
102
 
103
     if (m_analysisFile)
104
         fclose(m_analysisFile);
105
-    if (m_csvfpt)
106
-        fclose(m_csvfpt);
107
 
108
     if (m_param)
109
     {
110
@@ -372,15 +333,14 @@
111
         free((char*)m_param->rc.statFileName);
112
         free((char*)m_param->analysisFileName);
113
         free((char*)m_param->scalingLists);
114
-        free((char*)m_param->csvfn);
115
         free((char*)m_param->numaPools);
116
         free((char*)m_param->masteringDisplayColorVolume);
117
         free((char*)m_param->contentLightLevelInfo);
118
 
119
-        x265_param_free(m_param);
120
+        PARAM_NS::x265_param_free(m_param);
121
     }
122
 
123
-    x265_param_free(m_latestParam);
124
+    PARAM_NS::x265_param_free(m_latestParam);
125
 }
126
 
127
 void Encoder::updateVbvPlan(RateControl* rc)
128
@@ -570,6 +530,7 @@
129
         if (outFrame)
130
         {
131
             Slice *slice = outFrame->m_encData->m_slice;
132
+            x265_frame_stats* frameData = NULL;
133
 
134
             /* Free up pic_in->analysisData since it has already been used */
135
             if (m_param->analysisMode == X265_ANALYSIS_LOAD)
136
@@ -582,6 +543,7 @@
137
                 pic_out->bitDepth = X265_DEPTH;
138
                 pic_out->userData = outFrame->m_userData;
139
                 pic_out->colorSpace = m_param->internalCsp;
140
+                frameData = &(pic_out->frameData);
141
 
142
                 pic_out->pts = outFrame->m_pts;
143
                 pic_out->dts = outFrame->m_dts;
144
@@ -648,7 +610,12 @@
145
             if (m_aborted)
146
                 return -1;
147
 
148
-            finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits);
149
+            finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits, frameData);
150
+
151
+            /* Write RateControl Frame level stats in multipass encodes */
152
+            if (m_param->rc.bStatWrite)
153
+                if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
154
+                    m_aborted = true;
155
 
156
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
157
             if (!pic_out)
158
@@ -729,7 +696,7 @@
159
                 m_aborted = true;
160
         }
161
         else if (m_encodedFrameNum)
162
-            m_rateControl->setFinalFrameCount(m_encodedFrameNum);
163
+            m_rateControl->setFinalFrameCount(m_encodedFrameNum); 
164
     }
165
     while (m_bZeroLatency && ++pass < 2);
166
 
167
@@ -787,38 +754,6 @@
168
     m_totalQp += aveQp;
169
 }
170
 
171
-char* Encoder::statsCSVString(EncStats& stat, char* buffer)
172
-{
173
-    if (!stat.m_numPics)
174
-    {
175
-        sprintf(buffer, "-, -, -, -, -, -, -, ");
176
-        return buffer;
177
-    }
178
-
179
-    double fps = (double)m_param->fpsNum / m_param->fpsDenom;
180
-    double scale = fps / 1000 / (double)stat.m_numPics;
181
-
182
-    int len = sprintf(buffer, "%-6u, ", stat.m_numPics);
183
-
184
-    len += sprintf(buffer + len, "%2.2lf, ", stat.m_totalQp / (double)stat.m_numPics);
185
-    len += sprintf(buffer + len, "%-8.2lf, ", stat.m_accBits * scale);
186
-    if (m_param->bEnablePsnr)
187
-    {
188
-        len += sprintf(buffer + len, "%.3lf, %.3lf, %.3lf, ",
189
-                       stat.m_psnrSumY / (double)stat.m_numPics,
190
-                       stat.m_psnrSumU / (double)stat.m_numPics,
191
-                       stat.m_psnrSumV / (double)stat.m_numPics);
192
-    }
193
-    else
194
-        len += sprintf(buffer + len, "-, -, -, ");
195
-
196
-    if (m_param->bEnableSsim)
197
-        sprintf(buffer + len, "%.3lf, ", x265_ssim2dB(stat.m_globalSsim / (double)stat.m_numPics));
198
-    else
199
-        sprintf(buffer + len, "-, ");
200
-    return buffer;
201
x265_1.7.tar.gz/source/encoder/encoder.h -> x265_1.8.tar.gz/source/encoder/encoder.h Changed
42
 
1
@@ -32,7 +32,7 @@
2
 
3
 struct x265_encoder {};
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 extern const char g_sliceTypeToChar[3];
9
 
10
@@ -105,7 +105,6 @@
11
     EncStats           m_analyzeI;
12
     EncStats           m_analyzeP;
13
     EncStats           m_analyzeB;
14
-    FILE*              m_csvfpt;
15
     int64_t            m_encodeStartTime;
16
 
17
     // weighted prediction
18
@@ -149,14 +148,10 @@
19
 
20
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
21
 
22
-    void writeLog(int argc, char **argv);
23
-
24
     void printSummary();
25
 
26
     char* statsString(EncStats&, char*);
27
 
28
-    char* statsCSVString(EncStats& stat, char* buffer);
29
-
30
     void configure(x265_param *param);
31
 
32
     void updateVbvPlan(RateControl* rc);
33
@@ -169,7 +164,7 @@
34
 
35
     void writeAnalysisFile(x265_analysis_data* pic);
36
 
37
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
38
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
39
 
40
 protected:
41
 
42
x265_1.7.tar.gz/source/encoder/entropy.cpp -> x265_1.8.tar.gz/source/encoder/entropy.cpp Changed
201
 
1
@@ -35,9 +35,7 @@
2
 #define CU_DQP_EG_k    0 // exp-golomb order
3
 #define START_VALUE    8 // start value for dpcm mode
4
 
5
-static const uint32_t g_puOffset[8] = { 0, 8, 4, 4, 2, 10, 1, 5 };
6
-
7
-namespace x265 {
8
+namespace X265_NS {
9
 
10
 Entropy::Entropy()
11
 {
12
@@ -216,7 +214,7 @@
13
         WRITE_FLAG(csp == X265_CSP_I420 || csp == X265_CSP_I400,                         "general_max_420chroma_constraint_flag");
14
         WRITE_FLAG(csp == X265_CSP_I400,                                                 "general_max_monochrome_constraint_flag");
15
         WRITE_FLAG(ptl.intraConstraintFlag,        "general_intra_constraint_flag");
16
-        WRITE_FLAG(0,                              "general_one_picture_only_constraint_flag");
17
+        WRITE_FLAG(ptl.onePictureOnlyConstraintFlag,"general_one_picture_only_constraint_flag");
18
         WRITE_FLAG(ptl.lowerBitRateConstraintFlag, "general_lower_bit_rate_constraint_flag");
19
         WRITE_CODE(0 , 16, "XXX_reserved_zero_35bits[0..15]");
20
         WRITE_CODE(0 , 16, "XXX_reserved_zero_35bits[16..31]");
21
@@ -862,12 +860,9 @@
22
 void Entropy::codePUWise(const CUData& cu, uint32_t absPartIdx)
23
 {
24
     X265_CHECK(!cu.isIntra(absPartIdx), "intra block not expected\n");
25
-    PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
26
-    uint32_t numPU = (partSize == SIZE_2Nx2N ? 1 : (partSize == SIZE_NxN ? 4 : 2));
27
-    uint32_t depth = cu.m_cuDepth[absPartIdx];
28
-    uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_unitSizeDepth - depth) * 2) >> 4;
29
+    uint32_t numPU = cu.getNumPartInter(absPartIdx);
30
 
31
-    for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += puOffset)
32
+    for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, absPartIdx))
33
     {
34
         codeMergeFlag(cu, subPartIdx);
35
         if (cu.m_mergeFlag[subPartIdx])
36
@@ -1433,6 +1428,55 @@
37
         encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
38
 }
39
 
40
+#if CHECKED_BUILD || _DEBUG
41
+uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero)
42
+{
43
+    uint32_t goRiceParam = 0;
44
+    int firstCoeff2 = 1;
45
+    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
46
+
47
+    uint32_t sum = 0;
48
+    int idx = 0;
49
+    do
50
+    {
51
+        int baseLevel = (baseLevelN & 3) | firstCoeff2;
52
+        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
53
+        baseLevelN >>= 2;
54
+        int codeNumber = absCoeff[idx] - baseLevel;
55
+
56
+        if (codeNumber >= 0)
57
+        {
58
+            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
59
+            uint32_t length = 0;
60
+
61
+            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
62
+            if (codeNumber >= 0)
63
+            {
64
+                {
65
+                    unsigned long cidx;
66
+                    CLZ(cidx, codeNumber + 1);
67
+                    length = cidx;
68
+                }
69
+                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
70
+
71
+                codeNumber = (length + length);
72
+            }
73
+            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
74
+
75
+            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
76
+                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
77
+            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
78
+        }
79
+        if (absCoeff[idx] >= 2)
80
+            firstCoeff2 = 0;
81
+        idx++;
82
+    }
83
+    while(idx < numNonZero);
84
+
85
+    return sum;
86
+}
87
+#endif // debug only code
88
+
89
 void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
90
 {
91
     uint32_t trSize = 1 << log2TrSize;
92
@@ -1440,7 +1484,7 @@
93
     // compute number of significant coefficients
94
     uint32_t numSig = primitives.cu[log2TrSize - 2].count_nonzero(coeff);
95
     X265_CHECK(numSig > 0, "cbf check fail\n");
96
-    bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled && !tqBypass;
97
+    bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled & !tqBypass;
98
 
99
     if (log2TrSize <= MAX_LOG2_TS_SIZE && !tqBypass && cu.m_slice->m_pps->bTransformSkipEnabled)
100
         codeTransformSkipFlags(cu.m_transformSkip[ttype][absPartIdx], ttype);
101
@@ -1489,9 +1533,11 @@
102
         if (codingParameters.scanType == SCAN_VER)
103
             std::swap(pos[0], pos[1]);
104
 
105
-        int ctxIdx = bIsLuma ? (3 * (log2TrSize - 2) + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
106
-        int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
107
+        int ctxIdx = bIsLuma ? (3 * (log2TrSize - 2) + (log2TrSize == 5)) : NUM_CTX_LAST_FLAG_XY_LUMA;
108
+        int ctxShift = (bIsLuma ? (log2TrSize > 2) : (log2TrSize - 2));
109
         uint32_t maxGroupIdx = (log2TrSize << 1) - 1;
110
+        X265_CHECK(((log2TrSize - 1) >> 2) == (uint32_t)(log2TrSize == 5), "ctxIdx check failure\n");
111
+        X265_CHECK((uint32_t)ctxShift == (bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2), "ctxShift check failure\n");
112
 
113
         uint8_t *ctx = &m_contextState[OFF_CTX_LAST_FLAG_X];
114
         for (uint32_t i = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
115
@@ -1519,12 +1565,12 @@
116
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
117
     uint32_t c1 = 1;
118
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
119
-    int absCoeff[1 << MLS_CG_SIZE];
120
-    int numNonZero = 1;
121
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
122
+    uint32_t numNonZero = 1;
123
     unsigned long lastNZPosInCG;
124
     unsigned long firstNZPosInCG;
125
 
126
-    absCoeff[0] = int(abs(coeff[posLast]));
127
+    absCoeff[0] = (uint16_t)abs(coeff[posLast]);
128
 
129
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
130
     {
131
@@ -1540,7 +1586,7 @@
132
 
133
         // encode significant_coeffgroup_flag
134
         const int cgBlkPos = codingParameters.scanCG[subSet];
135
-        const int cgPosY   = cgBlkPos >> (log2TrSize - MLS_CG_LOG2_SIZE);
136
+        const int cgPosY   = (uint32_t)cgBlkPos >> (log2TrSize - MLS_CG_LOG2_SIZE);
137
         const int cgPosX   = cgBlkPos & ((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1);
138
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
139
 
140
@@ -1554,21 +1600,14 @@
141
         }
142
 
143
         // encode significant_coeff_flag
144
-        if (sigCoeffGroupFlag64 & cgBlkPosMask)
145
+        if ((scanPosSigOff >= 0) && (sigCoeffGroupFlag64 & cgBlkPosMask))
146
         {
147
             X265_CHECK((log2TrSize != 2) || (log2TrSize == 2 && subSet == 0), "log2TrSize and subSet mistake!\n");
148
             const int patternSigCtx = Quant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, (trSize >> MLS_CG_LOG2_SIZE));
149
             const uint32_t posOffset = (bIsLuma && subSet) ? 3 : 0;
150
 
151
-            static const uint8_t ctxIndMap4x4[16] =
152
-            {
153
-                0, 1, 4, 5,
154
-                2, 3, 4, 5,
155
-                6, 6, 8, 8,
156
-                7, 7, 8, 8
157
-            };
158
             // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
159
-            static const uint8_t table_cnt[4][SCAN_SET_SIZE] =
160
+            static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
161
             {
162
                 // patternSigCtx = 0
163
                 {
164
@@ -1597,50 +1636,61 @@
165
                     2, 2, 2, 2,
166
                     2, 2, 2, 2,
167
                     2, 2, 2, 2,
168
+                },
169
+                // 4x4
170
+                {
171
+                    0, 1, 4, 5,
172
+                    2, 3, 4, 5,
173
+                    6, 6, 8, 8,
174
+                    7, 7, 8, 8
175
                 }
176
             };
177
 
178
             const int offset = codingParameters.firstSignificanceMapContext;
179
-            ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
180
-            // TODO: accelerate by PABSW
181
             const uint32_t blkPosBase  = codingParameters.scan[subPosBase];
182
-            for (int i = 0; i < MLS_CG_SIZE; i++)
183
-            {
184
-                tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
185
-                tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
186
-                tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
187
-                tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
188
-            }
189
 
190
+            X265_CHECK(scanPosSigOff >= 0, "scanPosSigOff check failure\n");
191
             if (m_bitIf)
192
             {
193
+                ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
194
+
195
+                // TODO: accelerate by PABSW
196
+                for (int i = 0; i < MLS_CG_SIZE; i++)
197
+                {
198
+                    tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
199
+                    tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
200
+                    tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
201
x265_1.7.tar.gz/source/encoder/entropy.h -> x265_1.8.tar.gz/source/encoder/entropy.h Changed
10
 
1
@@ -31,7 +31,7 @@
2
 #include "contexts.h"
3
 #include "slice.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 struct SaoCtuParam;
10
x265_1.7.tar.gz/source/encoder/frameencoder.cpp -> x265_1.8.tar.gz/source/encoder/frameencoder.cpp Changed
201
 
1
@@ -35,7 +35,7 @@
2
 #include "slicetype.h"
3
 #include "nal.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
8
 
9
 FrameEncoder::FrameEncoder()
10
@@ -59,7 +59,6 @@
11
     m_cuGeoms = NULL;
12
     m_ctuGeomMap = NULL;
13
     m_localTldIdx = 0;
14
-    memset(&m_frameStats, 0, sizeof(m_frameStats));
15
     memset(&m_rce, 0, sizeof(RateControlEntry));
16
 }
17
 
18
@@ -313,7 +312,7 @@
19
     m_SSDY = m_SSDU = m_SSDV = 0;
20
     m_ssim = 0;
21
     m_ssimCnt = 0;
22
-    memset(&m_frameStats, 0, sizeof(m_frameStats));
23
+    memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
24
 
25
     /* Emit access unit delimiter unless this is the first frame and the user is
26
      * not repeating headers (since AUD is supposed to be the first NAL in the access
27
@@ -419,25 +418,6 @@
28
 
29
             m_top->m_lastBPSEI = m_rce.encodeOrder;
30
         }
31
-
32
-        // The recovery point SEI message assists a decoder in determining when the decoding
33
-        // process will produce acceptable pictures for display after the decoder initiates
34
-        // random access. The m_recoveryPocCnt is in units of POC(picture order count) which
35
-        // means pictures encoded after the CRA but precede it in display order(leading) are
36
-        // implicitly discarded after a random access seek regardless of the value of
37
-        // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA,
38
-        // so all pictures following the CRA in POC order are guaranteed to be displayable,
39
-        // so m_recoveryPocCnt is always 0.
40
-        SEIRecoveryPoint sei_recovery_point;
41
-        sei_recovery_point.m_recoveryPocCnt = 0;
42
-        sei_recovery_point.m_exactMatchingFlag = true;
43
-        sei_recovery_point.m_brokenLinkFlag = false;
44
-
45
-        m_bs.resetBits();
46
-        sei_recovery_point.write(m_bs, *slice->m_sps);
47
-        m_bs.writeByteAlignment();
48
-
49
-        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
50
     }
51
 
52
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
53
@@ -475,6 +455,19 @@
54
         m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
55
     }
56
 
57
+    /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
58
+     * tune RateControl parameters for other frames.
59
+     * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
60
+     * RateControlEnd here, after the slicecontexts are initialized. For the rest - ABR
61
+     * and VBV, unlock only after rateControlUpdateStats of this frame is called */
62
+    if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
63
+    {
64
+        m_top->m_rateControl->m_startEndOrder.incr();
65
+
66
+        if (m_rce.encodeOrder < m_param->frameNumThreads - 1)
67
+            m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
68
+    }
69
+
70
     /* Analyze CTU rows, most of the hard work is done here.  Frame is
71
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
72
      * filters runs behind the CTU compression and reconstruction */
73
@@ -559,17 +552,56 @@
74
         // accumulate intra,inter,skip cu count per frame for 2 pass
75
         for (uint32_t i = 0; i < m_numRows; i++)
76
         {
77
-            m_frameStats.mvBits    += m_rows[i].rowStats.mvBits;
78
-            m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
79
-            m_frameStats.miscBits  += m_rows[i].rowStats.miscBits;
80
-            totalI                 += m_rows[i].rowStats.iCuCnt;
81
-            totalP                 += m_rows[i].rowStats.pCuCnt;
82
-            totalSkip              += m_rows[i].rowStats.skipCuCnt;
83
+            m_frame->m_encData->m_frameStats.mvBits    += m_rows[i].rowStats.mvBits;
84
+            m_frame->m_encData->m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
85
+            m_frame->m_encData->m_frameStats.miscBits  += m_rows[i].rowStats.miscBits;
86
+            totalI                                     += m_rows[i].rowStats.intra8x8Cnt;
87
+            totalP                                     += m_rows[i].rowStats.inter8x8Cnt;
88
+            totalSkip                                  += m_rows[i].rowStats.skip8x8Cnt;
89
         }
90
         int totalCuCount = totalI + totalP + totalSkip;
91
-        m_frameStats.percentIntra = (double)totalI / totalCuCount;
92
-        m_frameStats.percentInter = (double)totalP / totalCuCount;
93
-        m_frameStats.percentSkip  = (double)totalSkip / totalCuCount;
94
+        m_frame->m_encData->m_frameStats.percent8x8Intra = (double)totalI / totalCuCount;
95
+        m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
96
+        m_frame->m_encData->m_frameStats.percent8x8Skip  = (double)totalSkip / totalCuCount;
97
+    }
98
+    for (uint32_t i = 0; i < m_numRows; i++)
99
+    {
100
+        m_frame->m_encData->m_frameStats.cntIntraNxN      += m_rows[i].rowStats.cntIntraNxN;
101
+        m_frame->m_encData->m_frameStats.totalCu          += m_rows[i].rowStats.totalCu;
102
+        m_frame->m_encData->m_frameStats.totalCtu         += m_rows[i].rowStats.totalCtu;
103
+        m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
104
+        m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
105
+        m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
106
+        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
107
+
108
+        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
109
+            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
110
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
111
+        {
112
+            m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
113
+            m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth];
114
+            for (int m = 0; m < INTER_MODES; m++)
115
+                m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
116
+            for (int n = 0; n < INTRA_MODES; n++)
117
+                m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n];
118
+        }
119
+    }
120
+    m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
121
+    m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
122
+    m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
123
+    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
124
+    m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
125
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
126
+    {
127
+        m_frame->m_encData->m_frameStats.percentSkipCu[depth]  = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
128
+        m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
129
+        for (int n = 0; n < INTRA_MODES; n++)
130
+            m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
131
+        uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts
132
+        cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2];
133
+        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
134
+        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
135
+        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu;
136
     }
137
 
138
     m_bs.resetBits();
139
@@ -638,7 +670,7 @@
140
     m_endCompressTime = x265_mdate();
141
 
142
     /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
143
-    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
144
+    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce) < 0)
145
         m_top->m_aborted = true;
146
 
147
     /* Decrement referenced frame reference counts, allow them to be recycled */
148
@@ -826,13 +858,6 @@
149
     const uint32_t lineStartCUAddr = row * numCols;
150
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
151
 
152
-    /* These store the count of inter, intra and skip cus within quad tree structure of each CTU */
153
-    uint32_t qTreeInterCnt[NUM_CU_DEPTH];
154
-    uint32_t qTreeIntraCnt[NUM_CU_DEPTH];
155
-    uint32_t qTreeSkipCnt[NUM_CU_DEPTH];
156
-    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
157
-        qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
158
-
159
     while (curRow.completed < numCols)
160
     {
161
         ProfileScopeEvent(encodeCTU);
162
@@ -904,30 +929,57 @@
163
         // Completed CU processing
164
         curRow.completed++;
165
 
166
-        if (m_param->bLogCuStats || m_param->rc.bStatWrite)
167
-            curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, qTreeInterCnt, qTreeIntraCnt, qTreeSkipCnt);
168
-        else if (m_param->rc.aqMode)
169
-            curEncData.m_rowStat[row].sumQpAq += calcCTUQP(*ctu);
170
+        FrameStats frameLog;
171
+        curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
172
 
173
         // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
174
         if (m_param->rc.bStatWrite)
175
         {
176
-            curRow.rowStats.mvBits += best.mvBits;
177
+            curRow.rowStats.mvBits    += best.mvBits;
178
             curRow.rowStats.coeffBits += best.coeffBits;
179
-            curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
180
+            curRow.rowStats.miscBits  += best.totalBits - (best.mvBits + best.coeffBits);
181
 
182
             for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
183
             {
184
                 /* 1 << shift == number of 8x8 blocks at current depth */
185
                 int shift = 2 * (g_maxCUDepth - depth);
186
-                curRow.rowStats.iCuCnt += qTreeIntraCnt[depth] << shift;
187
-                curRow.rowStats.pCuCnt += qTreeInterCnt[depth] << shift;
188
-                curRow.rowStats.skipCuCnt += qTreeSkipCnt[depth] << shift;
189
+                int cuSize = g_maxCUSize >> depth;
190
 
191
-                // clear the row cu data from thread local object
192
-                qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
193
+                if (cuSize == 8)
194
+                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
195
+                else
196
+                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] << shift);
197
+
198
+                curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
199
+                curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
200
             }
201
x265_1.7.tar.gz/source/encoder/frameencoder.h -> x265_1.8.tar.gz/source/encoder/frameencoder.h Changed
38
 
1
@@ -41,7 +41,7 @@
2
 #include "reference.h"
3
 #include "nal.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class ThreadPool;
10
@@ -49,8 +49,6 @@
11
 
12
 #define ANGULAR_MODE_ID 2
13
 #define AMP_ID 3
14
-#define INTER_MODES 4
15
-#define INTRA_MODES 3
16
 
17
 struct StatisticLog
18
 {
19
@@ -156,8 +154,6 @@
20
     MD5Context               m_state[3];
21
     uint32_t                 m_crc[3];
22
     uint32_t                 m_checksum[3];
23
-    StatisticLog             m_sliceTypeLog[3];     // per-slice type CU statistics
24
-    FrameStats               m_frameStats;          // stats of current frame for multi-pass encodes
25
 
26
     volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
27
     volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
28
@@ -221,8 +217,7 @@
29
     void encodeSlice();
30
 
31
     void threadMain();
32
-    int  collectCTUStatistics(const CUData& ctu, uint32_t* qtreeInterCnt, uint32_t* qtreeIntraCnt, uint32_t* qtreeSkipCnt);
33
-    int  calcCTUQP(const CUData& ctu);
34
+    int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
35
     void noiseReductionUpdate();
36
 
37
     /* Called by WaveFront::findJob() */
38
x265_1.7.tar.gz/source/encoder/framefilter.cpp -> x265_1.8.tar.gz/source/encoder/framefilter.cpp Changed
10
 
1
@@ -30,7 +30,7 @@
2
 #include "frameencoder.h"
3
 #include "wavefront.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
9
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
10
x265_1.7.tar.gz/source/encoder/framefilter.h -> x265_1.8.tar.gz/source/encoder/framefilter.h Changed
10
 
1
@@ -30,7 +30,7 @@
2
 #include "deblock.h"
3
 #include "sao.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class Encoder;
10
x265_1.7.tar.gz/source/encoder/level.cpp -> x265_1.8.tar.gz/source/encoder/level.cpp Changed
201
 
1
@@ -25,7 +25,7 @@
2
 #include "slice.h"
3
 #include "level.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 typedef struct
8
 {
9
     uint32_t maxLumaSamples;
10
@@ -61,18 +61,37 @@
11
 /* determine minimum decoder level required to decode the described video */
12
 void determineLevel(const x265_param &param, VPS& vps)
13
 {
14
+    vps.ptl.onePictureOnlyConstraintFlag = param.totalFrames == 1;
15
+    vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag;
16
+    vps.ptl.bitDepthConstraint = param.internalBitDepth;
17
+    vps.ptl.chromaFormatConstraint = param.internalCsp;
18
+
19
+    /* TODO: figure out HighThroughput signaling, aka: HbrFactor in section A.4.2, only available
20
+     * for intra-only profiles (vps.ptl.intraConstraintFlag) */
21
+    vps.ptl.lowerBitRateConstraintFlag = true;
22
+
23
     vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
24
-    if (param.internalCsp == X265_CSP_I420)
25
+    
26
+    if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
27
     {
28
-        if (param.internalBitDepth == 8)
29
+        /* Probably an HEVC v1 profile, but must check to be sure */
30
+        if (param.internalBitDepth <= 8)
31
         {
32
-            if (param.keyframeMax == 1 && param.maxNumReferences == 1)
33
+            if (vps.ptl.onePictureOnlyConstraintFlag)
34
                 vps.ptl.profileIdc = Profile::MAINSTILLPICTURE;
35
+            else if (vps.ptl.intraConstraintFlag)
36
+                vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */
37
             else 
38
                 vps.ptl.profileIdc = Profile::MAIN;
39
         }
40
-        else if (param.internalBitDepth == 10)
41
-            vps.ptl.profileIdc = Profile::MAIN10;
42
+        else if (param.internalBitDepth <= 10)
43
+        {
44
+            /* note there is no 10bit still picture profile */
45
+            if (vps.ptl.intraConstraintFlag)
46
+                vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */
47
+            else
48
+                vps.ptl.profileIdc = Profile::MAIN10;
49
+        }
50
     }
51
     else
52
         vps.ptl.profileIdc = Profile::MAINREXT;
53
@@ -162,17 +181,19 @@
54
             return;
55
         }
56
 
57
-#define CHECK_RANGE(value, main, high) (value > main && value <= high)
58
+#define CHECK_RANGE(value, main, high) (high != MAX_UINT && value > main && value <= high)
59
 
60
-        if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) &&
61
-            CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh) &&
62
-            levels[i].maxBitrateHigh != MAX_UINT)
63
+        if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) ||
64
+            CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh))
65
         {
66
-            /* If the user has not enabled high tier, continue looking to see if we can encode at a higher level, main tier */
67
-            if (!param.bHighTier && (levels[i].levelIdc < param.levelIdc))
68
-                continue;
69
-            else
70
+            /* The bitrate or buffer size are out of range for Main tier, but in
71
+             * range for High tier. If the user requested High tier then give
72
+             * them High tier at this level.  Otherwise allow the loop to
73
+             * progress to the Main tier of the next level */
74
+            if (param.bHighTier)
75
                 vps.ptl.tierFlag = Level::HIGH;
76
+            else
77
+                continue;
78
         }
79
         else
80
             vps.ptl.tierFlag = Level::MAIN;
81
@@ -184,29 +205,68 @@
82
         break;
83
     }
84
 
85
-    vps.ptl.intraConstraintFlag = false;
86
-    vps.ptl.lowerBitRateConstraintFlag = true;
87
-    vps.ptl.bitDepthConstraint = param.internalBitDepth;
88
-    vps.ptl.chromaFormatConstraint = param.internalCsp;
89
-    
90
     static const char *profiles[] = { "None", "Main", "Main 10", "Main Still Picture", "RExt" };
91
     static const char *tiers[]    = { "Main", "High" };
92
 
93
-    const char *profile = profiles[vps.ptl.profileIdc];
94
+    char profbuf[64];
95
+    strcpy(profbuf, profiles[vps.ptl.profileIdc]);
96
+
97
+    bool bStillPicture = false;
98
     if (vps.ptl.profileIdc == Profile::MAINREXT)
99
     {
100
-        if (param.internalCsp == X265_CSP_I422)
101
-            profile = "Main 4:2:2 10";
102
-        if (param.internalCsp == X265_CSP_I444)
103
+        if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag)
104
+        {
105
+            if (vps.ptl.onePictureOnlyConstraintFlag)
106
+            {
107
+                strcpy(profbuf, "Main 4:4:4 16 Still Picture");
108
+                bStillPicture = true;
109
+            }
110
+            else
111
+                strcpy(profbuf, "Main 4:4:4 16");
112
+        }
113
+        else if (param.internalCsp == X265_CSP_I420)
114
+        {
115
+            X265_CHECK(vps.ptl.intraConstraintFlag || vps.ptl.bitDepthConstraint > 10, "rext fail\n");
116
+            if (vps.ptl.bitDepthConstraint <= 8)
117
+                strcpy(profbuf, "Main");
118
+            else if (vps.ptl.bitDepthConstraint <= 10)
119
+                strcpy(profbuf, "Main 10");
120
+            else if (vps.ptl.bitDepthConstraint <= 12)
121
+                strcpy(profbuf, "Main 12");
122
+        }
123
+        else if (param.internalCsp == X265_CSP_I422)
124
+        {
125
+            /* there is no Main 4:2:2 profile, so it must be signaled as Main10 4:2:2 */
126
+            if (param.internalBitDepth <= 10)
127
+                strcpy(profbuf, "Main 4:2:2 10");
128
+            else if (vps.ptl.bitDepthConstraint <= 12)
129
+                strcpy(profbuf, "Main 4:2:2 12");
130
+        }
131
+        else if (param.internalCsp == X265_CSP_I444)
132
         {
133
             if (vps.ptl.bitDepthConstraint <= 8)
134
-                profile = "Main 4:4:4 8";
135
+            {
136
+                if (vps.ptl.onePictureOnlyConstraintFlag)
137
+                {
138
+                    strcpy(profbuf, "Main 4:4:4 Still Picture");
139
+                    bStillPicture = true;
140
+                }
141
+                else
142
+                    strcpy(profbuf, "Main 4:4:4");
143
+            }
144
             else if (vps.ptl.bitDepthConstraint <= 10)
145
-                profile = "Main 4:4:4 10";
146
+                strcpy(profbuf, "Main 4:4:4 10");
147
+            else if (vps.ptl.bitDepthConstraint <= 12)
148
+                strcpy(profbuf, "Main 4:4:4 12");
149
         }
150
+        else
151
+            strcpy(profbuf, "Unknown");
152
+
153
+        if (vps.ptl.intraConstraintFlag && !bStillPicture)
154
+            strcat(profbuf, " Intra");
155
     }
156
     x265_log(&param, X265_LOG_INFO, "%s profile, Level-%s (%s tier)\n",
157
-             profile, levels[i].name, tiers[vps.ptl.tierFlag]);
158
+             profbuf, levels[i].name, tiers[vps.ptl.tierFlag]);
159
 }
160
 
161
 /* enforce a maximum decoder level requirement, in other words assure that a
162
@@ -340,80 +400,88 @@
163
 
164
     return true;
165
 }
166
+}
167
+
168
+#if EXPORT_C_API
169
+
170
+/* these functions are exported as C functions (default) */
171
+using namespace X265_NS;
172
+extern "C" {
173
+
174
+#else
175
+
176
+/* these functions exist within private namespace (multilib) */
177
+namespace X265_NS {
178
+
179
+#endif
180
 
181
-extern "C"
182
 int x265_param_apply_profile(x265_param *param, const char *profile)
183
 {
184
     if (!param || !profile)
185
         return 0;
186
 
187
-#if HIGH_BIT_DEPTH
188
-    if (!strcmp(profile, "main") || !strcmp(profile, "mainstillpicture") || !strcmp(profile, "msp") || !strcmp(profile, "main444-8"))
189
-    {
190
-        x265_log(param, X265_LOG_ERROR, "%s profile not supported, compiled for Main10.\n", profile);
191
-        return -1;
192
-    }
193
-#else
194
-    if (!strcmp(profile, "main10") || !strcmp(profile, "main422-10") || !strcmp(profile, "main444-10"))
195
-    {
196
-        x265_log(param, X265_LOG_ERROR, "%s profile not supported, compiled for Main.\n", profile);
197
-        return -1;
198
-    }
199
+    /* Check if profile bit-depth requirement is exceeded by internal bit depth */
200
+    bool bInvalidDepth = false;
201
x265_1.7.tar.gz/source/encoder/level.h -> x265_1.8.tar.gz/source/encoder/level.h Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "x265.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // encoder private namespace
8
 
9
 struct VPS;
10
x265_1.7.tar.gz/source/encoder/motion.cpp -> x265_1.8.tar.gz/source/encoder/motion.cpp Changed
125
 
1
@@ -31,7 +31,7 @@
2
 #pragma warning(disable: 4127) // conditional  expression is constant (macros use this construct)
3
 #endif
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 namespace {
9
 
10
@@ -56,7 +56,7 @@
11
     { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
12
 };
13
 
14
-int sizeScale[NUM_PU_SIZES];
15
+static int sizeScale[NUM_PU_SIZES];
16
 #define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
17
 
18
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
19
@@ -234,14 +234,9 @@
20
                pix_base + (m1x) + (m1y) * stride, \
21
                pix_base + (m2x) + (m2y) * stride, \
22
                stride, costs); \
23
-        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \
24
-        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \
25
-        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \
26
-        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \
27
-        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \
28
-        (costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \
29
-        (costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \
30
-        (costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \
31
+        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
32
+        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
33
+        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
34
     }
35
 
36
 #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
37
@@ -271,16 +266,10 @@
38
                pix_base + (m2x) + (m2y) * stride, \
39
                pix_base + (m3x) + (m3y) * stride, \
40
                stride, costs); \
41
-        const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \
42
-        const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \
43
-        X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
44
-        X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
45
-        X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
46
-        X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
47
-        costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
48
-        costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
49
-        costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
50
-        costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
51
+        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
52
+        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
53
+        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
54
+        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
55
         COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
56
         COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
57
         COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
58
@@ -296,17 +285,10 @@
59
                pix_base + (m2x) + (m2y) * stride, \
60
                pix_base + (m3x) + (m3y) * stride, \
61
                stride, costs); \
62
-        /* TODO: use restrict keyword in ICL */ \
63
-        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \
64
-        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \
65
-        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
66
-        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
67
-        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
68
-        X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
69
-        (costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
70
-        (costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
71
-        (costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
72
-        (costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
73
+        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
74
+        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
75
+        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
76
+        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
77
     }
78
 
79
 #define DIA1_ITER(mx, my) \
80
@@ -639,36 +621,18 @@
81
         }
82
     }
83
 
84
+    X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
85
     // measure SAD cost at each QPEL motion vector candidate
86
-    if (ref->isLowres)
87
-    {
88
-        for (int i = 0; i < numCandidates; i++)
89
-        {
90
-            MV m = mvc[i].clipped(qmvmin, qmvmax);
91
-            if (m.notZero() && m != pmv && m != bestpre) // check already measured
92
-            {
93
-                int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
94
-                if (cost < bprecost)
95
-                {
96
-                    bprecost = cost;
97
-                    bestpre = m;
98
-                }
99
-            }
100
-        }
101
-    }
102
-    else
103
+    for (int i = 0; i < numCandidates; i++)
104
     {
105
-        for (int i = 0; i < numCandidates; i++)
106
+        MV m = mvc[i].clipped(qmvmin, qmvmax);
107
+        if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
108
         {
109
-            MV m = mvc[i].clipped(qmvmin, qmvmax);
110
-            if (m.notZero() && m != pmv && m != bestpre) // check already measured
111
+            int cost = subpelCompare(ref, m, sad) + mvcost(m);
112
+            if (cost < bprecost)
113
             {
114
-                int cost = subpelCompare(ref, m, sad) + mvcost(m);
115
-                if (cost < bprecost)
116
-                {
117
-                    bprecost = cost;
118
-                    bestpre = m;
119
-                }
120
+                bprecost = cost;
121
+                bestpre = m;
122
             }
123
         }
124
     }
125
x265_1.7.tar.gz/source/encoder/motion.h -> x265_1.8.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -30,7 +30,7 @@
2
 #include "bitcost.h"
3
 #include "yuv.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class MotionEstimate : public BitCost
10
x265_1.7.tar.gz/source/encoder/nal.cpp -> x265_1.8.tar.gz/source/encoder/nal.cpp Changed
10
 
1
@@ -25,7 +25,7 @@
2
 #include "bitstream.h"
3
 #include "nal.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 NALList::NALList()
9
     : m_numNal(0)
10
x265_1.7.tar.gz/source/encoder/nal.h -> x265_1.8.tar.gz/source/encoder/nal.h Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "x265.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class Bitstream;
10
x265_1.7.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.8.tar.gz/source/encoder/ratecontrol.cpp Changed
201
 
1
@@ -37,7 +37,7 @@
2
 #define BR_SHIFT  6
3
 #define CPB_SHIFT 4
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 /* Amortize the partial cost of I frames over the next N frames */
9
 
10
@@ -181,6 +181,8 @@
11
     m_bTerminated = false;
12
     m_finalFrameCount = 0;
13
     m_numEntries = 0;
14
+    m_isSceneTransition = false;
15
+    m_lastPredictorReset = 0;
16
     if (m_param->rc.rateControlMode == X265_RC_CRF)
17
     {
18
         m_param->rc.qp = (int)m_param->rc.rfConstant;
19
@@ -273,7 +275,6 @@
20
     if(m_param->rc.bStrictCbr)
21
         m_rateTolerance = 0.7;
22
 
23
-    m_leadingBframes = m_param->bframes;
24
     m_bframeBits = 0;
25
     m_leadingNoBSatd = 0;
26
     m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor);
27
@@ -282,6 +283,7 @@
28
     /* Adjust the first frame in order to stabilize the quality level compared to the rest */
29
 #define ABR_INIT_QP_MIN (24)
30
 #define ABR_INIT_QP_MAX (40)
31
+#define ABR_SCENECUT_INIT_QP_MIN (12)
32
 #define CRF_INIT_QP (int)m_param->rc.rfConstant
33
     for (int i = 0; i < 3; i++)
34
         m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
35
@@ -369,20 +371,8 @@
36
     m_accumPNorm = .01;
37
     m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm;
38
 
39
-    /* Frame Predictors and Row predictors used in vbv */
40
-    for (int i = 0; i < 4; i++)
41
-    {
42
-        m_pred[i].coeff = 1.0;
43
-        m_pred[i].count = 1.0;
44
-        m_pred[i].decay = 0.5;
45
-        m_pred[i].offset = 0.0;
46
-    }
47
-    m_pred[0].coeff = m_pred[3].coeff = 0.75;
48
-    if (m_param->rc.qCompress >= 0.8) // when tuned for grain 
49
-    {
50
-        m_pred[1].coeff = 0.75;
51
-        m_pred[0].coeff = m_pred[3].coeff = 0.50;
52
-    }
53
+    /* Frame Predictors used in vbv */
54
+    initFramePredictors();
55
     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
56
     {
57
         /* If the user hasn't defined the stat filename, use the default value */
58
@@ -931,6 +921,24 @@
59
         return X265_TYPE_AUTO;
60
 }
61
 
62
+void RateControl::initFramePredictors()
63
+{
64
+    /* Frame Predictors used in vbv */
65
+    for (int i = 0; i < 4; i++)
66
+    {
67
+        m_pred[i].coeff = 1.0;
68
+        m_pred[i].count = 1.0;
69
+        m_pred[i].decay = 0.5;
70
+        m_pred[i].offset = 0.0;
71
+    }
72
+    m_pred[0].coeff = m_pred[3].coeff = 0.75;
73
+    if (m_param->rc.qCompress >= 0.8) // when tuned for grain 
74
+    {
75
+        m_pred[1].coeff = 0.75;
76
+        m_pred[0].coeff = m_pred[3].coeff = 0.50;
77
+    }
78
+}
79
+
80
 int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc)
81
 {
82
     int orderValue = m_startEndOrder.get();
83
@@ -960,10 +968,20 @@
84
         copyRceData(rce, &m_rce2Pass[rce->poc]);
85
     }
86
     rce->isActive = true;
87
-    if (m_sliceType == B_SLICE)
88
-        rce->bframes = m_leadingBframes;
89
-    else
90
-        m_leadingBframes = curFrame->m_lowres.leadingBframes;
91
+    bool isRefFrameScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refPicList[0][0]->m_lowres.bScenecut == 1;
92
+    if (curFrame->m_lowres.bScenecut)
93
+    {
94
+        m_isSceneTransition = true;
95
+        m_lastPredictorReset = rce->encodeOrder;
96
+        initFramePredictors();
97
+    }
98
+    else if (m_sliceType != B_SLICE && !isRefFrameScenecut)
99
+        m_isSceneTransition = false;
100
+
101
+    if (rce->encodeOrder < m_lastPredictorReset + m_param->frameNumThreads)
102
+    {
103
+        rce->rowPreds[0][0].count = 0;
104
+    }
105
 
106
     rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame;
107
     rce->bufferRate = m_bufferRate;
108
@@ -1040,6 +1058,10 @@
109
                 }
110
             }
111
         }
112
+        /* For a scenecut that occurs within the mini-gop, enable scene transition
113
+         * switch until the next mini-gop to ensure a min qp for all the frames within 
114
+         * the scene-transition mini-gop */
115
+
116
         double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce));
117
         q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
118
         m_qp = int(q + 0.5);
119
@@ -1087,18 +1109,6 @@
120
     }
121
     m_framesDone++;
122
 
123
-    /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
124
-     * tune RateControl parameters for other frames.
125
-     * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
126
-     * RateControlEnd here.those modes here. For the rest - ABR
127
-     * and VBV, unlock only after rateControlUpdateStats of this frame is called */
128
-    if (m_param->rc.rateControlMode != X265_RC_ABR && !m_isVbv)
129
-    {
130
-        m_startEndOrder.incr();
131
-
132
-        if (rce->encodeOrder < m_param->frameNumThreads - 1)
133
-            m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
134
-    }
135
     return m_qp;
136
 }
137
 
138
@@ -1394,6 +1404,13 @@
139
         else
140
             q += m_pbOffset;
141
 
142
+        /* Set a min qp at scenechanges and transitions */
143
+        if (m_isSceneTransition)
144
+        {
145
+            q = X265_MAX(ABR_SCENECUT_INIT_QP_MIN, q);
146
+            double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN); 
147
+            m_lastQScaleFor[P_SLICE] = X265_MAX(minScenecutQscale, m_lastQScaleFor[P_SLICE]);
148
+        }
149
         double qScale = x265_qp2qScale(q);
150
         rce->qpNoVbv = q;
151
         double lmin = 0, lmax = 0;
152
@@ -1556,11 +1573,19 @@
153
                 q = X265_MIN(lqmax, q);
154
             }
155
             q = x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
156
+            /* Set a min qp at scenechanges and transitions */
157
+            if (m_isSceneTransition)
158
+            {
159
+               double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN); 
160
+               q = X265_MAX(minScenecutQscale, q);
161
+               m_lastQScaleFor[P_SLICE] = X265_MAX(minScenecutQscale, m_lastQScaleFor[P_SLICE]);
162
+            }
163
             rce->qpNoVbv = x265_qScale2qp(q);
164
             q = clipQscale(curFrame, rce, q);
165
             /*  clip qp to permissible range after vbv-lookahead estimation to avoid possible
166
-             * mispredictions by initial frame size predictors */
167
-            if (!m_2pass && m_isVbv && m_pred[m_predType].count == 1)
168
+             * mispredictions by initial frame size predictors, after each scenecut */
169
+            bool isFrameAfterScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refPicList[0][0]->m_lowres.bScenecut;
170
+            if (!m_2pass && m_isVbv && isFrameAfterScenecut)
171
                 q = x265_clip3(lqmin, lqmax, q);
172
         }
173
         m_lastQScaleFor[m_sliceType] = q;
174
@@ -1762,7 +1787,7 @@
175
                 }
176
                 /* Try to get the buffer not more than 80% filled, but don't set an impossible goal. */
177
                 targetFill = x265_clip3(m_bufferSize * (1 - 0.2 * finalDur), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
178
-                if (m_isCbr && bufferFillCur > targetFill)
179
+                if (m_isCbr && bufferFillCur > targetFill && !m_isSceneTransition)
180
                 {
181
                     q /= 1.01;
182
                     loopTerminate |= 2;
183
@@ -1904,6 +1929,7 @@
184
             else if (picType == P_SLICE)
185
             {
186
                 intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].diagIntraSatd;
187
+                intraCostForPendingCus >>= X265_DEPTH - 8;
188
                 /* Our QP is lower than the reference! */
189
                 double pred_intra = predictSize(rce->rowPred[1], qScale, intraCostForPendingCus);
190
                 /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
191
@@ -1939,7 +1965,7 @@
192
             uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd;
193
             if (row == 1)
194
                 intraRowSatdCost += curEncData.m_rowStat[0].diagIntraSatd;
195
-
196
+            intraRowSatdCost >>= X265_DEPTH - 8;
197
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
198
         }
199
     }
200
@@ -2130,7 +2156,7 @@
201
x265_1.7.tar.gz/source/encoder/ratecontrol.h -> x265_1.8.tar.gz/source/encoder/ratecontrol.h Changed
74
 
1
@@ -29,7 +29,7 @@
2
 #include "common.h"
3
 #include "sei.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // encoder namespace
8
 
9
 class Encoder;
10
@@ -46,23 +46,6 @@
11
 #define MIN_AMORTIZE_FRACTION 0.2
12
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
13
 
14
-/* Current frame stats for 2 pass */
15
-struct FrameStats
16
-{
17
-    int         mvBits;    /* MV bits (MV+Ref+Block Type) */
18
-    int         coeffBits; /* Texture bits (DCT coefs) */
19
-    int         miscBits;
20
-
21
-    int         iCuCnt;
22
-    int         pCuCnt;
23
-    int         skipCuCnt;
24
-    
25
-    /* CU type counts stored as percentage */
26
-    double      percentIntra;
27
-    double      percentInter;
28
-    double      percentSkip;
29
-};
30
-
31
 struct Predictor
32
 {
33
     double coeff;
34
@@ -164,7 +147,6 @@
35
     double  m_pbOffset;
36
     int64_t m_bframeBits;
37
     int64_t m_currentSatd;
38
-    int     m_leadingBframes;
39
     int     m_qpConstant[3];
40
     int     m_lastNonBPictType;
41
     int     m_framesDone;        /* # of frames passed through RateCotrol already */
42
@@ -190,6 +172,8 @@
43
     int64_t m_lastBsliceSatdCost;
44
     int     m_numBframesInPattern;
45
     bool    m_isPatternPresent;
46
+    bool    m_isSceneTransition;
47
+    int     m_lastPredictorReset;
48
 
49
     /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
50
      * sync the calls to these functions. For example
51
@@ -241,12 +225,12 @@
52
     // to be called for each curFrame to process RateControl and set QP
53
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
54
     void rateControlUpdateStats(RateControlEntry* rce);
55
-    int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats);
56
+    int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce);
57
     int  rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
58
     int  rateControlSliceType(int frameNum);
59
     bool cuTreeReadFor2Pass(Frame* curFrame);
60
     void hrdFullness(SEIBufferingPeriod* sei);
61
-
62
+    int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
63
 protected:
64
 
65
     static const int   s_slidingWindowFrames;
66
@@ -274,6 +258,7 @@
67
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
68
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
69
     bool   initPass2();
70
+    void   initFramePredictors();
71
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
72
     double countExpectedBits();
73
     bool   vbv2Pass(uint64_t allAvailableBits);
74
x265_1.7.tar.gz/source/encoder/rdcost.h -> x265_1.8.tar.gz/source/encoder/rdcost.h Changed
66
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "slice.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class RDCost
10
@@ -88,10 +88,17 @@
11
         m_lambda = (uint64_t)floor(256.0 * lambda);
12
     }
13
 
14
-    inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const
15
+    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
16
     {
17
+#if X265_DEPTH <= 10
18
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
19
-                   "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
20
+                   "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
21
+                   distortion, bits, m_lambda2);
22
+#else
23
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
24
+                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
25
+                   distortion, bits, m_lambda2);
26
+#endif
27
         return distortion + ((bits * m_lambda2 + 128) >> 8);
28
     }
29
 
30
@@ -108,7 +115,7 @@
31
     }
32
 
33
     /* return the RD cost of this prediction, including the effect of psy-rd */
34
-    inline uint64_t calcPsyRdCost(uint32_t distortion, uint32_t bits, uint32_t psycost) const
35
+    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
36
     {
37
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
38
     }
39
@@ -116,15 +123,22 @@
40
     inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
41
     {
42
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
43
-                   "calcRdSADCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
44
+                   "calcRdSADCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n", sadCost, bits, m_lambda);
45
         return sadCost + ((bits * m_lambda + 128) >> 8);
46
     }
47
 
48
-    inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
49
+    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
50
     {
51
+#if X265_DEPTH <= 10
52
+        X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
53
+                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
54
+                   dist, m_chromaDistWeight[plane - 1]);
55
+#else
56
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
57
-                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
58
-        return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
59
+                   "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
60
+                   dist, m_chromaDistWeight[plane - 1]);
61
+#endif
62
+        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
63
     }
64
 
65
     inline uint32_t getCost(uint32_t bits) const
66
x265_1.7.tar.gz/source/encoder/reference.cpp -> x265_1.8.tar.gz/source/encoder/reference.cpp Changed
10
 
1
@@ -29,7 +29,7 @@
2
 
3
 #include "reference.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 MotionReference::MotionReference()
9
 {
10
x265_1.7.tar.gz/source/encoder/reference.h -> x265_1.8.tar.gz/source/encoder/reference.h Changed
10
 
1
@@ -29,7 +29,7 @@
2
 #include "lowres.h"
3
 #include "mv.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 struct WeightParam;
10
x265_1.7.tar.gz/source/encoder/sao.cpp -> x265_1.8.tar.gz/source/encoder/sao.cpp Changed
201
 
1
@@ -42,15 +42,25 @@
2
     return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
3
 }
4
 
5
+inline int signOf2(const int a, const int b)
6
+{
7
+    // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
8
+    int r = 0;
9
+    if (a < b)
10
+        r = -1;
11
+    if (a > b)
12
+        r = 1;
13
+    return r;
14
+}
15
+
16
 inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
17
 {
18
     return (count * offset - offsetOrg * 2) * offset;
19
 }
20
-
21
 } // end anonymous namespace
22
 
23
 
24
-namespace x265 {
25
+namespace X265_NS {
26
 
27
 const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
28
 {
29
@@ -213,14 +223,19 @@
30
         frame->m_encData->m_saoParam = saoParam;
31
     }
32
 
33
-    rdoSaoUnitRowInit(saoParam);
34
+    saoParam->bSaoFlag[0] = true;
35
+    saoParam->bSaoFlag[1] = true;
36
 
37
-    // NOTE: Disable SAO automatic turn-off when frame parallelism is
38
-    // enabled for output exact independent of frame thread count
39
-    if (m_param->frameNumThreads > 1)
40
+    m_numNoSao[0] = 0; // Luma
41
+    m_numNoSao[1] = 0; // Chroma
42
+
43
+    // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
44
+    if (m_param->frameNumThreads == 1)
45
     {
46
-        saoParam->bSaoFlag[0] = true;
47
-        saoParam->bSaoFlag[1] = true;
48
+        if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
49
+            saoParam->bSaoFlag[0] = false;
50
+        if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
51
+            saoParam->bSaoFlag[1] = false;
52
     }
53
 }
54
 
55
@@ -656,7 +671,6 @@
56
 /* Calculate SAO statistics for current CTU without non-crossing slice */
57
 void SAO::calcSaoStatsCu(int addr, int plane)
58
 {
59
-    int x, y;
60
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
61
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
62
     const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
63
@@ -687,8 +701,6 @@
64
     int startY;
65
     int endX;
66
     int endY;
67
-    int32_t* stats;
68
-    int32_t* count;
69
 
70
     int skipB = plane ? 2 : 4;
71
     int skipR = plane ? 3 : 5;
72
@@ -698,34 +710,16 @@
73
 
74
     // SAO_BO:
75
     {
76
-        const int boShift = X265_DEPTH - SAO_BO_BITS;
77
-
78
         if (m_param->bSaoNonDeblocked)
79
         {
80
             skipB = plane ? 1 : 3;
81
             skipR = plane ? 2 : 4;
82
         }
83
-        stats = m_offsetOrg[plane][SAO_BO];
84
-        count = m_count[plane][SAO_BO];
85
-
86
-        fenc = fenc0;
87
-        rec  = rec0;
88
 
89
         endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
90
         endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
91
 
92
-        for (y = 0; y < endY; y++)
93
-        {
94
-            for (x = 0; x < endX; x++)
95
-            {
96
-                int classIdx = 1 + (rec[x] >> boShift);
97
-                stats[classIdx] += (fenc[x] - rec[x]);
98
-                count[classIdx]++;
99
-            }
100
-
101
-            fenc += stride;
102
-            rec += stride;
103
-        }
104
+        primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
105
     }
106
 
107
     {
108
@@ -736,30 +730,11 @@
109
                 skipB = plane ? 1 : 3;
110
                 skipR = plane ? 3 : 5;
111
             }
112
-            stats = m_offsetOrg[plane][SAO_EO_0];
113
-            count = m_count[plane][SAO_EO_0];
114
-
115
-            fenc = fenc0;
116
-            rec  = rec0;
117
 
118
             startX = !lpelx;
119
             endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
120
-            for (y = 0; y < ctuHeight - skipB; y++)
121
-            {
122
-                int signLeft = signOf(rec[startX] - rec[startX - 1]);
123
-                for (x = startX; x < endX; x++)
124
-                {
125
-                    int signRight = signOf(rec[x] - rec[x + 1]);
126
-                    int edgeType = signRight + signLeft + 2;
127
-                    signLeft = -signRight;
128
-
129
-                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
130
-                    count[s_eoTable[edgeType]]++;
131
-                }
132
 
133
-                fenc += stride;
134
-                rec += stride;
135
-            }
136
+            primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
137
         }
138
 
139
         // SAO_EO_1: // dir: |
140
@@ -769,8 +744,6 @@
141
                 skipB = plane ? 2 : 4;
142
                 skipR = plane ? 2 : 4;
143
             }
144
-            stats = m_offsetOrg[plane][SAO_EO_1];
145
-            count = m_count[plane][SAO_EO_1];
146
 
147
             fenc = fenc0;
148
             rec  = rec0;
149
@@ -786,21 +759,7 @@
150
 
151
             primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
152
 
153
-            for (y = startY; y < endY; y++)
154
-            {
155
-                for (x = 0; x < endX; x++)
156
-                {
157
-                    int8_t signDown = signOf(rec[x] - rec[x + stride]);
158
-                    int edgeType = signDown + upBuff1[x] + 2;
159
-                    upBuff1[x] = -signDown;
160
-
161
-                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
162
-                    count[s_eoTable[edgeType]]++;
163
-                }
164
-
165
-                fenc += stride;
166
-                rec += stride;
167
-            }
168
+            primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
169
         }
170
 
171
         // SAO_EO_2: // dir: 135
172
@@ -810,8 +769,6 @@
173
                 skipB = plane ? 2 : 4;
174
                 skipR = plane ? 3 : 5;
175
             }
176
-            stats = m_offsetOrg[plane][SAO_EO_2];
177
-            count = m_count[plane][SAO_EO_2];
178
 
179
             fenc = fenc0;
180
             rec  = rec0;
181
@@ -829,23 +786,7 @@
182
 
183
             primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
184
 
185
-            for (y = startY; y < endY; y++)
186
-            {
187
-                upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
188
-                for (x = startX; x < endX; x++)
189
-                {
190
-                    int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
191
-                    int edgeType = signDown + upBuff1[x] + 2;
192
-                    upBufft[x + 1] = -signDown;
193
-                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
194
-                    count[s_eoTable[edgeType]]++;
195
-                }
196
-
197
-                std::swap(upBuff1, upBufft);
198
-
199
-                rec += stride;
200
-                fenc += stride;
201
x265_1.7.tar.gz/source/encoder/sao.h -> x265_1.8.tar.gz/source/encoder/sao.h Changed
42
 
1
@@ -30,7 +30,7 @@
2
 #include "frame.h"
3
 #include "entropy.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 enum SAOTypeLen
10
@@ -52,12 +52,12 @@
11
 
12
 class SAO
13
 {
14
-protected:
15
+public:
16
 
17
     enum { SAO_MAX_DEPTH = 4 };
18
     enum { SAO_BO_BITS  = 5 };
19
     enum { MAX_NUM_SAO_CLASS = 33 };
20
-    enum { SAO_BIT_INC = X265_MAX(X265_DEPTH - 10, 0) };
21
+    enum { SAO_BIT_INC = 0 }; /* in HM12.0, it wrote as X265_MAX(X265_DEPTH - 10, 0) */
22
     enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
23
     enum { NUM_EDGETYPE = 5 };
24
     enum { NUM_PLANE = 3 };
25
@@ -68,6 +68,8 @@
26
     typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
27
     typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
28
 
29
+protected:
30
+
31
     /* allocated per part */
32
     PerClass*   m_count;
33
     PerClass*   m_offset;
34
@@ -142,7 +144,6 @@
35
                              int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
36
     inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
37
 
38
-    void rdoSaoUnitRowInit(SAOParam* saoParam);
39
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
40
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
41
 };
42
x265_1.7.tar.gz/source/encoder/search.cpp -> x265_1.8.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -33,7 +33,7 @@
2
 #include "analysis.h"  // TLD
3
 #include "framedata.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #if _MSC_VER
9
 #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
10
@@ -319,7 +319,7 @@
11
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
12
         if (numSig)
13
         {
14
-            m_quant.invtransformNxN(residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
15
+            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
16
             primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
17
         }
18
         else
19
@@ -517,7 +517,7 @@
20
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
21
         if (numSig)
22
         {
23
-            m_quant.invtransformNxN(residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
24
+            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
25
             primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
26
         }
27
         else if (useTSkip)
28
@@ -530,7 +530,7 @@
29
             // no residual coded, recon = pred
30
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
31
 
32
-        uint32_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
33
+        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
34
 
35
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
36
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
37
@@ -667,7 +667,7 @@
38
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
39
         if (numSig)
40
         {
41
-            m_quant.invtransformNxN(residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
42
+            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
43
             primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
44
             cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
45
         }
46
@@ -797,7 +797,7 @@
47
     uint32_t qtLayer = log2TrSize - 2;
48
     uint32_t stride = mode.fencYuv->m_csize;
49
     const uint32_t sizeIdxC = log2TrSizeC - 2;
50
-    uint32_t outDist = 0;
51
+    sse_ret_t outDist = 0;
52
 
53
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
54
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
55
@@ -841,7 +841,7 @@
56
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
57
             if (numSig)
58
             {
59
-                m_quant.invtransformNxN(residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
60
+                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
61
                 primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
62
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
63
             }
64
@@ -942,7 +942,7 @@
65
                 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
66
                 if (numSig)
67
                 {
68
-                    m_quant.invtransformNxN(residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
69
+                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
70
                     primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
71
                     cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
72
                 }
73
@@ -956,7 +956,7 @@
74
                     primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
75
                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
76
                 }
77
-                uint32_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
78
+                sse_ret_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
79
                 tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
80
 
81
                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
82
@@ -1129,7 +1129,7 @@
83
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
84
             if (numSig)
85
             {
86
-                m_quant.invtransformNxN(residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
87
+                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
88
                 primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
89
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
90
             }
91
@@ -1156,14 +1156,14 @@
92
 
93
     cu.setPartSizeSubParts(partSize);
94
     cu.setPredModeSubParts(MODE_INTRA);
95
-    m_quant.m_tqBypass = !!cu.m_tqBypass[0];
96
 
97
     uint32_t tuDepthRange[2];
98
     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
99
 
100
     intraMode.initCosts();
101
-    intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
102
-    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
103
+    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
104
+    intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
105
+    intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
106
 
107
     m_entropyCoder.resetBits();
108
     if (m_slice->m_pps->bTransquantBypassEnabled)
109
@@ -1378,8 +1378,9 @@
110
     codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
111
     extractIntraResultQT(cu, *reconYuv, 0, 0);
112
 
113
-    intraMode.distortion = icosts.distortion;
114
-    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom, NULL);
115
+    intraMode.lumaDistortion = icosts.distortion;
116
+    intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom, NULL);
117
+    intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
118
 
119
     m_entropyCoder.resetBits();
120
     if (m_slice->m_pps->bTransquantBypassEnabled)
121
@@ -1861,6 +1862,29 @@
122
     return outCost;
123
 }
124
 
125
+/* find the lowres motion vector from lookahead in middle of current PU */
126
+MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
127
+{
128
+    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPicList[list][ref]->m_poc);
129
+    if (diffPoc > m_param->bframes + 1)
130
+        /* poc difference is out of range for lookahead */
131
+        return 0;
132
+
133
+    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc - 1];
134
+    if (mvs[0].x == 0x7FFF)
135
+        /* this motion search was not estimated by lookahead */
136
+        return 0;
137
+
138
+    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
139
+    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
140
+    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
141
+
142
+    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
143
+    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
144
+
145
+    return mvs[idx] << 1; /* scale up lowres mv */
146
+}
147
+
148
 /* Pick between the two AMVP candidates which is the best one to use as
149
  * MVP for the motion search, based on SAD cost */
150
 int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
151
@@ -1929,10 +1953,16 @@
152
     /* Perform ME, repeat until no more work is available */
153
     do
154
     {
155
-        if (meId < m_slice->m_numRefIdx[0])
156
-            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, meId);
157
+        if (meId < pme.m_jobs.refCnt[0])
158
+        {
159
+            int refIdx = pme.m_jobs.ref[0][meId]; //L0
160
+            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
161
+        }
162
         else
163
-            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
164
+        {
165
+            int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1
166
+            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
167
+        }
168
 
169
         meId = -1;
170
         pme.m_lock.acquire();
171
@@ -1950,13 +1980,18 @@
172
 
173
     MotionData* bestME = interMode.bestME[part];
174
 
175
-    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
176
+    // 12 mv candidates including lowresMV
177
+    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
178
     int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
179
 
180
     const MV* amvp = interMode.amvpCand[list][ref];
181
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
182
     MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
183
 
184
+    MV lmv = getLowresMV(interMode.cu, pu, list, ref);
185
+    if (lmv.notZero())
186
+        mvc[numMvc++] = lmv;
187
+
188
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
189
 
190
     int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
191
@@ -1983,23 +2018,22 @@
192
 }
193
 
194
 /* find the best inter prediction for each PU of specified mode */
195
-void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC)
196
+void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
197
 {
198
     ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
199
 
200
     CUData& cu = interMode.cu;
201
x265_1.7.tar.gz/source/encoder/search.h -> x265_1.8.tar.gz/source/encoder/search.h Changed
148
 
1
@@ -48,7 +48,7 @@
2
 #define ProfileCounter(cu, count)
3
 #endif
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class Entropy;
10
@@ -109,7 +109,9 @@
11
     uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
12
     uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
13
     uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
14
-    uint32_t   distortion; // sum of partition SSE distortion
15
+    sse_ret_t  lumaDistortion;
16
+    sse_ret_t  chromaDistortion;
17
+    sse_ret_t  distortion; // sum of partition SSE distortion
18
     uint32_t   totalBits;  // sum of partition bits (mv + coeff)
19
     uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
20
     uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
21
@@ -120,6 +122,8 @@
22
         sa8dCost = 0;
23
         sa8dBits = 0;
24
         psyEnergy = 0;
25
+        lumaDistortion = 0;
26
+        chromaDistortion = 0;
27
         distortion = 0;
28
         totalBits = 0;
29
         mvBits = 0;
30
@@ -133,7 +137,15 @@
31
         sa8dCost = UINT64_MAX / 2;
32
         sa8dBits = MAX_UINT / 2;
33
         psyEnergy = MAX_UINT / 2;
34
+#if X265_DEPTH <= 10
35
+        lumaDistortion = MAX_UINT / 2;
36
+        chromaDistortion = MAX_UINT / 2;
37
         distortion = MAX_UINT / 2;
38
+#else
39
+        lumaDistortion = UINT64_MAX / 2;
40
+        chromaDistortion = UINT64_MAX / 2;
41
+        distortion = UINT64_MAX / 2;
42
+#endif
43
         totalBits = MAX_UINT / 2;
44
         mvBits = MAX_UINT / 2;
45
         coeffBits = MAX_UINT / 2;
46
@@ -141,14 +153,29 @@
47
 
48
     bool ok() const
49
     {
50
+#if X265_DEPTH <= 10
51
+        return !(rdCost >= UINT64_MAX / 2 ||
52
+            sa8dCost >= UINT64_MAX / 2 ||
53
+            sa8dBits >= MAX_UINT / 2 ||
54
+            psyEnergy >= MAX_UINT / 2 ||
55
+            lumaDistortion >= MAX_UINT / 2 ||
56
+            chromaDistortion >= MAX_UINT / 2 ||
57
+            distortion >= MAX_UINT / 2 ||
58
+            totalBits >= MAX_UINT / 2 ||
59
+            mvBits >= MAX_UINT / 2 ||
60
+            coeffBits >= MAX_UINT / 2);
61
+#else
62
         return !(rdCost >= UINT64_MAX / 2 ||
63
                  sa8dCost >= UINT64_MAX / 2 ||
64
                  sa8dBits >= MAX_UINT / 2 ||
65
                  psyEnergy >= MAX_UINT / 2 ||
66
-                 distortion >= MAX_UINT / 2 ||
67
+                 lumaDistortion >= UINT64_MAX / 2 ||
68
+                 chromaDistortion >= UINT64_MAX / 2 ||
69
+                 distortion >= UINT64_MAX / 2 ||
70
                  totalBits >= MAX_UINT / 2 ||
71
                  mvBits >= MAX_UINT / 2 ||
72
                  coeffBits >= MAX_UINT / 2);
73
+#endif
74
     }
75
 
76
     void addSubCosts(const Mode& subMode)
77
@@ -159,6 +186,8 @@
78
         sa8dCost += subMode.sa8dCost;
79
         sa8dBits += subMode.sa8dBits;
80
         psyEnergy += subMode.psyEnergy;
81
+        lumaDistortion += subMode.lumaDistortion;
82
+        chromaDistortion += subMode.chromaDistortion;
83
         distortion += subMode.distortion;
84
         totalBits += subMode.totalBits;
85
         mvBits += subMode.mvBits;
86
@@ -186,6 +215,11 @@
87
     int64_t  weightAnalyzeTime;                 // elapsed worker time analyzing reference weights
88
     int64_t  totalCTUTime;                      // elapsed worker time in compressCTU (includes pmode master)
89
 
90
+    uint32_t skippedMotionReferences[NUM_CU_DEPTH];
91
+    uint32_t totalMotionReferences[NUM_CU_DEPTH];
92
+    uint32_t skippedIntraCU[NUM_CU_DEPTH];
93
+    uint32_t totalIntraCU[NUM_CU_DEPTH];
94
+
95
     uint64_t countIntraRDO[NUM_CU_DEPTH];
96
     uint64_t countInterRDO[NUM_CU_DEPTH];
97
     uint64_t countIntraAnalysis;
98
@@ -213,6 +247,10 @@
99
             interRDOElapsedTime[i] += other.interRDOElapsedTime[i];
100
             countIntraRDO[i] += other.countIntraRDO[i];
101
             countInterRDO[i] += other.countInterRDO[i];
102
+            skippedMotionReferences[i] += other.skippedMotionReferences[i];
103
+            totalMotionReferences[i] += other.totalMotionReferences[i];
104
+            skippedIntraCU[i] += other.skippedIntraCU[i];
105
+            totalIntraCU[i] += other.totalIntraCU[i];
106
         }
107
 
108
         intraAnalysisElapsedTime += other.intraAnalysisElapsedTime;
109
@@ -301,7 +339,7 @@
110
     void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
111
 
112
     // estimation inter prediction (non-skip)
113
-    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC);
114
+    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
115
 
116
     // encode residual and compute rd-cost for inter mode
117
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
118
@@ -319,6 +357,8 @@
119
     void checkDQP(Mode& mode, const CUGeom& cuGeom);
120
     void checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom);
121
 
122
+    MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
123
+
124
     class PME : public BondedTaskGroup
125
     {
126
     public:
127
@@ -329,6 +369,11 @@
128
         const PredictionUnit& pu;
129
         int           puIdx;
130
 
131
+        struct {
132
+            int ref[2][MAX_NUM_REF];
133
+            int refCnt[2];
134
+        } m_jobs;
135
+
136
         PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
137
 
138
         void processTasks(int workerThreadId);
139
@@ -365,7 +410,7 @@
140
     {
141
         uint64_t rdcost;
142
         uint32_t bits;
143
-        uint32_t distortion;
144
+        sse_ret_t distortion;
145
         uint32_t energy;
146
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
147
     };
148
x265_1.7.tar.gz/source/encoder/sei.cpp -> x265_1.8.tar.gz/source/encoder/sei.cpp Changed
10
 
1
@@ -26,7 +26,7 @@
2
 #include "slice.h"
3
 #include "sei.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 /* x265's identifying GUID */
9
 const uint8_t SEIuserDataUnregistered::m_uuid_iso_iec_11578[16] = {
10
x265_1.7.tar.gz/source/encoder/sei.h -> x265_1.8.tar.gz/source/encoder/sei.h Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "bitstream.h"
3
 #include "slice.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 class SEI : public SyntaxElementWriter
10
x265_1.7.tar.gz/source/encoder/slicetype.cpp -> x265_1.8.tar.gz/source/encoder/slicetype.cpp Changed
201
 
1
@@ -40,7 +40,7 @@
2
 #define ProfileLookaheadTime(elapsed, count)
3
 #endif
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 namespace {
9
 
10
@@ -94,9 +94,7 @@
11
     /* Actual adaptive quantization */
12
     int maxCol = curFrame->m_fencPic->m_picWidth;
13
     int maxRow = curFrame->m_fencPic->m_picHeight;
14
-    int blockWidth = ((param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
15
-    int blockHeight = ((param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
16
-    int blockCount = blockWidth * blockHeight;
17
+    int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
18
 
19
     for (int y = 0; y < 3; y++)
20
     {
21
@@ -133,15 +131,16 @@
22
     {
23
         blockXY = 0;
24
         double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
25
-        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
26
+        double bias_strength = 0.f;
27
+        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
28
         {
29
-            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
30
+            double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
31
             for (blockY = 0; blockY < maxRow; blockY += 16)
32
             {
33
                 for (blockX = 0; blockX < maxCol; blockX += 16)
34
                 {
35
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
36
-                    qp_adj = pow(energy + 1, 0.1);
37
+                    qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
38
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
39
                     avg_adj += qp_adj;
40
                     avg_adj_pow2 += qp_adj * qp_adj;
41
@@ -151,8 +150,9 @@
42
 
43
             avg_adj /= blockCount;
44
             avg_adj_pow2 /= blockCount;
45
-            strength = param->rc.aqStrength * avg_adj / bit_depth_correction;
46
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
47
+            strength = param->rc.aqStrength * avg_adj;
48
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f)) / avg_adj;
49
+            bias_strength = param->rc.aqStrength;
50
         }
51
         else
52
             strength = param->rc.aqStrength * 1.0397f;
53
@@ -162,7 +162,12 @@
54
         {
55
             for (blockX = 0; blockX < maxCol; blockX += 16)
56
             {
57
-                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
58
+                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
59
+                {
60
+                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
61
+                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 11.f / (qp_adj * qp_adj));
62
+                }
63
+                else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
64
                 {
65
                     qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
66
                     qp_adj = strength * (qp_adj - avg_adj);
67
@@ -464,6 +469,7 @@
68
     m_pool  = pool;
69
 
70
     m_lastNonB = NULL;
71
+    m_isSceneTransition = false;
72
     m_scratch  = NULL;
73
     m_tld      = NULL;
74
     m_filled   = false;
75
@@ -1248,7 +1254,9 @@
76
 
77
     int numBFrames = 0;
78
     int numAnalyzed = numFrames;
79
-    if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch))
80
+    bool isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
81
+    /* When scenecut threshold is set, use scenecut detection for I frame placements */
82
+    if (m_param->scenecutThreshold && isScenecut)
83
     {
84
         frames[1]->sliceType = X265_TYPE_I;
85
         return;
86
@@ -1338,14 +1346,13 @@
87
         /* Check scenecut on the first minigop. */
88
         for (int j = 1; j < numBFrames + 1; j++)
89
         {
90
-            if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch))
91
+            if (scenecut(frames, j, j + 1, false, origNumFrames))
92
             {
93
                 frames[j]->sliceType = X265_TYPE_P;
94
                 numAnalyzed = j;
95
                 break;
96
             }
97
         }
98
-
99
         resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
100
     }
101
     else
102
@@ -1369,50 +1376,99 @@
103
     if (bIsVbvLookahead)
104
         vbvLookahead(frames, numFrames, bKeyframe);
105
 
106
+     int maxp1 = X265_MIN(m_param->bframes + 1, origNumFrames);
107
     /* Restore frame types for all frames that haven't actually been decided yet. */
108
     for (int j = resetStart; j <= numFrames; j++)
109
+    {
110
         frames[j]->sliceType = X265_TYPE_AUTO;
111
+        /* If any frame marked as scenecut is being restarted for sliceDecision, 
112
+         * undo scene Transition flag */
113
+        if (j <= maxp1 && frames[j]->bScenecut && m_isSceneTransition)
114
+            m_isSceneTransition = false;
115
+    }
116
 }
117
 
118
-bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch)
119
+bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames)
120
 {
121
     /* Only do analysis during a normal scenecut check. */
122
     if (bRealScenecut && m_param->bframes)
123
     {
124
         int origmaxp1 = p0 + 1;
125
         /* Look ahead to avoid coding short flashes as scenecuts. */
126
-        if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
127
-            /* Don't analyse any more frames than the trellis would have covered. */
128
-            origmaxp1 += m_param->bframes;
129
-        else
130
-            origmaxp1++;
131
+        origmaxp1 += m_param->bframes;
132
         int maxp1 = X265_MIN(origmaxp1, numFrames);
133
-
134
+        bool fluctuate = false;
135
+        bool noScenecuts = false;
136
+        int64_t avgSatdCost = 0;
137
+        if (frames[0]->costEst[1][0] > -1)
138
+            avgSatdCost = frames[0]->costEst[1][0];
139
+        int cnt = 1;
140
         /* Where A and B are scenes: AAAAAABBBAAAAAA
141
          * If BBB is shorter than (maxp1-p0), it is detected as a flash
142
          * and not considered a scenecut. */
143
         for (int cp1 = p1; cp1 <= maxp1; cp1++)
144
         {
145
             if (!scenecutInternal(frames, p0, cp1, false))
146
+            {
147
                 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
148
                 for (int i = cp1; i > p0; i--)
149
+                {
150
                     frames[i]->bScenecut = false;
151
+                    noScenecuts = false;
152
+                }
153
+            }
154
+            else if (scenecutInternal(frames, cp1 - 1, cp1, false))
155
+            {
156
+                /* If current frame is a Scenecut from p0 frame as well as Scenecut from
157
+                 * preceeding frame, mark it as a Scenecut */
158
+                frames[cp1]->bScenecut = true;
159
+                noScenecuts = true;
160
+            }
161
+
162
+            /* compute average satdcost of all the frames in the mini-gop to confirm 
163
+             * whether there is any great fluctuation among them to rule out false positives */
164
+            X265_CHECK(frames[cp1]->costEst[cp1 - p0][0]!= -1, "costEst is not done \n");
165
+            avgSatdCost += frames[cp1]->costEst[cp1 - p0][0];
166
+            cnt++;
167
         }
168
 
169
-        /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
170
-         * If each of BB ... EE are shorter than (maxp1-p0), they are
171
-         * detected as flashes and not considered scenecuts.
172
-         * Instead, the first F frame becomes a scenecut.
173
-         * If the video ends before F, no frame becomes a scenecut. */
174
-        for (int cp0 = p0; cp0 <= maxp1; cp0++)
175
+        /* Identify possible scene fluctuations by comparing the satd cost of the frames.
176
+         * This could denote the beginning or ending of scene transitions.
177
+         * During a scene transition(fade in/fade outs), if fluctuate remains false,
178
+         * then the scene had completed its transition or stabilized */
179
+        if (noScenecuts)
180
         {
181
-            if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false)))
182
-                /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
183
-                frames[cp0]->bScenecut = false;
184
+            fluctuate = false;
185
+            avgSatdCost /= cnt;
186
+            for (int i = p1; i <= maxp1; i++)
187
+            {
188
+                int64_t curCost  = frames[i]->costEst[i - p0][0];
189
+                int64_t prevCost = frames[i - 1]->costEst[i - 1 - p0][0];
190
+                if (fabs((double)(curCost - avgSatdCost)) > 0.1 * avgSatdCost || 
191
+                    fabs((double)(curCost - prevCost)) > 0.1 * prevCost)
192
+                {
193
+                    fluctuate = true;
194
+                    if (!m_isSceneTransition && frames[i]->bScenecut)
195
+                    {
196
+                        m_isSceneTransition = true;
197
+                        /* just mark the first scenechange in the scene transition as a scenecut. */
198
+                        for (int j = i + 1; j <= maxp1; j++)
199
+                            frames[j]->bScenecut = false;
200
+                        break;
201
x265_1.7.tar.gz/source/encoder/slicetype.h -> x265_1.8.tar.gz/source/encoder/slicetype.h Changed
28
 
1
@@ -30,7 +30,7 @@
2
 #include "piclist.h"
3
 #include "threadpool.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private namespace
8
 
9
 struct Lowres;
10
@@ -127,7 +127,7 @@
11
     int           m_numCoopSlices;
12
     int           m_numRowsPerSlice;
13
     bool          m_filled;
14
-
15
+    bool          m_isSceneTransition;
16
     Lookahead(x265_param *param, ThreadPool *pool);
17
 
18
 #if DETAILED_CU_STATS
19
@@ -156,7 +156,7 @@
20
     void    slicetypeAnalyse(Lowres **frames, bool bKeyframe);
21
 
22
     /* called by slicetypeAnalyse() to make slice decisions */
23
-    bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch);
24
+    bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
25
     bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
26
     void    slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
27
     int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
28
x265_1.7.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.8.tar.gz/source/encoder/weightPrediction.cpp Changed
19
 
1
@@ -31,7 +31,7 @@
2
 #include "mv.h"
3
 #include "bitstream.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 namespace {
8
 struct Cache
9
 {
10
@@ -217,7 +217,7 @@
11
 }
12
 }
13
 
14
-namespace x265 {
15
+namespace X265_NS {
16
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
17
 {
18
     WeightParam wp[2][MAX_NUM_REF][3];
19
x265_1.7.tar.gz/source/input/input.cpp -> x265_1.8.tar.gz/source/input/input.cpp Changed
10
 
1
@@ -25,7 +25,7 @@
2
 #include "yuv.h"
3
 #include "y4m.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m)
9
 {
10
x265_1.7.tar.gz/source/input/input.h -> x265_1.8.tar.gz/source/input/input.h Changed
24
 
1
@@ -31,9 +31,9 @@
2
 #define MIN_FRAME_RATE 1
3
 #define MAX_FRAME_RATE 300
4
 
5
-#include "x265.h"
6
+#include "common.h"
7
 
8
-namespace x265 {
9
+namespace X265_NS {
10
 // private x265 namespace
11
 
12
 struct InputFileInfo
13
@@ -79,6 +79,10 @@
14
     virtual bool isFail() = 0;
15
 
16
     virtual const char *getName() const = 0;
17
+
18
+    virtual int getWidth() const = 0;
19
+
20
+    virtual int getHeight() const = 0;
21
 };
22
 }
23
 
24
x265_1.7.tar.gz/source/input/y4m.cpp -> x265_1.8.tar.gz/source/input/y4m.cpp Changed
10
 
1
@@ -36,7 +36,7 @@
2
 #endif
3
 #endif
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 using namespace std;
8
 
9
 static const char header[] = "FRAME";
10
x265_1.7.tar.gz/source/input/y4m.h -> x265_1.8.tar.gz/source/input/y4m.h Changed
21
 
1
@@ -30,7 +30,7 @@
2
 
3
 #define QUEUE_SIZE 5
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // x265 private namespace
8
 
9
 class Y4MInput : public InputFile, public Thread
10
@@ -88,6 +88,10 @@
11
     bool readPicture(x265_picture&);
12
 
13
     const char *getName() const   { return "y4m"; }
14
+
15
+    int getWidth() const                          { return width; }
16
+
17
+    int getHeight() const                         { return height; }
18
 };
19
 }
20
 
21
x265_1.7.tar.gz/source/input/yuv.cpp -> x265_1.8.tar.gz/source/input/yuv.cpp Changed
10
 
1
@@ -36,7 +36,7 @@
2
 #endif
3
 #endif
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 using namespace std;
8
 
9
 YUVInput::YUVInput(InputFileInfo& info)
10
x265_1.7.tar.gz/source/input/yuv.h -> x265_1.8.tar.gz/source/input/yuv.h Changed
21
 
1
@@ -30,7 +30,7 @@
2
 
3
 #define QUEUE_SIZE 5
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class YUVInput : public InputFile, public Thread
10
@@ -80,6 +80,10 @@
11
     bool readPicture(x265_picture&);
12
 
13
     const char *getName() const                   { return "yuv"; }
14
+
15
+    int getWidth() const                          { return width; }
16
+
17
+    int getHeight() const                         { return height; }
18
 };
19
 }
20
 
21
x265_1.7.tar.gz/source/output/output.cpp -> x265_1.8.tar.gz/source/output/output.cpp Changed
10
 
1
@@ -28,7 +28,7 @@
2
 
3
 #include "raw.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
9
 {
10
x265_1.7.tar.gz/source/output/output.h -> x265_1.8.tar.gz/source/output/output.h Changed
10
 
1
@@ -28,7 +28,7 @@
2
 #include "x265.h"
3
 #include "input/input.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class ReconFile
10
x265_1.7.tar.gz/source/output/raw.cpp -> x265_1.8.tar.gz/source/output/raw.cpp Changed
10
 
1
@@ -24,7 +24,7 @@
2
 
3
 #include "raw.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 using namespace std;
8
 
9
 RAWOutput::RAWOutput(const char* fname, InputFileInfo&)
10
x265_1.7.tar.gz/source/output/raw.h -> x265_1.8.tar.gz/source/output/raw.h Changed
10
 
1
@@ -30,7 +30,7 @@
2
 #include <fstream>
3
 #include <iostream>
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 class RAWOutput : public OutputFile
8
 {
9
 protected:
10
x265_1.7.tar.gz/source/output/reconplay.cpp -> x265_1.8.tar.gz/source/output/reconplay.cpp Changed
10
 
1
@@ -27,7 +27,7 @@
2
 
3
 #include <signal.h>
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 #if _WIN32
9
 #define popen  _popen
10
x265_1.7.tar.gz/source/output/reconplay.h -> x265_1.8.tar.gz/source/output/reconplay.h Changed
10
 
1
@@ -29,7 +29,7 @@
2
 #include "threading.h"
3
 #include <cstdio>
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class ReconPlay : public Thread
10
x265_1.7.tar.gz/source/output/y4m.cpp -> x265_1.8.tar.gz/source/output/y4m.cpp Changed
10
 
1
@@ -25,7 +25,7 @@
2
 #include "output.h"
3
 #include "y4m.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 using namespace std;
8
 
9
 Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
10
x265_1.7.tar.gz/source/output/y4m.h -> x265_1.8.tar.gz/source/output/y4m.h Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include "output.h"
3
 #include <fstream>
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class Y4MOutput : public ReconFile
10
x265_1.7.tar.gz/source/output/yuv.cpp -> x265_1.8.tar.gz/source/output/yuv.cpp Changed
10
 
1
@@ -25,7 +25,7 @@
2
 #include "output.h"
3
 #include "yuv.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 using namespace std;
8
 
9
 YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
10
x265_1.7.tar.gz/source/output/yuv.h -> x265_1.8.tar.gz/source/output/yuv.h Changed
10
 
1
@@ -29,7 +29,7 @@
2
 
3
 #include <fstream>
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 // private x265 namespace
8
 
9
 class YUVOutput : public ReconFile
10
x265_1.7.tar.gz/source/profile/vtune/vtune.cpp -> x265_1.8.tar.gz/source/profile/vtune/vtune.cpp Changed
10
 
1
@@ -36,7 +36,7 @@
2
 
3
 }
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 
8
 __itt_domain* domain;
9
 __itt_string_handle* taskHandle[NUM_VTUNE_TASKS];
10
x265_1.7.tar.gz/source/profile/vtune/vtune.h -> x265_1.8.tar.gz/source/profile/vtune/vtune.h Changed
10
 
1
@@ -26,7 +26,7 @@
2
 
3
 #include "ittnotify.h"
4
 
5
-namespace x265 {
6
+namespace X265_NS {
7
 
8
 #define CPU_EVENT(x) x,
9
 enum VTuneTasksEnum
10
x265_1.7.tar.gz/source/test/CMakeLists.txt -> x265_1.8.tar.gz/source/test/CMakeLists.txt Changed
17
 
1
@@ -1,3 +1,4 @@
2
+# vim: syntax=cmake
3
 enable_language(ASM_YASM)
4
 
5
 if(MSVC_IDE)
6
@@ -24,5 +25,9 @@
7
     intrapredharness.cpp intrapredharness.h)
8
 target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
9
 if(LINKER_OPTIONS)
10
-    set_target_properties(TestBench PROPERTIES LINK_FLAGS ${LINKER_OPTIONS})
11
+    if(EXTRA_LIB)
12
+        list(APPEND LINKER_OPTIONS "-L..")
13
+    endif(EXTRA_LIB)
14
+    string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")
15
+    set_target_properties(TestBench PROPERTIES LINK_FLAGS "${LINKER_OPTION_STR}")
16
 endif()
17
x265_1.7.tar.gz/source/test/checkasm-a.asm -> x265_1.8.tar.gz/source/test/checkasm-a.asm Changed
29
 
1
@@ -152,10 +152,12 @@
2
 
3
     jz .ok
4
     mov  r9, rax
5
+    mov r10, rdx
6
     lea  r0, [error_message]
7
     call puts
8
     mov  r1, [rsp+max_args*8]
9
     mov  dword [r1], 0
10
+    mov  rdx, r10
11
     mov  rax, r9
12
 .ok:
13
     RET
14
@@ -191,12 +193,14 @@
15
     or   r3, r5
16
     jz .ok
17
     mov  r3, eax
18
+    mov  r4, edx
19
     lea  r1, [error_message]
20
     push r1
21
     call puts
22
     add  esp, 4
23
     mov  r1, r1m
24
     mov  dword [r1], 0
25
+    mov  edx, r4
26
     mov  eax, r3
27
 .ok:
28
     REP_RET
29
x265_1.7.tar.gz/source/test/intrapredharness.cpp -> x265_1.8.tar.gz/source/test/intrapredharness.cpp Changed
79
 
1
@@ -25,12 +25,22 @@
2
 #include "predict.h"
3
 #include "intrapredharness.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 IntraPredHarness::IntraPredHarness()
9
 {
10
     for (int i = 0; i < INPUT_SIZE; i++)
11
         pixel_buff[i] = rand() % PIXEL_MAX;
12
+
13
+    /* [0] --- Random values
14
+     * [1] --- Minimum
15
+     * [2] --- Maximum */
16
+    for (int i = 0; i < BUFFSIZE; i++)
17
+    {
18
+        pixel_test_buff[0][i]   = rand() % PIXEL_MAX;
19
+        pixel_test_buff[1][i]   = PIXEL_MIN;
20
+        pixel_test_buff[2][i]   = PIXEL_MAX;
21
+    }
22
 }
23
 
24
 bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
25
@@ -177,6 +187,27 @@
26
     return true;
27
 }
28
 
29
+bool IntraPredHarness::check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt)
30
+{
31
+    memset(pixel_out_c, 0, 64 * 64 * sizeof(pixel));
32
+    memset(pixel_out_vec, 0, 64 * 64 * sizeof(pixel));
33
+    int j = 0;
34
+
35
+    for (int i = 0; i < 100; i++)
36
+    {
37
+        int index = rand() % TEST_CASES;
38
+
39
+        ref(pixel_test_buff[index] + j, pixel_out_c);
40
+        checked(opt, pixel_test_buff[index] + j, pixel_out_vec);
41
+
42
+        if (memcmp(pixel_out_c, pixel_out_vec, 64 * 64 * sizeof(pixel)))
43
+            return false;
44
+
45
+        reportfail();
46
+        j += FENC_STRIDE;
47
+    }
48
+    return true;
49
+}
50
 bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
51
 {
52
     for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++)
53
@@ -213,6 +244,14 @@
54
                 return false;
55
             }
56
         }
57
+        if (opt.cu[i].intra_filter)
58
+        {
59
+            if (!check_intra_filter_primitive(ref.cu[i].intra_filter, opt.cu[i].intra_filter))
60
+            {
61
+                printf("intra_filter_%dx%d failed\n", size, size);
62
+                return false;
63
+            }
64
+        }
65
     }
66
 
67
     return true;
68
@@ -268,5 +307,10 @@
69
                                pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, mode, bFilter);
70
             }
71
         }
72
+        if (opt.cu[i].intra_filter)
73
+        {
74
+            printf("intra_filter_%dx%d", size, size);
75
+            REPORT_SPEEDUP(opt.cu[i].intra_filter, ref.cu[i].intra_filter, pixel_buff, pixel_out_c);
76
+        }
77
     }
78
 }
79
x265_1.7.tar.gz/source/test/intrapredharness.h -> x265_1.8.tar.gz/source/test/intrapredharness.h Changed
25
 
1
@@ -34,7 +34,15 @@
2
     enum { INPUT_SIZE = 4 * 65 * 65 * 100 };
3
     enum { OUTPUT_SIZE = 64 * FENC_STRIDE };
4
     enum { OUTPUT_SIZE_33 = 33 * OUTPUT_SIZE };
5
+    enum { TEST_CASES = 3 };
6
+    enum { INCR = 32 };
7
+    enum { STRIDE = 64 };
8
+    enum { ITERS = 100 };
9
+    enum { MAX_HEIGHT = 64 };
10
+    enum { PAD_ROWS = 64 };
11
+    enum { BUFFSIZE = STRIDE * (MAX_HEIGHT + PAD_ROWS) + INCR * ITERS };
12
 
13
+    pixel    pixel_test_buff[TEST_CASES][BUFFSIZE];
14
     ALIGN_VAR_16(pixel, pixel_buff[INPUT_SIZE]);
15
     pixel pixel_out_c[OUTPUT_SIZE];
16
     pixel pixel_out_vec[OUTPUT_SIZE];
17
@@ -45,6 +53,7 @@
18
     bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
19
     bool check_angular_primitive(const intra_pred_t ref[], const intra_pred_t opt[], int size);
20
     bool check_allangs_primitive(const intra_allangs_t ref, const intra_allangs_t opt, int size);
21
+    bool check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt);
22
 
23
 public:
24
 
25
x265_1.7.tar.gz/source/test/ipfilterharness.cpp -> x265_1.8.tar.gz/source/test/ipfilterharness.cpp Changed
25
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "ipfilterharness.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 IPFilterHarness::IPFilterHarness()
9
 {
10
@@ -122,7 +122,14 @@
11
                     coeffIdx);
12
 
13
             if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
14
+            {
15
+            ref(pixel_test_buff[index] + 3 * rand_srcStride,
16
+                rand_srcStride,
17
+                IPF_C_output_s,
18
+                rand_dstStride,
19
+                coeffIdx);
20
                 return false;
21
+            }
22
 
23
             reportfail();
24
         }
25
x265_1.7.tar.gz/source/test/mbdstharness.cpp -> x265_1.8.tar.gz/source/test/mbdstharness.cpp Changed
36
 
1
@@ -27,7 +27,7 @@
2
 #include "common.h"
3
 #include "mbdstharness.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 struct DctConf
9
 {
10
@@ -53,7 +53,7 @@
11
 
12
 MBDstHarness::MBDstHarness()
13
 {
14
-    const int idct_max = (1 << (BIT_DEPTH + 4)) - 1;
15
+    const int idct_max = (1 << (X265_DEPTH + 4)) - 1;
16
 
17
     /* [0] --- Random values
18
      * [1] --- Minimum
19
@@ -215,8 +215,14 @@
20
         uint32_t optReturnValue = 0;
21
         uint32_t refReturnValue = 0;
22
 
23
-        int bits = (rand() % 24) + 8;
24
-        int valueToAdd = rand() % (1 << bits);
25
+        int sliceType = rand() % 2;
26
+        int log2TrSize = rand() % 4 + 2;
27
+        int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
28
+        int per = qp / 6;
29
+        int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
30
+
31
+        int bits = QUANT_SHIFT + per + transformShift;
32
+        int valueToAdd = (sliceType == 1 ? 171 : 85) << (bits - 9);
33
         int cmp_size = sizeof(int) * height * width;
34
         int cmp_size1 = sizeof(short) * height * width;
35
         int numCoeff = height * width;
36
x265_1.7.tar.gz/source/test/pixelharness.cpp -> x265_1.8.tar.gz/source/test/pixelharness.cpp Changed
201
 
1
@@ -23,8 +23,9 @@
2
 
3
 #include "pixelharness.h"
4
 #include "primitives.h"
5
+#include "entropy.h"
6
 
7
-using namespace x265;
8
+using namespace X265_NS;
9
 
10
 PixelHarness::PixelHarness()
11
 {
12
@@ -93,7 +94,7 @@
13
     return true;
14
 }
15
 
16
-bool PixelHarness::check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
17
+bool PixelHarness::check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt)
18
 {
19
     int j = 0;
20
     intptr_t stride = STRIDE;
21
@@ -102,8 +103,29 @@
22
     {
23
         int index1 = rand() % TEST_CASES;
24
         int index2 = rand() % TEST_CASES;
25
-        int vres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
26
-        int cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
27
+        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
28
+        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
29
+        if (vres != cres)
30
+            return false;
31
+
32
+        reportfail();
33
+        j += INCR;
34
+    }
35
+
36
+    return true;
37
+}
38
+
39
+bool PixelHarness::check_pixel_sse_ss(pixel_sse_ss_t ref, pixel_sse_ss_t opt)
40
+{
41
+    int j = 0;
42
+    intptr_t stride = STRIDE;
43
+
44
+    for (int i = 0; i < ITERS; i++)
45
+    {
46
+        int index1 = rand() % TEST_CASES;
47
+        int index2 = rand() % TEST_CASES;
48
+        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
49
+        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
50
         if (vres != cres)
51
             return false;
52
 
53
@@ -900,8 +922,8 @@
54
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
55
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
56
 
57
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
58
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
59
+    for (int i = 0; i < 64 * 64; i++)
60
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
61
 
62
     int j = 0;
63
 
64
@@ -928,8 +950,8 @@
65
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
66
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
67
 
68
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
69
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
70
+    for (int i = 0; i < 64 * 64; i++)
71
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
72
 
73
     int j = 0;
74
 
75
@@ -956,8 +978,8 @@
76
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
77
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
78
 
79
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
80
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
81
+    for (int i = 0; i < 64 * 64; i++)
82
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
83
 
84
     for (int id = 0; id < 2; id++)
85
     {
86
@@ -992,8 +1014,8 @@
87
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
88
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
89
 
90
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
91
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
92
+    for (int i = 0; i < 64 * 64; i++)
93
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
94
 
95
     int j = 0;
96
 
97
@@ -1016,13 +1038,234 @@
98
     return true;
99
 }
100
 
101
+bool PixelHarness::check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt)
102
+{
103
+    enum { NUM_EDGETYPE = 33 }; // classIdx = 1 + (rec[x] >> 3);
104
+    int32_t stats_ref[NUM_EDGETYPE];
105
+    int32_t stats_vec[NUM_EDGETYPE];
106
+
107
+    int32_t count_ref[NUM_EDGETYPE];
108
+    int32_t count_vec[NUM_EDGETYPE];
109
+
110
+    int j = 0;
111
+    for (int i = 0; i < ITERS; i++)
112
+    {
113
+        // initialize input data to random, the dynamic range wrong but good to verify our asm code
114
+        for (int x = 0; x < NUM_EDGETYPE; x++)
115
+        {
116
+            stats_ref[x] = stats_vec[x] = rand();
117
+            count_ref[x] = count_vec[x] = rand();
118
+        }
119
+
120
+        intptr_t stride = 16 * (rand() % 4 + 1);
121
+        int endX = MAX_CU_SIZE - (rand() % 5);
122
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
123
+
124
+        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
125
+        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
126
+
127
+        if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
128
+            return false;
129
+
130
+        reportfail();
131
+        j += INCR;
132
+    }
133
+
134
+    return true;
135
+}
136
+
137
+bool PixelHarness::check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt)
138
+{
139
+    enum { NUM_EDGETYPE = 5 };
140
+    int32_t stats_ref[NUM_EDGETYPE];
141
+    int32_t stats_vec[NUM_EDGETYPE];
142
+
143
+    int32_t count_ref[NUM_EDGETYPE];
144
+    int32_t count_vec[NUM_EDGETYPE];
145
+
146
+    int j = 0;
147
+    for (int i = 0; i < ITERS; i++)
148
+    {
149
+        // initialize input data to random, the dynamic range wrong but good to verify our asm code
150
+        for (int x = 0; x < NUM_EDGETYPE; x++)
151
+        {
152
+            stats_ref[x] = stats_vec[x] = rand();
153
+            count_ref[x] = count_vec[x] = rand();
154
+        }
155
+
156
+        intptr_t stride = 16 * (rand() % 4 + 1);
157
+        int endX = MAX_CU_SIZE - (rand() % 5) - 1;
158
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
159
+
160
+        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
161
+        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
162
+
163
+        if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
164
+            return false;
165
+
166
+        reportfail();
167
+        j += INCR;
168
+    }
169
+
170
+    return true;
171
+}
172
+
173
+bool PixelHarness::check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt)
174
+{
175
+    enum { NUM_EDGETYPE = 5 };
176
+    int32_t stats_ref[NUM_EDGETYPE];
177
+    int32_t stats_vec[NUM_EDGETYPE];
178
+
179
+    int32_t count_ref[NUM_EDGETYPE];
180
+    int32_t count_vec[NUM_EDGETYPE];
181
+
182
+    int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1;
183
+    int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1;
184
+
185
+    int j = 0;
186
+
187
+    for (int i = 0; i < ITERS; i++)
188
+    {
189
+        // initialize input data to random, the dynamic range wrong but good to verify our asm code
190
+        for (int x = 0; x < NUM_EDGETYPE; x++)
191
+        {
192
+            stats_ref[x] = stats_vec[x] = rand();
193
+            count_ref[x] = count_vec[x] = rand();
194
+        }
195
+
196
+        // initial sign
197
+        for (int x = 0; x < MAX_CU_SIZE + 2; x++)
198
+            _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1;
199
+
200
+        intptr_t stride = 16 * (rand() % 4 + 1);
201
x265_1.7.tar.gz/source/test/pixelharness.h -> x265_1.8.tar.gz/source/test/pixelharness.h Changed
32
 
1
@@ -66,7 +66,8 @@
2
     double   double_test_buff[TEST_CASES][BUFFSIZE];
3
 
4
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
5
-    bool check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
6
+    bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
7
+    bool check_pixel_sse_ss(pixel_sse_ss_t ref, pixel_sse_ss_t opt);
8
     bool check_pixelcmp_x3(pixelcmp_x3_t ref, pixelcmp_x3_t opt);
9
     bool check_pixelcmp_x4(pixelcmp_x4_t ref, pixelcmp_x4_t opt);
10
     bool check_copy_pp(copy_pp_t ref, copy_pp_t opt);
11
@@ -100,6 +101,11 @@
12
     bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
13
     bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
14
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
15
+    bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
16
+    bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
17
+    bool check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt);
18
+    bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
19
+    bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
20
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
21
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
22
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
23
@@ -108,6 +114,8 @@
24
     bool check_calSign(sign_t ref, sign_t opt);
25
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
26
     bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
27
+    bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
28
+    bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
29
 
30
 public:
31
 
32
x265_1.7.tar.gz/source/test/regression-tests.txt -> x265_1.8.tar.gz/source/test/regression-tests.txt Changed
137
 
1
@@ -12,50 +12,50 @@
2
 # not auto-detected.
3
 
4
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
5
-BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 32
6
+BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
7
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
8
-BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16
9
+BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
10
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
11
 BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
12
 BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
13
 BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
14
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode
15
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
16
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
17
-Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1
18
+Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
19
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
20
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
21
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
22
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing
23
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
24
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
25
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode
26
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
27
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
28
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
29
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
30
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
31
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
32
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
33
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers
34
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
35
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
36
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
37
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32
38
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
39
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
40
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
41
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0
42
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
43
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
44
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
45
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
46
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
47
-Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4
48
+Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
49
 Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
50
 Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
51
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
52
 KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
53
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8
54
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16
55
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
56
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
57
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
58
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain
59
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
60
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
61
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 32
62
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
63
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
64
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
65
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
66
@@ -66,16 +66,16 @@
67
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
68
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
69
 RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
70
-RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip
71
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra
72
+RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
73
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
74
 RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
75
 RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
76
-RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr
77
+RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
78
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
79
-RaceHorses_416x240_30_10bit.yuv,--preset placebo
80
+RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
81
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
82
 big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
83
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb
84
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
85
 big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
86
 big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
87
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
88
@@ -83,20 +83,20 @@
89
 city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
90
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
91
 city_4cif_60fps.y4m,--preset slower --scaling-list default
92
-city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra
93
+city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
94
 ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
95
-ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6
96
+ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
97
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
98
 ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
99
 ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
100
 ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
101
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
102
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0
103
-ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1
104
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
105
+ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
106
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
107
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
108
 mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
109
-mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip
110
+mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
111
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
112
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
113
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
114
@@ -113,12 +113,19 @@
115
 vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
116
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
117
 washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
118
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4
119
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32
120
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
121
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
122
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
123
 washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
124
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
125
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless
126
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
127
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
128
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
129
+old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
130
+Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
131
+BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
132
+FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
133
+FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
134
 
135
 # interlace test, even though input YUV is not field separated
136
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
137
x265_1.7.tar.gz/source/test/smoke-tests.txt -> x265_1.8.tar.gz/source/test/smoke-tests.txt Changed
18
 
1
@@ -6,14 +6,14 @@
2
 
3
 big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
4
 big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default
5
-big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --cu-stats --pme --qg-size 16
6
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --pme --qg-size 16
7
 washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1 --qg-size 16
8
 washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
9
 washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0
10
 old_town_cross_444_720p50.y4m,--preset=ultrafast --weightp --keyint -1
11
 old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
12
 old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode --qg-size 32
13
-RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --cu-stats --max-tu-size 8
14
+RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
15
 RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
16
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
17
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
18
x265_1.7.tar.gz/source/test/testbench.cpp -> x265_1.8.tar.gz/source/test/testbench.cpp Changed
38
 
1
@@ -32,7 +32,7 @@
2
 #include "param.h"
3
 #include "cpu.h"
4
 
5
-using namespace x265;
6
+using namespace X265_NS;
7
 
8
 const char* lumaPartStr[NUM_PU_SIZES] =
9
 {
10
@@ -95,7 +95,7 @@
11
 
12
 int main(int argc, char *argv[])
13
 {
14
-    int cpuid = x265::cpu_detect();
15
+    int cpuid = X265_NS::cpu_detect();
16
     const char *testname = 0;
17
 
18
     if (!(argc & 1))
19
@@ -137,8 +137,7 @@
20
     }
21
 
22
     int seed = (int)time(NULL);
23
-    const char *bpp[] = { "8bpp", "16bpp" };
24
-    printf("Using random seed %X %s\n", seed, bpp[HIGH_BIT_DEPTH]);
25
+    printf("Using random seed %X %dbit\n", seed, X265_DEPTH);
26
     srand(seed);
27
 
28
     // To disable classes of tests, simply comment them out in this list
29
@@ -174,7 +173,7 @@
30
 
31
     for (int i = 0; test_arch[i].flag; i++)
32
     {
33
-        if (test_arch[i].flag & cpuid)
34
+        if ((test_arch[i].flag & cpuid) == test_arch[i].flag)
35
         {
36
             printf("Testing primitives: %s\n", test_arch[i].name);
37
             fflush(stdout);
38
x265_1.7.tar.gz/source/test/testharness.h -> x265_1.8.tar.gz/source/test/testharness.h Changed
73
 
1
@@ -31,18 +31,13 @@
2
 #pragma warning(disable: 4324) // structure was padded due to __declspec(align())
3
 #endif
4
 
5
-#if HIGH_BIT_DEPTH
6
-#define BIT_DEPTH 10
7
-#else
8
-#define BIT_DEPTH 8
9
-#endif
10
-#define PIXEL_MAX ((1 << BIT_DEPTH) - 1)
11
+#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
12
 #define PIXEL_MIN 0
13
 #define SHORT_MAX  32767
14
 #define SHORT_MIN -32767
15
 #define UNSIGNED_SHORT_MAX 65535
16
 
17
-using namespace x265;
18
+using namespace X265_NS;
19
 
20
 extern const char* lumaPartStr[NUM_PU_SIZES];
21
 extern const char* const* chromaPartStr[X265_CSP_COUNT];
22
@@ -123,14 +118,14 @@
23
 
24
 extern "C" {
25
 #if X265_ARCH_X86
26
-int x265_stack_pagealign(int (*func)(), int align);
27
+int PFX(stack_pagealign)(int (*func)(), int align);
28
 
29
 /* detect when callee-saved regs aren't saved
30
  * needs an explicit asm check because it only sometimes crashes in normal use. */
31
-intptr_t x265_checkasm_call(intptr_t (*func)(), int *ok, ...);
32
-float x265_checkasm_call_float(float (*func)(), int *ok, ...);
33
+intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
34
+float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
35
 #else
36
-#define x265_stack_pagealign(func, align) func()
37
+#define PFX(stack_pagealign)(func, align) func()
38
 #endif
39
 
40
 #if X86_64
41
@@ -144,24 +139,24 @@
42
  * overwrite the junk written to the stack so there's no guarantee that it will always
43
  * detect all functions that assume zero-extension.
44
  */
45
-void x265_checkasm_stack_clobber(uint64_t clobber, ...);
46
+void PFX(checkasm_stack_clobber)(uint64_t clobber, ...);
47
 #define checked(func, ...) ( \
48
         m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
49
-        x265_checkasm_stack_clobber(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
50
+        PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
51
                                     m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
52
                                     m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
53
-        x265_checkasm_call((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
54
+        PFX(checkasm_call)((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
55
 
56
 #define checked_float(func, ...) ( \
57
         m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
58
-        x265_checkasm_stack_clobber(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
59
+        PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
60
                                     m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
61
                                     m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
62
-        x265_checkasm_call_float((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
63
+        PFX(checkasm_call_float)((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
64
 #define reportfail() if (!m_ok) { fflush(stdout); fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); }
65
 #elif ARCH_X86
66
-#define checked(func, ...) x265_checkasm_call((intptr_t(*)())func, &m_ok, __VA_ARGS__);
67
-#define checked_float(func, ...) x265_checkasm_call_float((float(*)())func, &m_ok, __VA_ARGS__);
68
+#define checked(func, ...) PFX(checkasm_call)((intptr_t(*)())func, &m_ok, __VA_ARGS__);
69
+#define checked_float(func, ...) PFX(checkasm_call_float)((float(*)())func, &m_ok, __VA_ARGS__);
70
 
71
 #else // if X86_64
72
 #define checked(func, ...) func(__VA_ARGS__)
73
x265_1.8.tar.gz/source/x265-extras.cpp Added
201
 
1
@@ -0,0 +1,341 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2015 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
7
+ *          Divya Manivannan <divya@multicorewareinc.com>
8
+ *
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "x265.h"
28
+#include "x265-extras.h"
29
+
30
+#include "common.h"
31
+
32
+using namespace X265_NS;
33
+
34
+static const char* summaryCSVHeader =
35
+    "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
36
+    "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
37
+    "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
38
+    "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
39
+    "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
40
+    "Version\n";
41
+
42
+FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
43
+{
44
+    if (sizeof(x265_stats) != api.sizeof_stats || sizeof(x265_picture) != api.sizeof_picture)
45
+    {
46
+        fprintf(stderr, "extras [error]: structure size skew, unable to create CSV logfile\n");
47
+        return NULL;
48
+    }
49
+
50
+    FILE *csvfp = fopen(fname, "r");
51
+    if (csvfp)
52
+    {
53
+        /* file already exists, re-open for append */
54
+        fclose(csvfp);
55
+        return fopen(fname, "ab");
56
+    }
57
+    else
58
+    {
59
+        /* new CSV file, write header */
60
+        csvfp = fopen(fname, "wb");
61
+        if (csvfp)
62
+        {
63
+            if (level)
64
+            {
65
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
66
+                if (param.rc.rateControlMode == X265_RC_CRF)
67
+                    fprintf(csvfp, "RateFactor, ");
68
+                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
69
+                /* detailed performance statistics */
70
+                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
71
+                if (level >= 2)
72
+                {
73
+                    uint32_t size = param.maxCUSize;
74
+                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
75
+                    {
76
+                        fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
77
+                        size /= 2;
78
+                    }
79
+                    fprintf(csvfp, ", 4x4");
80
+                    size = param.maxCUSize;
81
+                    if (param.bEnableRectInter)
82
+                    {
83
+                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
84
+                        {
85
+                            fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
86
+                            if (param.bEnableAMP)
87
+                                fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
88
+                            size /= 2;
89
+                        }
90
+                    }
91
+                    else
92
+                    {
93
+                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
94
+                        {
95
+                            fprintf(csvfp, ", Inter %dx%d", size, size);
96
+                            size /= 2;
97
+                        }
98
+                    }
99
+                    size = param.maxCUSize;
100
+                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
101
+                    {
102
+                        fprintf(csvfp, ", Skip %dx%d", size, size);
103
+                        size /= 2;
104
+                    }
105
+                    size = param.maxCUSize;
106
+                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
107
+                    {
108
+                        fprintf(csvfp, ", Merge %dx%d", size, size);
109
+                        size /= 2;
110
+                    }
111
+                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
112
+                }
113
+                fprintf(csvfp, "\n");
114
+            }
115
+            else
116
+                fputs(summaryCSVHeader, csvfp);
117
+        }
118
+        return csvfp;
119
+    }
120
+}
121
+
122
+// per frame CSV logging
123
+void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& pic, int level)
124
+{
125
+    if (!csvfp)
126
+        return;
127
+
128
+    const x265_frame_stats* frameStats = &pic.frameData;
129
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
130
+    if (param.rc.rateControlMode == X265_RC_CRF)
131
+        fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
132
+    if (param.bEnablePsnr)
133
+        fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
134
+    else
135
+        fputs(" -, -, -, -,", csvfp);
136
+    if (param.bEnableSsim)
137
+        fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
138
+    else
139
+        fputs(" -, -,", csvfp);
140
+    if (frameStats->sliceType == 'I')
141
+        fputs(" -, -,", csvfp);
142
+    else
143
+    {
144
+        int i = 0;
145
+        while (frameStats->list0POC[i] != -1)
146
+            fprintf(csvfp, "%d ", frameStats->list0POC[i++]);
147
+        fprintf(csvfp, ",");
148
+        if (frameStats->sliceType != 'P')
149
+        {
150
+            i = 0;
151
+            while (frameStats->list1POC[i] != -1)
152
+                fprintf(csvfp, "%d ", frameStats->list1POC[i++]);
153
+            fprintf(csvfp, ",");
154
+        }
155
+        else
156
+            fputs(" -,", csvfp);
157
+    }
158
+    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
159
+    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
160
+    if (level >= 2)
161
+    {
162
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
163
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
164
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
165
+        if (param.bEnableRectInter)
166
+        {
167
+            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
168
+            {
169
+                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
170
+                if (param.bEnableAMP)
171
+                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
172
+            }
173
+        }
174
+        else
175
+        {
176
+            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
177
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
178
+        }
179
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
180
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
181
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
182
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
183
+        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
184
+    }
185
+    fprintf(csvfp, "\n");
186
+    fflush(stderr);
187
+}
188
+
189
+void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
190
+{
191
+    if (!csvfp)
192
+        return;
193
+
194
+    if (level)
195
+    {
196
+        // adding summary to a per-frame csv log file, so it needs a summary header
197
+        fprintf(csvfp, "\nSummary\n");
198
+        fputs(summaryCSVHeader, csvfp);
199
+    }
200
+
201
x265_1.8.tar.gz/source/x265-extras.h Added
68
 
1
@@ -0,0 +1,66 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2015 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_EXTRAS_H
26
+#define X265_EXTRAS_H 1
27
+
28
+#include "x265.h"
29
+
30
+#include <stdio.h>
31
+#include <stdint.h>
32
+
33
+#ifdef __cplusplus
34
+extern "C" {
35
+#endif
36
+
37
+#if _WIN32
38
+#define LIBAPI __declspec(dllexport)
39
+#else
40
+#define LIBAPI
41
+#endif
42
+
43
+/* Open a CSV log file. On success it returns a file handle which must be passed
44
+ * to x265_csvlog_frame() and/or x265_csvlog_encode(). The file handle must be
45
+ * closed by the caller using fclose(). If level is 0, then no frame logging
46
+ * header is written to the file. This function will return NULL if it is unable
47
+ * to open the file for write or if it detects a structure size skew */
48
+LIBAPI FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level);
49
+
50
+/* Log frame statistics to the CSV file handle. level should have been non-zero
51
+ * in the call to x265_csvlog_open() if this function is called. */
52
+LIBAPI void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& pic, int level);
53
+
54
+/* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are
55
+ * intended to be command line arguments passed to the encoder. Encode
56
+ * statistics should be queried from the encoder just prior to closing it. */
57
+LIBAPI void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv);
58
+
59
+/* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
60
+ * the residual bits to dither each row. */
61
+LIBAPI void x265_dither_image(const x265_api& api, x265_picture&, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
62
+
63
+#ifdef __cplusplus
64
+}
65
+#endif
66
+
67
+#endif
68
x265_1.7.tar.gz/source/x265.cpp -> x265_1.8.tar.gz/source/x265.cpp Changed
201
 
1
@@ -25,15 +25,17 @@
2
 #pragma warning(disable: 4127) // conditional expression is constant, yes I know
3
 #endif
4
 
5
+#include "x265.h"
6
+#include "x265-extras.h"
7
+#include "x265cli.h"
8
+
9
+#include "common.h"
10
 #include "input/input.h"
11
 #include "output/output.h"
12
 #include "output/reconplay.h"
13
-#include "filters/filters.h"
14
-#include "common.h"
15
+
16
 #include "param.h"
17
 #include "cpu.h"
18
-#include "x265.h"
19
-#include "x265cli.h"
20
 
21
 #if HAVE_VLD
22
 /* Visual Leak Detector */
23
@@ -59,7 +61,7 @@
24
 #define SetThreadExecutionState(es)
25
 #endif
26
 
27
-using namespace x265;
28
+using namespace X265_NS;
29
 
30
 /* Ctrl-C handler */
31
 static volatile sig_atomic_t b_ctrl_c /* = 0 */;
32
@@ -74,12 +76,15 @@
33
     ReconFile* recon;
34
     OutputFile* output;
35
     FILE*       qpfile;
36
+    FILE*       csvfpt;
37
+    const char* csvfn;
38
     const char* reconPlayCmd;
39
     const x265_api* api;
40
     x265_param* param;
41
     bool bProgress;
42
     bool bForceY4m;
43
     bool bDither;
44
+    int csvLogLevel;
45
     uint32_t seek;              // number of frames to skip from the beginning
46
     uint32_t framesToBeEncoded; // number of frames to encode
47
     uint64_t totalbytes;
48
@@ -95,6 +100,8 @@
49
         recon = NULL;
50
         output = NULL;
51
         qpfile = NULL;
52
+        csvfpt = NULL;
53
+        csvfn = NULL;
54
         reconPlayCmd = NULL;
55
         api = NULL;
56
         param = NULL;
57
@@ -105,6 +112,7 @@
58
         startTime = x265_mdate();
59
         prevUpdateTime = 0;
60
         bDither = false;
61
+        csvLogLevel = 0;
62
     }
63
 
64
     void destroy();
65
@@ -124,6 +132,9 @@
66
     if (qpfile)
67
         fclose(qpfile);
68
     qpfile = NULL;
69
+    if (csvfpt)
70
+        fclose(csvfpt);
71
+    csvfpt = NULL;
72
     if (output)
73
         output->release();
74
     output = NULL;
75
@@ -158,8 +169,8 @@
76
 
77
 bool CLIOptions::parse(int argc, char **argv)
78
 {
79
-    bool bError = 0;
80
-    int help = 0;
81
+    bool bError = false;
82
+    int bShowHelp = false;
83
     int inputBitDepth = 8;
84
     int outputBitDepth = 0;
85
     int reconFileBitDepth = 0;
86
@@ -188,8 +199,21 @@
87
             tune = optarg;
88
         else if (c == 'D')
89
             outputBitDepth = atoi(optarg);
90
+        else if (c == 'P')
91
+            profile = optarg;
92
         else if (c == '?')
93
-            showHelp(param);
94
+            bShowHelp = true;
95
+    }
96
+
97
+    if (!outputBitDepth && profile)
98
+    {
99
+        /* try to derive the output bit depth from the requested profile */
100
+        if (strstr(profile, "10"))
101
+            outputBitDepth = 10;
102
+        else if (strstr(profile, "12"))
103
+            outputBitDepth = 12;
104
+        else
105
+            outputBitDepth = 8;
106
     }
107
 
108
     api = x265_api_get(outputBitDepth);
109
@@ -212,6 +236,12 @@
110
         return true;
111
     }
112
 
113
+    if (bShowHelp)
114
+    {
115
+        printVersion(param, api);
116
+        showHelp(param);
117
+    }
118
+
119
     for (optind = 0;; )
120
     {
121
         int long_options_index = -1;
122
@@ -222,12 +252,13 @@
123
         switch (c)
124
         {
125
         case 'h':
126
+            printVersion(param, api);
127
             showHelp(param);
128
             break;
129
 
130
         case 'V':
131
-            printVersion(param);
132
-            x265_setup_primitives(param, -1);
133
+            printVersion(param, api);
134
+            x265_report_simd(param);
135
             exit(0);
136
 
137
         default:
138
@@ -264,6 +295,8 @@
139
             if (0) ;
140
             OPT2("frame-skip", "seek") this->seek = (uint32_t)x265_atoi(optarg, bError);
141
             OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
142
+            OPT("csv") this->csvfn = optarg;
143
+            OPT("csv-log-level") this->csvLogLevel = x265_atoi(optarg, bError);
144
             OPT("no-progress") this->bProgress = false;
145
             OPT("output") outputfn = optarg;
146
             OPT("input") inputfn = optarg;
147
@@ -272,9 +305,9 @@
148
             OPT("dither") this->bDither = true;
149
             OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
150
             OPT("y4m") this->bForceY4m = true;
151
-            OPT("profile") profile = optarg; /* handled last */
152
-            OPT("preset") /* handled above */;
153
-            OPT("tune")   /* handled above */;
154
+            OPT("profile") /* handled above */;
155
+            OPT("preset")  /* handled above */;
156
+            OPT("tune")    /* handled above */;
157
             OPT("output-depth")   /* handled above */;
158
             OPT("recon-y4m-exec") reconPlayCmd = optarg;
159
             OPT("qpfile")
160
@@ -309,18 +342,22 @@
161
         return true;
162
     }
163
 
164
-    if (argc <= 1 || help)
165
+    if (argc <= 1)
166
+    {
167
+        api->param_default(param);
168
+        printVersion(param, api);
169
         showHelp(param);
170
+    }
171
 
172
-    if (inputfn == NULL || outputfn == NULL)
173
+    if (!inputfn || !outputfn)
174
     {
175
-        x265_log(param, X265_LOG_ERROR, "input or output file not specified, try -V for help\n");
176
+        x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
177
         return true;
178
     }
179
 
180
-    if (param->internalBitDepth != api->max_bit_depth)
181
+    if (param->internalBitDepth != api->bit_depth)
182
     {
183
-        x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", api->max_bit_depth);
184
+        x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", api->bit_depth);
185
         return true;
186
     }
187
 
188
@@ -465,7 +502,8 @@
189
  * 1 - unable to parse command line
190
  * 2 - unable to open encoder
191
  * 3 - unable to generate stream headers
192
- * 4 - encoder abort */
193
+ * 4 - encoder abort
194
+ * 5 - unable to open csv file */
195
 
196
 int main(int argc, char **argv)
197
 {
198
@@ -516,6 +554,19 @@
199
     /* get the encoder parameters post-initialization */
200
     api->encoder_parameters(encoder, param);
201
x265_1.7.tar.gz/source/x265.def.in -> x265_1.8.tar.gz/source/x265.def.in Changed
6
 
1
@@ -21,3 +21,4 @@
2
 x265_encoder_close
3
 x265_cleanup
4
 x265_api_get_${X265_BUILD}
5
+x265_api_query
6
x265_1.7.tar.gz/source/x265.h -> x265_1.8.tar.gz/source/x265.h Changed
201
 
1
@@ -100,6 +100,50 @@
2
     uint32_t         numPartitions;
3
 } x265_analysis_data;
4
 
5
+/* cu statistics */
6
+typedef struct x265_cu_stats
7
+{
8
+    double      percentSkipCu[4];                // Percentage of skip cu in all depths
9
+    double      percentMergeCu[4];               // Percentage of merge cu in all depths
10
+    double      percentIntraDistribution[4][3];  // Percentage of DC, Planar, Angular intra modes in all depths
11
+    double      percentInterDistribution[4][3];  // Percentage of 2Nx2N inter, rect and amp in all depths
12
+    double      percentIntraNxN;                 // Percentage of 4x4 cu
13
+
14
+    /* All the above values will add up to 100%. */
15
+} x265_cu_stats;
16
+
17
+/* Frame level statistics */
18
+typedef struct x265_frame_stats
19
+{
20
+    double           qp;
21
+    double           rateFactor;
22
+    double           psnrY;
23
+    double           psnrU;
24
+    double           psnrV;
25
+    double           psnr;
26
+    double           ssim;
27
+    double           decideWaitTime;
28
+    double           row0WaitTime;
29
+    double           wallTime;
30
+    double           refWaitWallTime;
31
+    double           totalCTUTime;
32
+    double           stallTime;
33
+    double           avgWPP;
34
+    double           avgLumaDistortion;
35
+    double           avgChromaDistortion;
36
+    double           avgPsyEnergy;
37
+    double           avgLumaLevel;
38
+    uint64_t         bits;
39
+    int              encoderOrder;
40
+    int              poc;
41
+    int              countRowBlocks;
42
+    int              list0POC[16];
43
+    int              list1POC[16];
44
+    uint16_t         maxLumaLevel;
45
+    char             sliceType;
46
+    x265_cu_stats    cuStats;
47
+} x265_frame_stats;
48
+
49
 /* Used to pass pictures into the encoder, and to get picture data back out of
50
  * the encoder.  The input and output semantics are different */
51
 typedef struct x265_picture
52
@@ -161,6 +205,9 @@
53
      * this data structure */
54
     x265_analysis_data analysisData;
55
 
56
+    /* Frame level statistics */
57
+    x265_frame_stats frameData;
58
+
59
 } x265_picture;
60
 
61
 typedef enum
62
@@ -221,9 +268,8 @@
63
 #define X265_LOG_ERROR          0
64
 #define X265_LOG_WARNING        1
65
 #define X265_LOG_INFO           2
66
-#define X265_LOG_FRAME          3
67
-#define X265_LOG_DEBUG          4
68
-#define X265_LOG_FULL           5
69
+#define X265_LOG_DEBUG          3
70
+#define X265_LOG_FULL           4
71
 
72
 #define X265_B_ADAPT_NONE       0
73
 #define X265_B_ADAPT_FAST       1
74
@@ -249,6 +295,7 @@
75
 #define X265_AQ_NONE                 0
76
 #define X265_AQ_VARIANCE             1
77
 #define X265_AQ_AUTO_VARIANCE        2
78
+#define X265_AQ_AUTO_VARIANCE_BIASED 3
79
 
80
 /* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
81
 
82
@@ -302,20 +349,35 @@
83
     X265_RC_CRF
84
 } X265_RC_METHODS;
85
 
86
+/* slice type statistics */
87
+typedef struct x265_sliceType_stats
88
+{
89
+    double        avgQp;
90
+    double        bitrate;
91
+    double        psnrY;
92
+    double        psnrU;
93
+    double        psnrV;
94
+    double        ssim;
95
+    uint32_t      numPics;
96
+} x265_sliceType_stats;
97
+
98
 /* Output statistics from encoder */
99
 typedef struct x265_stats
100
 {
101
-    double    globalPsnrY;
102
-    double    globalPsnrU;
103
-    double    globalPsnrV;
104
-    double    globalPsnr;
105
-    double    globalSsim;
106
-    double    elapsedEncodeTime;    /* wall time since encoder was opened */
107
-    double    elapsedVideoTime;     /* encoded picture count / frame rate */
108
-    double    bitrate;              /* accBits / elapsed video time */
109
-    uint64_t  accBits;              /* total bits output thus far */
110
-    uint32_t  encodedPictureCount;  /* number of output pictures thus far */
111
-    uint32_t  totalWPFrames;        /* number of uni-directional weighted frames used */
112
+    double                globalPsnrY;
113
+    double                globalPsnrU;
114
+    double                globalPsnrV;
115
+    double                globalPsnr;
116
+    double                globalSsim;
117
+    double                elapsedEncodeTime;    /* wall time since encoder was opened */
118
+    double                elapsedVideoTime;     /* encoded picture count / frame rate */
119
+    double                bitrate;              /* accBits / elapsed video time */
120
+    uint64_t              accBits;              /* total bits output thus far */
121
+    uint32_t              encodedPictureCount;  /* number of output pictures thus far */
122
+    uint32_t              totalWPFrames;        /* number of uni-directional weighted frames used */
123
+    x265_sliceType_stats  statsI;               /* statistics of I slice */
124
+    x265_sliceType_stats  statsP;               /* statistics of P slice */
125
+    x265_sliceType_stats  statsB;               /* statistics of B slice */
126
 } x265_stats;
127
 
128
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
129
@@ -326,7 +388,7 @@
130
 static const char * const x265_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 };
131
 static const char * const x265_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100",
132
                                                     "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12",
133
-                                                    "smpte-st-2084", "smpte-st-428", 0 };
134
+                                                    "smpte-st-2084", "smpte-st-428", "arib-std-b67", 0 };
135
 static const char * const x265_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m",
136
                                                      "YCgCo", "bt2020nc", "bt2020c", 0 };
137
 static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11",
138
@@ -439,8 +501,7 @@
139
 
140
     /*== Logging Features ==*/
141
 
142
-    /* Enable analysis and logging distribution of CUs encoded across various
143
-     * modes during mode decision. Default disabled */
144
+    /* Enable analysis and logging distribution of CUs. Now deprecated */
145
     int       bLogCuStats;
146
 
147
     /* Enable the measurement and reporting of PSNR. Default is enabled */
148
@@ -453,11 +514,7 @@
149
      * X265_LOG_FULL, default is X265_LOG_INFO */
150
     int       logLevel;
151
 
152
-    /* filename of CSV log. If logLevel greater than or equal to X265_LOG_FRAME,
153
-     * the encoder will emit per-slice statistics to this log file in encode
154
-     * order. Otherwise the encoder will emit per-stream statistics into the log
155
-     * file when x265_encoder_log is called (presumably at the end of the
156
-     * encode) */
157
+    /* Filename of CSV log. Now deprecated */
158
     const char* csvfn;
159
 
160
     /*== Internal Picture Specification ==*/
161
@@ -1143,11 +1200,31 @@
162
 #define X265_PARAM_BAD_VALUE (-2)
163
 int x265_param_parse(x265_param *p, const char *name, const char *value);
164
 
165
-/* x265_param_apply_profile:
166
- *      Applies the restrictions of the given profile. (one of below) */
167
-static const char * const x265_profile_names[] = { "main", "main10", "mainstillpicture", 0 };
168
+static const char * const x265_profile_names[] = {
169
+    /* HEVC v1 */
170
+    "main", "main10", "mainstillpicture", /* alias */ "msp",
171
+
172
+    /* HEVC v2 (Range Extensions) */
173
+    "main-intra", "main10-intra",
174
+    "main444-8",  "main444-intra", "main444-stillpicture",
175
 
176
-/*      (can be NULL, in which case the function will do nothing)
177
+    "main422-10", "main422-10-intra",
178
+    "main444-10", "main444-10-intra",
179
+
180
+    "main12",     "main12-intra",                  /* Highly Experimental */
181
+    "main422-12", "main422-12-intra",
182
+    "main444-12", "main444-12-intra",
183
+
184
+    "main444-16-intra", "main444-16-stillpicture", /* Not Supported! */
185
+    0
186
+};
187
+
188
+/* x265_param_apply_profile:
189
+ *      Applies the restrictions of the given profile. (one of x265_profile_names)
190
+ *      (can be NULL, in which case the function will do nothing)
191
+ *      Note: the detected profile can be lower than the one specified to this
192
+ *      function. This function will force the encoder parameters to fit within
193
+ *      the specified profile, or fail if that is impossible.
194
  *      returns 0 on success, negative on failure (e.g. invalid profile name). */
195
 int x265_param_apply_profile(x265_param *, const char *profile);
196
 
197
@@ -1263,9 +1340,7 @@
198
 void x265_encoder_get_stats(x265_encoder *encoder, x265_stats *, uint32_t statsSizeBytes);
199
 
200
 /* x265_encoder_log:
201
x265_1.7.tar.gz/source/x265cli.h -> x265_1.8.tar.gz/source/x265cli.h Changed
147
 
1
@@ -24,10 +24,13 @@
2
 #ifndef X265CLI_H
3
 #define X265CLI_H 1
4
 
5
+#include "common.h"
6
+#include "param.h"
7
+
8
 #include <getopt.h>
9
 
10
 #ifdef __cplusplus
11
-namespace x265 {
12
+namespace X265_NS {
13
 #endif
14
 
15
 static const char short_options[] = "o:D:P:p:f:F:r:I:i:b:s:t:q:m:hwV?";
16
@@ -54,6 +57,7 @@
17
     { "allow-non-conformance",no_argument, NULL, 0 },
18
     { "no-allow-non-conformance",no_argument, NULL, 0 },
19
     { "csv",            required_argument, NULL, 0 },
20
+    { "csv-log-level",  required_argument, NULL, 0 },
21
     { "no-cu-stats",          no_argument, NULL, 0 },
22
     { "cu-stats",             no_argument, NULL, 0 },
23
     { "y4m",                  no_argument, NULL, 0 },
24
@@ -121,6 +125,7 @@
25
     { "no-b-pyramid",         no_argument, NULL, 0 },
26
     { "b-pyramid",            no_argument, NULL, 0 },
27
     { "ref",            required_argument, NULL, 0 },
28
+    { "limit-refs",     required_argument, NULL, 0 },
29
     { "no-weightp",           no_argument, NULL, 0 },
30
     { "weightp",              no_argument, NULL, 'w' },
31
     { "no-weightb",           no_argument, NULL, 0 },
32
@@ -183,7 +188,8 @@
33
     { "transfer",       required_argument, NULL, 0 },
34
     { "colormatrix",    required_argument, NULL, 0 },
35
     { "chromaloc",      required_argument, NULL, 0 },
36
-    { "crop-rect",      required_argument, NULL, 0 },
37
+    { "display-window", required_argument, NULL, 0 },
38
+    { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
39
     { "master-display", required_argument, NULL, 0 },
40
     { "max-cll",        required_argument, NULL, 0 },
41
     { "no-dither",            no_argument, NULL, 0 },
42
@@ -219,17 +225,15 @@
43
     { 0, 0, 0, 0 }
44
 };
45
 
46
-static void printVersion(x265_param *param)
47
+static void printVersion(x265_param *param, const x265_api* api)
48
 {
49
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str);
50
-    x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str);
51
+    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str);
52
+    x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
53
 }
54
 
55
 static void showHelp(x265_param *param)
56
 {
57
     int level = param->logLevel;
58
-    x265_param_default(param);
59
-    printVersion(param);
60
 
61
 #define OPT(value) (value ? "enabled" : "disabled")
62
 #define H0 printf
63
@@ -243,11 +247,11 @@
64
     H0("-V/--version                     Show version info and exit\n");
65
     H0("\nOutput Options:\n");
66
     H0("-o/--output <filename>           Bitstream output file name\n");
67
-    H0("-D/--output-depth 8|10           Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
68
-    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", x265::logLevelNames[param->logLevel + 1]);
69
+    H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
70
+    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
71
     H0("   --no-progress                 Disable CLI progress reports\n");
72
-    H0("   --[no-]cu-stats               Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats));
73
-    H1("   --csv <filename>              Comma separated log file, log level >= 3 frame log, else one line per run\n");
74
+    H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
75
+    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
76
     H0("\nInput Options:\n");
77
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
78
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
79
@@ -302,10 +306,12 @@
80
     H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
81
     H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
82
     H0("\nTemporal / motion search options:\n");
83
+    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
84
+    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
85
+    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
86
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
87
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
88
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
89
-    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
90
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
91
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
92
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
93
@@ -327,13 +333,6 @@
94
     H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
95
     H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
96
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
97
-    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
98
-    H1("   --zones <zone0>/<zone1>/...   Tweak the bitrate of regions of the video\n");
99
-    H1("                                 Each zone is of the form\n");
100
-    H1("                                   <start frame>,<end frame>,<option>\n");
101
-    H1("                                   where <option> is either\n");
102
-    H1("                                       q=<integer> (force QP)\n");
103
-    H1("                                   or  b=<float> (bitrate multiplier)\n");
104
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
105
     H1("                                 Format of each line: framenumber frametype QP\n");
106
     H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
107
@@ -359,7 +358,7 @@
108
     H0("   --[no-]strict-cbr             Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr));
109
     H0("   --analysis-mode <string|int>  save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
110
     H0("   --analysis-file <filename>    Specify file name used for either dumping or reading analysis data.\n");
111
-    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
112
+    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
113
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
114
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
115
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
116
@@ -370,6 +369,12 @@
117
     H1("   --cbqpoffs <integer>          Chroma Cb QP Offset [-12..12]. Default %d\n", param->cbQpOffset);
118
     H1("   --crqpoffs <integer>          Chroma Cr QP Offset [-12..12]. Default %d\n", param->crQpOffset);
119
     H1("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
120
+    H1("   --zones <zone0>/<zone1>/...   Tweak the bitrate of regions of the video\n");
121
+    H1("                                 Each zone is of the form\n");
122
+    H1("                                   <start frame>,<end frame>,<option>\n");
123
+    H1("                                   where <option> is either\n");
124
+    H1("                                       q=<integer> (force QP)\n");
125
+    H1("                                   or  b=<float> (bitrate multiplier)\n");
126
     H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
127
     H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
128
     H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
129
@@ -383,7 +388,7 @@
130
     H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
131
     H0("                                 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
132
     H0("                                 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
133
-    H1("   --crop-rect <string>          Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
134
+    H1("   --display-window <string>     Describe overscan cropping region as 'left,top,right,bottom' in pixels\n");
135
     H1("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
136
     H0("   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
137
     H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited\n");
138
@@ -391,7 +396,7 @@
139
     H0("                                 smpte240m, film, bt2020. Default undef\n");
140
     H0("   --transfer <string>           Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
141
     H0("                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
142
-    H0("                                 bt2020-10, bt2020-12. Default undef\n");
143
+    H0("                                 bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428, arib-std-b67. Default undef\n");
144
     H1("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
145
     H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
146
     H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
147