x265
Note: the diffs of some files are truncated because they were too big.
Changes of Revision 27
x265.changes
Changed
@@ -1,4 +1,61 @@
 -------------------------------------------------------------------
+Sat May  9 11:26:50 UTC 2026 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 4.2
+  New feature:
+  * Threaded Motion Estimation (Experimental feature)-It uses a
+    dedicated threadpool to precompute Motion Estimation in
+    parallel.Improves encoding speed upto 1.5x for 1080p & lower
+    resolution on multi core machines with low frequency setting.
+    On high frequency systems or on machines with low number of
+    cores, the overhead of additional Motion estimation work may
+    outweigh parallelism
+  Enhancements to existing features:
+  * Add new Levels 6.3 to 7.2 specified in ITU-T H.265 (V9)
+    (09/2023) and above
+  * Improve Slices feature with check zeroMv
+  * Enable frame parallelism with MCSTF feature
+  * Updated support to signal AOM FGM params
+  * Improve quality with SBRC feature
+  * Updated DolbyVision P5 VUI defaults
+  API changes:
+  * API Support to enable Threaded Motion
+    Estimation(--threaded-me)
+  Optimizations:
+  * RISC V optimizations including SAD, SATD, DCT, IDCT, block
+    copy, pixel utilities, SAO, loopfilter, transpose kernels
+    resulting in 2x encoding speed.
+  * ARM SIMD optimizations including the use of NEON and SVE
+    instruction set extensions. The following algorithms now
+    have optimized SIMD implementations: DST, IDCT, SSE, SSD,
+    intra_pred_planar, pelFilterLumaStrong, interpolation,
+    planecopy, dequant_normal, blockcopy, pixel variance
+    resulting in 8% faster encoding speed compared to v4.1
+  Bug fixes:
+  * Fix memory leaks (no command line option, SEI buffer,
+    analysis save/load)
+  * Fix chroma qp offset for non yuv444 inputs
+  * Fix max supported input resolution
+  * Fix bugs with ARM SIMD optimizations
+  * Fix Alpha and Multiview feature flag support in x265_config
+  * Fix test harness issues, CMake errors
+  * Fix inconsistent output with aq-motion
+  * Fix crash with hist-scenecut on high bit-depth builds
+  * Fix lookahead concurrency bug
+  * Fix shared link issue (R_X86_64_PC32), yuv recon output
+    issue, rd-refine and dynamic-refine issue, inputs for
+    Windows named pipe, weighted prediction delta_chroma_offset,
+    crf and vbv issue in abr-ladder, psnr and ssim reported
+    with MCSTF feature, internally overflowed VBV variables
+  Known issues:
+  * Output mismatch between analysis save & load with cutree
+    with reuse level < 10
+  * Inconsistent hash mismatch with abr-ladder feature
+  * Performance regression observed with threaded-me feature on
+    high frequency systems and for higher resolutions (4k)
+- Reworked x265.patch
+
+-------------------------------------------------------------------
 Mon Mar  3 03:03:03 UTC 2025 - olaf@aepfle.de
 
 - Update to version 4.1
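For reference, a minimal sketch of how the new experimental option might be exercised from the command line (the --threaded-me flag name comes from the changelog above; the input file, geometry, frame rate, and preset are illustrative assumptions, not taken from this update):

    # Hypothetical smoke test of the 4.2 threaded motion estimation path.
    # input.yuv, its resolution/fps, and the preset are placeholders.
    x265 --input input.yuv --input-res 1920x1080 --fps 30 \
         --preset medium --threaded-me -o out.hevc

Per the changelog, this is expected to help mainly at 1080p and below on many-core, low-frequency machines, and may regress on high-frequency systems.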
x265.spec
Changed
@@ -1,7 +1,7 @@
 #
 # spec file for package x265
 #
-# Copyright (c) 2024 Packman Team <packman@links2linux.de>
+# Copyright (c) 2026 Packman Team <packman@links2linux.de>
 # Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
 #
 # All modifications and additions to the file contributed by third parties
@@ -18,12 +18,12 @@
 
 # add previous sover to weakremover() list below
-%define sover   215
+%define sover   216
 %define libname lib%name
 %define libsoname %{libname}-%{sover}
-%define uver    4_1
+%define uver    4_2
 Name:           x265
-Version:        4.1
+Version:        4.2
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
@@ -52,6 +52,7 @@
 
 %package -n %{libsoname}
 Summary:        A free H265/HEVC encoder - encoder binary
+Provides:       weakremover(libx265-215)
 Provides:       weakremover(libx265-209)
 Provides:       weakremover(libx265-199)
 Provides:       weakremover(libx265-192)
x265.patch
Changed
@@ -1,26 +1,7 @@
 ---
- source/CMakeLists.txt                 | 5 +----
  source/dynamicHDR10/json11/json11.cpp | 1 +
- 2 files changed, 2 insertions(+), 4 deletions(-)
+ 1 files changed, 1 insertions(+), 0 deletions(-)
 
---- a/source/CMakeLists.txt
-+++ b/source/CMakeLists.txt
-@@ -6,14 +6,11 @@ if(NOT CMAKE_BUILD_TYPE)
-     FORCE)
- endif()
- message(STATUS "cmake version ${CMAKE_VERSION}")
--if(POLICY CMP0025)
--    cmake_policy(SET CMP0025 OLD) # report Apple's Clang as just Clang
--endif()
- if(POLICY CMP0042)
-     cmake_policy(SET CMP0042 NEW) # MACOSX_RPATH
- endif()
- if(POLICY CMP0054)
--    cmake_policy(SET CMP0054 OLD) # Only interpret if() arguments as variables or keywords when unquoted
-+    cmake_policy(SET CMP0054 NEW) # Only interpret if() arguments as variables or keywords when unquoted
- endif()
- 
- project (x265)
 --- a/source/dynamicHDR10/json11/json11.cpp
 +++ b/source/dynamicHDR10/json11/json11.cpp
 @@ -24,6 +24,7 @@
_service
Changed
@@ -31,14 +31,14 @@
     <param name="exclude">source/profile</param>
     <param name="exclude">source/test</param>
     <param name="filename">x265</param>
-    <param name="revision">32e25ffcf810c5fe284901859b369270824c4596</param>
+    <param name="revision">e444744c03978c1fb4e037168967020cf2648427</param>
     <param name="scm">git</param>
     <param name="submodules">disable</param>
     <param name="url">https://bitbucket.org/multicoreware/x265_git.git</param>
     <param name="versionformat">@PARENT_TAG@</param>
     <param name="versionrewrite-pattern">v?(^\++)(.*)</param>
     <param name="versionrewrite-replacement">\1</param>
-  </service>
+  </service>
   <service name="set_version" mode="manual">
     <param name="basename">x265</param>
   </service>
baselibs.conf
Changed
@@ -1,1 +1,1 @@
-libx265-215
+libx265-216
x265-4.1.tar/source/common/aarch64/blockcopy8-common.S
Deleted
@@ -1,54 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -// This file contains the macros written using NEON instruction set -// that are also used by the SVE2 functions - -#include "asm.S" - -.arch armv8-a - -// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) -.macro cpy1Dto2D_shr_start - add x2, x2, x2 - dup v0.8h, w3 - cmeq v1.8h, v1.8h, v1.8h - sshl v1.8h, v1.8h, v0.8h - sri v1.8h, v1.8h, #1 - neg v0.8h, v0.8h -.endm - -.macro cpy2Dto1D_shr_start - add x2, x2, x2 - dup v0.8h, w3 - cmeq v1.8h, v1.8h, v1.8h - sshl v1.8h, v1.8h, v0.8h - sri v1.8h, v1.8h, #1 - neg v0.8h, v0.8h -.endm - -const xtn_xtn2_table, align=4 -.byte 0, 2, 4, 6, 8, 10, 12, 14 -.byte 16, 18, 20, 22, 24, 26, 28, 30 -endconst -
x265-4.1.tar/source/common/aarch64/blockcopy8-sve.S
Deleted
@@ -1,1416 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -#include "asm-sve.S" -#include "blockcopy8-common.S" - -.arch armv8-a+sve - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.text - -/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb) - * - * r0 - a - * r1 - stridea - * r2 - b - * r3 - strideb */ - -function PFX(blockcopy_sp_4x4_sve) - ptrue p0.h, vl4 -.rept 2 - ld1h {z0.h}, p0/z, x2 - add x2, x2, x3, lsl #1 - st1b {z0.h}, p0, x0 - add x0, x0, x1 - ld1h {z1.h}, p0/z, x2 - add x2, x2, x3, lsl #1 - st1b {z1.h}, p0, x0 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_8x8_sve) - ptrue p0.h, vl8 -.rept 4 - ld1h {z0.h}, p0/z, x2 - add x2, x2, x3, lsl #1 - st1b {z0.h}, p0, x0 - add x0, x0, x1 - ld1h {z1.h}, p0/z, x2 - add x2, x2, x3, lsl #1 - st1b {z1.h}, p0, x0 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_16x16_sve) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_sp_16_16 - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, x11 -.rept 8 - ld1 {v0.8h-v1.8h}, x2, x3 - ld1 {v2.8h-v3.8h}, x2, x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - st1 {v0.16b}, x0, x1 - st1 {v1.16b}, x0, x1 -.endr - ret -.vl_gt_16_blockcopy_sp_16_16: - ptrue p0.h, vl16 -.rept 8 - ld1h {z0.h}, p0/z, x2 - st1b {z0.h}, p0, x0 - add x2, x2, x3, lsl #1 - add x0, x0, x1 - ld1h {z1.h}, p0/z, x2 - st1b {z1.h}, p0, x0 - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_32x32_sve) - mov w12, #4 - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_sp_32_32 - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, x11 -.Loop_csp32_sve: - sub w12, w12, #1 -.rept 4 - ld1 {v0.8h-v3.8h}, x2, x3 - ld1 {v4.8h-v7.8h}, x2, x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - tbl v2.16b, {v4.16b,v5.16b}, v31.16b - tbl v3.16b, {v6.16b,v7.16b}, v31.16b - st1 {v0.16b-v1.16b}, x0, x1 - st1 {v2.16b-v3.16b}, x0, x1 -.endr - cbnz w12, .Loop_csp32_sve - ret -.vl_gt_16_blockcopy_sp_32_32: - cmp x9, #48 - bgt .vl_gt_48_blockcopy_sp_32_32 - ptrue p0.h, vl16 -.vl_gt_16_loop_csp32_sve: - sub w12, w12, #1 -.rept 4 - ld1h {z0.h}, p0/z, x2 - ld1h {z1.h}, p0/z, x2, #1, mul vl - st1b {z0.h}, p0, x0 - st1b {z1.h}, p0, x0, #1, mul vl - add x2, x2, x3, lsl #1 - add x0, x0, x1 - ld1h {z2.h}, p0/z, x2 - ld1h {z3.h}, p0/z, x2, #1, mul vl - st1b 
{z2.h}, p0, x0 - st1b {z3.h}, p0, x0, #1, mul vl - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - cbnz w12, .vl_gt_16_loop_csp32_sve - ret -.vl_gt_48_blockcopy_sp_32_32: - ptrue p0.h, vl32 -.vl_gt_48_loop_csp32_sve: - sub w12, w12, #1 -.rept 4 - ld1h {z0.h}, p0/z, x2 - st1b {z0.h}, p0, x0 - add x2, x2, x3, lsl #1 - add x0, x0, x1 - ld1h {z1.h}, p0/z, x2 - st1b {z1.h}, p0, x0 - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - cbnz w12, .vl_gt_48_loop_csp32_sve - ret -endfunc - -function PFX(blockcopy_ps_16x16_sve) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_ps_16_16 - lsl x1, x1, #1 -.rept 8 - ld1 {v4.16b}, x2, x3 - ld1 {v5.16b}, x2, x3 - uxtl v0.8h, v4.8b - uxtl2 v1.8h, v4.16b - uxtl v2.8h, v5.8b - uxtl2 v3.8h, v5.16b - st1 {v0.8h-v1.8h}, x0, x1 - st1 {v2.8h-v3.8h}, x0, x1 -.endr - ret -.vl_gt_16_blockcopy_ps_16_16: - ptrue p0.b, vl32 -.rept 16 - ld1b {z1.h}, p0/z, x2 - st1h {z1.h}, p0, x0 - add x0, x0, x1, lsl #1 - add x2, x2, x3 -.endr - ret -endfunc - -function PFX(blockcopy_ps_32x32_sve) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_ps_32_32
x265-4.1.tar/source/common/aarch64/ssd-a-sve2.S
Deleted
@@ -1,626 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -#include "asm-sve.S" -#include "ssd-a-common.S" - -.arch armv8-a+sve2 - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.text - -function PFX(pixel_sse_ss_4x4_sve2) - ptrue p0.b, vl8 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x2 - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z17.h - smullb z3.s, z1.h, z1.h - smullt z4.s, z1.h, z1.h -.rept 3 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x2 - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z17.h - smlalb z3.s, z1.h, z1.h - smlalt z4.s, z1.h, z1.h -.endr - uaddv d3, p0, z3.s - fmov w0, s3 - uaddv d4, p0, z4.s - fmov w1, s4 - add w0, w0, w1 - ret -endfunc - -function PFX(pixel_sse_ss_8x8_sve2) - ptrue p0.b, vl16 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x2 - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z17.h - smullb z3.s, z1.h, z1.h - smullt z4.s, z1.h, z1.h -.rept 7 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x2 - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z17.h - smlalb z3.s, z1.h, z1.h - smlalt z4.s, z1.h, z1.h -.endr - uaddv d3, p0, z3.s - fmov w0, s3 - uaddv d4, p0, z4.s - fmov w1, s4 - add w0, w0, w1 - ret -endfunc - -function PFX(pixel_sse_ss_16x16_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_sse_ss_16x16 - ptrue p0.b, vl16 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z18.b}, p0/z, x2 - ld1b {z19.b}, p0/z, x2, #1, mul vl - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z18.h - sub z2.h, z17.h, z19.h - smullb z3.s, z1.h, z1.h - smullt z4.s, z1.h, z1.h - smlalb z3.s, z2.h, z2.h - smlalt z4.s, z2.h, z2.h -.rept 15 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z18.b}, p0/z, x2 - ld1b {z19.b}, p0/z, x2, #1, mul vl - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z18.h - sub z2.h, z17.h, z19.h - smlalb z3.s, z1.h, z1.h - smlalt z4.s, z1.h, z1.h - smlalb z3.s, z2.h, z2.h - smlalt z4.s, z2.h, z2.h -.endr - uaddv d3, p0, z3.s - fmov w0, s3 - uaddv d4, p0, z4.s - fmov w1, s4 - add w0, w0, w1 - ret -.vl_gt_16_pixel_sse_ss_16x16: - ptrue p0.b, vl32 - ld1b {z16.b}, p0/z, x0 - ld1b {z18.b}, p0/z, x2 - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z18.h - smullb z3.s, z1.h, z1.h - smullt z4.s, z1.h, z1.h -.rept 15 - ld1b {z16.b}, p0/z, x0 - ld1b {z18.b}, p0/z, 
x2 - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z18.h - smlalb z3.s, z1.h, z1.h - smlalt z4.s, z1.h, z1.h -.endr - uaddv d3, p0, z3.s - fmov w0, s3 - uaddv d4, p0, z4.s - fmov w1, s4 - add w0, w0, w1 - ret -endfunc - -function PFX(pixel_sse_ss_32x32_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_sse_ss_32x32 - ptrue p0.b, vl16 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z18.b}, p0/z, x0, #2, mul vl - ld1b {z19.b}, p0/z, x0, #3, mul vl - ld1b {z20.b}, p0/z, x2 - ld1b {z21.b}, p0/z, x2, #1, mul vl - ld1b {z22.b}, p0/z, x2, #2, mul vl - ld1b {z23.b}, p0/z, x2, #3, mul vl - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z20.h - sub z2.h, z17.h, z21.h - sub z3.h, z18.h, z22.h - sub z4.h, z19.h, z23.h - smullb z5.s, z1.h, z1.h - smullt z6.s, z1.h, z1.h - smlalb z5.s, z2.h, z2.h - smlalt z6.s, z2.h, z2.h - smlalb z5.s, z3.h, z3.h - smlalt z6.s, z3.h, z3.h - smlalb z5.s, z4.h, z4.h - smlalt z6.s, z4.h, z4.h -.rept 31 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z18.b}, p0/z, x0, #2, mul vl - ld1b {z19.b}, p0/z, x0, #3, mul vl - ld1b {z20.b}, p0/z, x2 - ld1b {z21.b}, p0/z, x2, #1, mul vl - ld1b {z22.b}, p0/z, x2, #2, mul vl - ld1b {z23.b}, p0/z, x2, #3, mul vl - add x0, x0, x1, lsl #1 - add x2, x2, x3, lsl #1 - sub z1.h, z16.h, z20.h - sub z2.h, z17.h, z21.h - sub z3.h, z18.h, z22.h - sub z4.h, z19.h, z23.h - smlalb z5.s, z1.h, z1.h - smlalt z6.s, z1.h, z1.h - smlalb z5.s, z2.h, z2.h - smlalt z6.s, z2.h, z2.h
x265-4.2.tar/bitbucket-pipelines.yml
Added
@@ -0,0 +1,716 @@
+image: ubuntu:22.04
+
+definitions:
+  caches:
+    ccache: ~/.ccache
+
+  steps:
+    - step: &build-8bit
+        name: "Build 8-bit"
+        caches:
+          - ccache
+        script:
+          - export DEBIAN_FRONTEND=noninteractive
+          - apt-get update -qq
+          - apt-get install -y -qq build-essential cmake nasm ccache libnuma-dev git > /dev/null 2>&1
+          - export PATH="/usr/lib/ccache:$PATH"
+          - mkdir -p build/8bit && cd build/8bit
+          - cmake ../../source -DCMAKE_BUILD_TYPE=Release -DENABLE_SHARED=ON -DENABLE_CLI=ON -DENABLE_ASSEMBLY=ON -DENABLE_TESTS=ON -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          - make -j$(nproc)
+          - echo "--- Verify build artifacts ---"
+          - test -f x265 && echo "CLI binary OK"
+          - test -f libx265.so && echo "Shared library OK"
+          - test -f libx265.a && echo "Static library OK"
+          - ./x265 --version
+          - echo "--- Run TestBench (unit tests) ---"
+          - ./test/TestBench || (echo "TestBench FAILED"; exit 1)
+        artifacts:
+          - build/8bit/x265
+          - build/8bit/libx265.so*
+          - build/8bit/libx265.a
+
+    - step: &build-10bit
+        name: "Build 10-bit"
+        caches:
+          - ccache
+        script:
+          - export DEBIAN_FRONTEND=noninteractive
+          - apt-get update -qq
+          - apt-get install -y -qq build-essential cmake nasm ccache libnuma-dev git > /dev/null 2>&1
+          - export PATH="/usr/lib/ccache:$PATH"
+          - mkdir -p build/10bit && cd build/10bit
+          - cmake ../../source -DCMAKE_BUILD_TYPE=Release -DHIGH_BIT_DEPTH=ON -DENABLE_SHARED=OFF -DENABLE_CLI=ON -DENABLE_ASSEMBLY=ON -DENABLE_TESTS=ON -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          - make -j$(nproc)
+          - test -f x265 && echo "10-bit CLI binary OK"
+          - ./x265 --version
+          - ./test/TestBench || (echo "10-bit TestBench FAILED"; exit 1)
+        artifacts:
+          - build/10bit/libx265.a
+
+    - step: &build-12bit
+        name: "Build 12-bit"
+        caches:
+          - ccache
+        script:
+          - export DEBIAN_FRONTEND=noninteractive
+          - apt-get update -qq
+          - apt-get install -y -qq build-essential cmake nasm ccache libnuma-dev git > /dev/null 2>&1
+          - export PATH="/usr/lib/ccache:$PATH"
+          - mkdir -p build/12bit && cd build/12bit
+          - cmake ../../source -DCMAKE_BUILD_TYPE=Release -DHIGH_BIT_DEPTH=ON -DMAIN12=ON -DENABLE_SHARED=OFF -DENABLE_CLI=ON -DENABLE_ASSEMBLY=ON -DENABLE_TESTS=ON -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          - make -j$(nproc)
+          - test -f x265 && echo "12-bit CLI binary OK"
+          - ./x265 --version
+          - ./test/TestBench || (echo "12-bit TestBench FAILED"; exit 1)
+        artifacts:
+          - build/12bit/libx265.a
+
+    - step: &build-multilib
+        name: "Build Multi-lib (8+10+12 bit)"
+        caches:
+          - ccache
+        script:
+          - export DEBIAN_FRONTEND=noninteractive
+          - apt-get update -qq
+          - apt-get install -y -qq build-essential cmake nasm ccache libnuma-dev git > /dev/null 2>&1
+          - export PATH="/usr/lib/ccache:$PATH"
+          - mkdir -p build/multilib
+          - mkdir -p build/multilib/12bit && cd build/multilib/12bit
+          - cmake ../../../source -DCMAKE_BUILD_TYPE=Release -DHIGH_BIT_DEPTH=ON -DMAIN12=ON -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DEXPORT_C_API=OFF -DENABLE_ASSEMBLY=ON -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          - make -j$(nproc)
+          - cd ../../..
+          - mkdir -p build/multilib/10bit && cd build/multilib/10bit
+          - cmake ../../../source -DCMAKE_BUILD_TYPE=Release -DHIGH_BIT_DEPTH=ON -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DEXPORT_C_API=OFF -DENABLE_ASSEMBLY=ON -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          - make -j$(nproc)
+          - cd ../../..
+          - mkdir -p build/multilib/8bit && cd build/multilib/8bit
+          - ln -sf ../../multilib/10bit/libx265.a libx265_main10.a
+          - ln -sf ../../multilib/12bit/libx265.a libx265_main12.a
+          - cmake ../../../source -DCMAKE_BUILD_TYPE=Release -DENABLE_SHARED=ON -DENABLE_CLI=ON -DENABLE_ASSEMBLY=ON -DEXTRA_LIB="x265_main10.a;x265_main12.a" -DEXTRA_LINK_FLAGS="-L." -DLINKED_10BIT=ON -DLINKED_12BIT=ON -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          - make -j$(nproc)
+          - test -f x265 && echo "Multi-lib CLI binary OK"
+          - ./x265 --version
+        artifacts:
+          - build/multilib/8bit/x265
+          - build/multilib/8bit/libx265.so*
+          - build/multilib/8bit/libx265.a
+
+    - step: &test-cli-presets
+        name: "Test CLI - All Presets & Tune Modes"
+        script:
+          - export DEBIAN_FRONTEND=noninteractive
+          - apt-get update -qq
+          - apt-get install -y -qq build-essential cmake nasm libnuma-dev ffmpeg > /dev/null 2>&1
+          - mkdir -p build/8bit && cd build/8bit
+          - cmake ../../source -DCMAKE_BUILD_TYPE=Release -DENABLE_CLI=ON -DENABLE_ASSEMBLY=ON
+          - make -j$(nproc)
+          - cd ../..
+          - mkdir -p testdata
+          - ffmpeg -y -f lavfi -i testsrc=duration=2:size=416x240:rate=30 -pix_fmt yuv420p testdata/test_420_416x240.yuv
+          - X265=build/8bit/x265
+          - PASS=0
+          - FAIL=0
+          - TOTAL=0
+          - |
+            run_x265() {
+              local label="$1"; shift
+              echo "=== $label ==="
+              "$@" 2>&1 | tee /tmp/x265_last.log
+              local rc=${PIPESTATUS[0]}
+              local summary=$(grep "encoded" /tmp/x265_last.log | tail -1)
+              [ -n "$summary" ] && printf "  %-45s | %s\n" "$label" "$summary" >> /tmp/summary.log
+              return $rc
+            }
+            echo "" > /tmp/summary.log
+          - |
+            for preset in ultrafast superfast veryfast faster fast medium slow slower veryslow placebo; do
+              TOTAL=$((TOTAL + 1))
+              if run_x265 "Preset $preset" $X265 --preset $preset --crf 28 --input-res 416x240 --fps 30 --frames 30 -o /tmp/out_${preset}.hevc testdata/test_420_416x240.yuv; then
+                PASS=$((PASS + 1))
+              else
+                FAIL=$((FAIL + 1))
+              fi
+              rm -f /tmp/out_${preset}.hevc
+            done
+          - |
+            for tune in psnr ssim grain fastdecode zerolatency; do
+              TOTAL=$((TOTAL + 1))
+              if run_x265 "Tune $tune" $X265 --preset medium --tune $tune --crf 28 --input-res 416x240 --fps 30 --frames 30 -o /tmp/out_tune_${tune}.hevc testdata/test_420_416x240.yuv; then
+                PASS=$((PASS + 1))
+              else
+                FAIL=$((FAIL + 1))
+              fi
+              rm -f /tmp/out_tune_${tune}.hevc
+            done
+          - echo ""
+          - echo "==================== ENCODING SUMMARY ===================="
+          - cat /tmp/summary.log 2>/dev/null || true
+          - echo "=========================================================="
+          - echo "=== Preset/Tune Results $PASS/$TOTAL passed, $FAIL failed ==="
+          - mkdir -p summaries && cp /tmp/summary.log summaries/cli-presets.log 2>/dev/null || true
+          - test $FAIL -eq 0
+        artifacts:
+          - summaries/*.log
+
+    - step: &test-rate-control
+        name: "Test Rate Control Modes"
+        script:
+          - export DEBIAN_FRONTEND=noninteractive
+          - apt-get update -qq
+          - apt-get install -y -qq build-essential cmake nasm libnuma-dev ffmpeg > /dev/null 2>&1
+          - mkdir -p build/8bit && cd build/8bit
+          - cmake ../../source -DCMAKE_BUILD_TYPE=Release -DENABLE_CLI=ON -DENABLE_ASSEMBLY=ON
+          - make -j$(nproc)
+          - cd ../..
+          - mkdir -p testdata
+          - ffmpeg -y -f lavfi -i testsrc=duration=3:size=832x480:rate=30 -pix_fmt yuv420p testdata/test_420_832x480.yuv
+          - X265=build/8bit/x265
+          - FAIL=0
+          - |
+            run_x265() {
+              local label="$1"; shift
+              echo "=== $label ==="
+              "$@" 2>&1 | tee /tmp/x265_last.log
+              local rc=${PIPESTATUS[0]}
+              local summary=$(grep "encoded" /tmp/x265_last.log | tail -1)
+              [ -n "$summary" ] && printf "  %-45s | %s\n" "$label" "$summary" >> /tmp/summary.log
+              return $rc
+            }
+            echo "" > /tmp/summary.log
+          - run_x265 "CRF Mode" $X265 --preset medium --crf 22 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_crf.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "CQP Mode" $X265 --preset medium --qp 28 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_cqp.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "ABR Mode" $X265 --preset medium --bitrate 1000 -F4 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_abr.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "VBV Constrained" $X265 --preset medium --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1000 -F4 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_vbv.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "Strict CBR" $X265 --preset medium --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1000 --strict-cbr -F4 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_cbr.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "CRF + VBV" $X265 --preset medium --crf 22 --vbv-maxrate 2000 --vbv-bufsize 3000 --crf-max 32 --crf-min 18 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_crf_vbv.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "Two-pass ABR pass1" $X265 --preset medium --bitrate 1000 --pass 1 -F4 --input-res 832x480 --fps 30 --frames 60 -o /dev/null testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "Two-pass ABR pass2" $X265 --preset medium --bitrate 1000 --pass 2 -F4 --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_2pass.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "Multi-pass Opt Analysis pass1" $X265 --preset medium --bitrate 1000 --pass 1 --multi-pass-opt-analysis --input-res 832x480 --fps 30 --frames 60 -o /dev/null testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - run_x265 "Multi-pass Opt Analysis pass2" $X265 --preset medium --bitrate 1000 --pass 2 --multi-pass-opt-analysis --ssim --psnr --input-res 832x480 --fps 30 --frames 60 -o /tmp/out_mp_opt.hevc testdata/test_420_832x480.yuv || FAIL=$((FAIL + 1))
+          - rm -f /tmp/out_*.hevc x265_2pass.log x265_2pass.log.cutree
+          - echo ""
+          - echo "==================== ENCODING SUMMARY ===================="
+          - cat /tmp/summary.log 2>/dev/null || true
+          - echo "=========================================================="
+          - echo "=== Rate Control Results FAIL=$FAIL ==="
+          - mkdir -p summaries && cp /tmp/summary.log summaries/rate-control.log 2>/dev/null || true
+          - test $FAIL -eq 0
+        artifacts:
+          - summaries/*.log
x265-4.1.tar/doc/reST/releasenotes.rst -> x265-4.2.tar/doc/reST/releasenotes.rst
Changed
@@ -2,6 +2,53 @@
 Release Notes
 *************
 
+Version 4.2
+===========
+
+Release date - 19th April, 2026.
+
+New feature
+-----------
+1. Threaded Motion Estimation (Experimental feature) - It uses a dedicated threadpool to precompute Motion Estimation in parallel. Improves encoding speed up to 1.5x for 1080p & lower resolution on multi core machines with low frequency setting. On high frequency systems or on machines with low number of cores, the overhead of additional Motion estimation work may outweigh parallelism
+
+Enhancements to existing features
+---------------------------------
+
+1. Add new Levels 6.3 to 7.2 specified in ITU-T H.265 (V9) (09/2023) and above
+2. Improve Slices feature with check zeroMv
+3. Enable frame parallelism with MCSTF feature
+4. Updated support to signal AOM FGM params
+5. Improve quality with SBRC feature
+6. Updated DolbyVision P5 VUI defaults
+
+API changes
+-----------
+1. API Support to enable Threaded Motion Estimation (--threaded-me)
+
+Optimizations
+-------------
+1. RISC V optimizations including SAD, SATD, DCT, IDCT, block copy, pixel utilities, SAO, loopfilter, transpose kernels resulting in 2x encoding speed.
+2. ARM SIMD optimizations including the use of NEON and SVE instruction set extensions. The following algorithms now have optimized SIMD implementations: DST, IDCT, SSE, SSD, intra_pred_planar, pelFilterLumaStrong, interpolation, planecopy, dequant_normal, blockcopy, pixel variance resulting in 8% faster encoding speed compared to v4.1
+
+Bug fixes
+---------
+1. Fix memory leaks (no command line option, SEI buffer, analysis save/load)
+2. Fix chroma qp offset for non yuv444 inputs
+3. Fix max supported input resolution
+4. Fix bugs with ARM SIMD optimizations
+5. Fix Alpha and Multiview feature flag support in x265_config
+6. Fix test harness issues, CMake errors
+7. Fix inconsistent output with aq-motion
+8. Fix crash with hist-scenecut on high bit-depth builds
+9. Fix lookahead concurrency bug
+10. Fix shared link issue (R_X86_64_PC32), yuv recon output issue, rd-refine and dynamic-refine issue, inputs for Windows named pipe, weighted prediction delta_chroma_offset, crf and vbv issue in abr-ladder, psnr and ssim reported with MCSTF feature, internally overflowed VBV variables
+
+Known issues
+------------
+1. Output mismatch between analysis save & load with cutree with reuse level < 10
+2. Inconsistent hash mismatch with abr-ladder feature
+3. Performance regression observed with threaded-me feature on high frequency systems and for higher resolutions (4k)
+
 Version 4.1
 ===========
x265-4.1.tar/source/CMakeLists.txt -> x265-4.2.tar/source/CMakeLists.txt
Changed
@@ -6,18 +6,19 @@
         FORCE)
 endif()
 message(STATUS "cmake version ${CMAKE_VERSION}")
-if(POLICY CMP0025)
-    cmake_policy(SET CMP0025 OLD) # report Apple's Clang as just Clang
-endif()
+
 if(POLICY CMP0042)
     cmake_policy(SET CMP0042 NEW) # MACOSX_RPATH
 endif()
-if(POLICY CMP0054)
-    cmake_policy(SET CMP0054 OLD) # Only interpret if() arguments as variables or keywords when unquoted
+
+cmake_minimum_required (VERSION 2.8.8...3.10) # OBJECT libraries require 2.8.8
+
+if (POLICY CMP0075)
+    cmake_policy(SET CMP0075 NEW) # CMAKE_REQUIRED_LIBRARIES warning
 endif()
+
 project (x265)
-cmake_minimum_required (VERSION 2.8.8) # OBJECT libraries require 2.8.8
 include(CheckIncludeFiles)
 include(CheckFunctionExists)
 include(CheckSymbolExists)
@@ -31,7 +32,7 @@
 option(STATIC_LINK_CRT "Statically link C and C++ runtimes for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 215)
+set(X265_BUILD 216)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -48,11 +49,13 @@
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
 set(ARM_ALIASES armv6l armv7l)
 set(ARM64_ALIASES arm64 arm64e aarch64)
+set(RISCV64_ALIASES riscv64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
 set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
+list(FIND RISCV64_ALIASES "${SYSPROC}" RISCV64MATCH)
 if(X86MATCH GREATER "-1")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
@@ -99,6 +102,7 @@
     option(ENABLE_NEON_I8MM "Enable Neon I8MM" ON)
     option(ENABLE_SVE "Enable SVE" ON)
     option(ENABLE_SVE2 "Enable SVE2" ON)
+    option(ENABLE_SVE2_BITPERM "Enable SVE2 BitPerm" ON)
 
     # Compiler flags for AArch64 extensions.
     set(AARCH64_NEON_FLAG "-march=armv8-a")
@@ -109,11 +113,34 @@
     set(AARCH64_SVE_FLAG "-march=armv8.2-a+dotprod+i8mm+sve")
     # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod and +sve.
     set(AARCH64_SVE2_FLAG "-march=armv9-a+i8mm+sve2")
+    # SVE2 BitPerm implies +dotprod, +sve, and +sve2.
+    set(AARCH64_SVE2_BITPERM_FLAG "-march=armv9-a+i8mm+sve2-bitperm")
+elseif(RISCV64MATCH GREATER "-1")
+    message(STATUS "Detected RISCV64 target processor")
+    set(RISCV64 1)
+
+    option(RISCV64_RUNTIME_CPU_DETECT "Enable RISCV64 run-time CPU feature detection" ON)
+
+    option(ENABLE_RVV "Enable RVV" ON)
+
+    # Compiler flags only for riscv64 intrinsic file.
+    set(RISCV64_INTRINSIC_FLAG "-march=rv64gcv")
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
 endif()
 
+if(X64)
+    option(ENABLE_CET "Enable Control-flow Enforcement" OFF)
+    if(ENABLE_CET)
+        add_definitions(-DENABLE_CET=1)
+        list(APPEND ASM_FLAGS "-DENABLE_CET=1")
+    else()
+        add_definitions(-DENABLE_CET=0)
+        list(APPEND ASM_FLAGS "-DENABLE_CET=0")
+    endif()
+endif(X64)
+
 if(UNIX)
     list(APPEND PLATFORM_LIBS pthread)
     find_library(LIBRT rt)
@@ -152,9 +179,16 @@
     if(ENABLE_LIBVMAF)
         add_definitions(-DENABLE_LIBVMAF)
     endif()
+    if(X64)
+        option(ENABLE_CET "Enable Control-flow Enforcement" OFF)
+        if(ENABLE_CET)
+            add_definitions(-DENABLE_CET)
+            list(APPEND ASM_FLAGS "-DENABLE_CET=1")
+        endif()
+    endif(X64)
 endif(UNIX)
 
-if((X64 AND NOT WIN32) OR ARM64)
+if((X64 AND NOT WIN32) OR ARM64 OR PPC64 OR RISCV64)
     option(ENABLE_PIC "Enable Position Independent Code" ON)
 else()
     option(ENABLE_PIC "Enable Position Independent Code" OFF)
@@ -168,7 +202,7 @@
     add_definitions(-DMACOS=1)
 endif()
 
-if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang")
     set(CLANG 1)
 endif()
 if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
@@ -295,12 +329,14 @@
         set(CPU_HAS_NEON_I8MM 1)
         set(CPU_HAS_SVE 1)
         set(CPU_HAS_SVE2 1)
+        set(CPU_HAS_SVE2_BITPERM 1)
     else()
         if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
             find_package(NEON_DOTPROD)
             find_package(NEON_I8MM)
             find_package(SVE)
             find_package(SVE2)
+            find_package(SVE2_BITPERM)
         else()
             message(STATUS "Compile-time CPU feature detection unsupported on this platform")
         endif()
@@ -311,6 +347,11 @@
     string(APPEND CMAKE_REQUIRED_FLAGS " ${AARCH64_SVE_FLAG}")
     set(OLD_CMAKE_TRY_COMPILE_TARGET_TYPE ${CMAKE_TRY_COMPILE_TARGET_TYPE})
     set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+    # When compilation target is a STATIC_LIBRARY, the LINK_OPTIONS are
+    # passed to the archiver, so we must backup, clear and restore these.
+    # https://gitlab.kitware.com/cmake/cmake/-/issues/23454
+    set(OLD_CMAKE_REQUIRED_LINK_OPTIONS ${CMAKE_REQUIRED_LINK_OPTIONS})
+    set(CMAKE_REQUIRED_LINK_OPTIONS "")
 
     # Check whether the compiler can compile SVE functions that require
     # backup/restore of SVE registers according to AAPCS.
@@ -340,6 +381,7 @@
 
     set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
     set(CMAKE_TRY_COMPILE_TARGET_TYPE ${OLD_CMAKE_TRY_COMPILE_TARGET_TYPE})
+    set(CMAKE_REQUIRED_LINK_OPTIONS ${OLD_CMAKE_REQUIRED_LINK_OPTIONS})
     if (SVE_COMPILATION_C_TEST_COMPILED AND SVE_COMPILATION_CXX_TEST_COMPILED)
         if (SVE_HEADER_C_TEST_COMPILED AND SVE_HEADER_CXX_TEST_COMPILED)
             add_definitions(-DHAVE_SVE_BRIDGE=1)
@@ -376,6 +418,11 @@
         if(NOT ENABLE_SVE2)
             message(STATUS "Disabling SVE2")
             set(CPU_HAS_SVE2 0)
+            set(ENABLE_SVE2_BITPERM 0)
+        endif()
+        if(NOT ENABLE_SVE2_BITPERM)
+            message(STATUS "Disabling SVE2 BitPerm")
+            set(CPU_HAS_SVE2_BITPERM 0)
         endif()
 
         if(CPU_HAS_NEON)
@@ -398,6 +445,10 @@
             message(STATUS "Found SVE2")
             add_definitions(-DHAVE_SVE2=1)
         endif()
+        if(CPU_HAS_SVE2_BITPERM)
+            message(STATUS "Found SVE2 BitPerm")
+            add_definitions(-DHAVE_SVE2_BITPERM=1)
+        endif()
         set(ARM_ARGS -O3)
         # Do not allow implicit vector type conversions in Clang builds (this
         # is already the default in GCC builds).
@@ -406,6 +457,87 @@
         set(ARM_ARGS ${ARM_ARGS} -flax-vector-conversions=none)
     endif()
 endif()
+if(RISCV64)
+    add_definitions(-DX265_ARCH_RISCV64=1)
+
+    if (RISCV64_RUNTIME_CPU_DETECT)
+        add_definitions(-DRISCV64_RUNTIME_CPU_DETECT=1)
+        message(STATUS "Configuring build for run-time CPU feature detection")
+    endif()
+
+    if(RISCV64_RUNTIME_CPU_DETECT OR CROSS_COMPILE_RISCV64)
+        # Add all extensions when compiling for run-time CPU feature detection or cross compiling.
+        set(CPU_HAS_RVV 1)
+    else()
+        if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+            find_package(RVV)
+        else()
+            message(STATUS "Compile-time CPU feature detection unsupported on this platform")
+        endif()
+    endif()
+
x265-4.1.tar/source/abrEncApp.cpp -> x265-4.2.tar/source/abrEncApp.cpp
Changed
@@ -933,8 +933,6 @@
 
         general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
             m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
-        api->param_free(m_param);
-
         X265_FREE(errorBuf);
         X265_FREE(rpuPayload);
x265-4.1.tar/source/cmake/FindNasm.cmake -> x265-4.2.tar/source/cmake/FindNasm.cmake
Changed
@@ -20,6 +20,6 @@
 endif()
 
 # Provide standardized success/failure messages
-find_package_handle_standard_args(nasm
+find_package_handle_standard_args(Nasm
                                   REQUIRED_VARS NASM_EXECUTABLE
                                   VERSION_VAR NASM_VERSION_STRING)
x265-4.1.tar/source/cmake/FindNuma.cmake -> x265-4.2.tar/source/cmake/FindNuma.cmake
Changed
@@ -40,4 +40,4 @@
 
 mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARY_DIR NUMA_LIBRARY)
 
-find_package_handle_standard_args(NUMA REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY)
+find_package_handle_standard_args(Numa REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY)
x265-4.2.tar/source/cmake/FindSVE2_BITPERM.cmake
Added
@@ -0,0 +1,14 @@
+include(FindPackageHandleStandardArgs)
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep svebitperm
+                    OUTPUT_VARIABLE sve2_bitperm_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(sve2_bitperm_version)
+    set(CPU_HAS_SVE2_BITPERM 1)
+endif()
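For context, a minimal sketch of how a find module like this is consumed at configure time (find_package(SVE2_BITPERM), CPU_HAS_SVE2_BITPERM, and the HAVE_SVE2_BITPERM define all appear in the source/CMakeLists.txt diff above; the standalone framing here is an illustrative assumption):

    # Make the module discoverable, probe /proc/cpuinfo via the module,
    # then surface the result to the compiler, mirroring the x265 build.
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/source/cmake")
    find_package(SVE2_BITPERM)
    if(CPU_HAS_SVE2_BITPERM)
        add_definitions(-DHAVE_SVE2_BITPERM=1)
    endif()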
x265-4.1.tar/source/common/CMakeLists.txt -> x265-4.2.tar/source/common/CMakeLists.txt
Changed
@@ -105,23 +105,25 @@
     # Add Arm intrinsics files here.
     set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
-    set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
+    set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp)
     set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
-    set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
+    set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp pixel-prim-sve.cpp)
     set(C_SRCS_SVE2 sao-prim-sve2.cpp)
     enable_language(ASM)
 
     # Add Arm assembly files here.
-    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
-    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
-    set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
+    set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+    set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
+    set(A_SRCS_SVE2_BITPERM pixel-util-sve2-bitperm.S)
     set(VEC_PRIMITIVES)
 
-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
-    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "Arm Assembly Sources")
     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "Arm Assembly Sources that use the SVE extension")
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 extension")
+    set(ARM_ASMS_SVE2_BITPERM "${A_SRCS_SVE2_BITPERM}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 BitPerm extension")
     foreach(SRC ${C_SRCS_NEON})
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
         set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
@@ -162,6 +164,39 @@
     endif()
 endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
 
+if(ENABLE_ASSEMBLY AND (RISCV64 OR CROSS_COMPILE_RISCV64))
+    if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+        message(STATUS "Detected CXX compiler using -O3 optimization level")
+        add_definitions(-DAUTO_VECTORIZE=1)
+    endif()
+
+    set(ASM_PRIMITIVES ${ASM_PRIMITIVES} riscv64/asm-primitives.cpp)
+
+    # Add riscv64 intrinsics files here.
+    set(INTRINSIC_SRCS_RVV pixel-prim.cpp sao-prim.cpp filter-prim.cpp intrapred-prim.cpp riscv64_utils.cpp)
+    if(CPU_HAS_RVV_INTRINSIC)
+        foreach(SRC ${INTRINSIC_SRCS_RVV})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} riscv64/${SRC})
+            set_source_files_properties(riscv64/${SRC} PROPERTIES COMPILE_FLAGS ${RISCV64_INTRINSIC_FLAG})
+        endforeach()
+    endif()
+
+    enable_language(ASM)
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
+
+    # Add riscv64 assembly files here.
+    set(A_SRCS asm.S blockcopy8.S dct.S sad-a.S ssd-a.S pixel-util.S mc-a.S p2s.S sao.S loopfilter.S intrapred.S riscv64_utils.S)
+    set(VEC_PRIMITIVES)
+
+    if(CPU_HAS_RVV)
+        set(RISCV64_ASMS "${A_SRCS}" CACHE INTERNAL "RISCV64 Assembly Sources")
+    endif()
+
+    if(RISCV64_WARNINGS_AS_ERRORS)
+        set_source_files_properties(${ASM_PRIMITIVES} PROPERTIES COMPILE_FLAGS -Werror)
+    endif()
+endif(ENABLE_ASSEMBLY AND (RISCV64 OR CROSS_COMPILE_RISCV64))
+
 if(POWER)
     set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
     if(ENABLE_ALTIVEC)
x265-4.1.tar/source/common/aarch64/asm-primitives.cpp -> x265-4.2.tar/source/common/aarch64/asm-primitives.cpp
Changed
@@ -380,6 +380,7 @@
 
 #include "pixel-prim.h"
 #include "filter-prim.h"
+#include "filter-prim-sve.h"
 #include "dct-prim.h"
 #include "loopfilter-prim.h"
 #include "intrapred-prim.h"
@@ -403,82 +404,6 @@
     ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
     ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
 
-#if !HIGH_BIT_DEPTH
-    // Blockcopy_pp
-    ALL_LUMA_PU(copy_pp, blockcopy_pp, neon);
-    ALL_CHROMA_420_PU(copy_pp, blockcopy_pp, neon);
-    ALL_CHROMA_422_PU(copy_pp, blockcopy_pp, neon);
-    p.cu[BLOCK_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
-    p.cu[BLOCK_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
-    p.cu[BLOCK_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
-    p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
-    p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
-
-#endif // !HIGH_BIT_DEPTH
-
-    // Blockcopy_ss
-    p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
-    p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
-    p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
-    p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
-    p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
-
-    // Blockcopy_ps
-    p.cu[BLOCK_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
-    p.cu[BLOCK_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
-    p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
-    p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
-    p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
-
-    // Blockcopy_sp
-    p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
-    p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
-    p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
-    p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
-    p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
-
-    // chroma blockcopy_ss
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss = PFX(blockcopy_ss_4x8_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss = PFX(blockcopy_ss_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_neon);
-
-    // chroma blockcopy_ps
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps = PFX(blockcopy_ps_4x8_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ps = PFX(blockcopy_ps_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_neon);
-
-    // chroma blockcopy_sp
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp = PFX(blockcopy_sp_4x8_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp = PFX(blockcopy_sp_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
-
-    // Block_fill
-    ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, neon);
-    ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, neon);
-
     // copy_count
     p.cu[BLOCK_4x4].copy_cnt = PFX(copy_cnt_4_neon);
     p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_neon);
@@ -491,52 +416,6 @@
     p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
     p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
 
-    // cpy2Dto1D_shl
-    p.cu[BLOCK_4x4].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
-    p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
-    p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon);
-    p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon);
-    p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon);
-
-    // cpy2Dto1D_shr
-    p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
-    p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
-    p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
-    p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
-
-    // cpy1Dto2D_shl
-    p.cu[BLOCK_4x4].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
-    p.cu[BLOCK_8x8].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
-    p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon);
-    p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon);
-    p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon);
-
-    p.cu[BLOCK_4x4].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
-    p.cu[BLOCK_8x8].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
-    p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon);
-    p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon);
-    p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon);
-
-    // cpy1Dto2D_shr
-    p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
-    p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
-    p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon);
-    p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
-    p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
-
-#if !HIGH_BIT_DEPTH
-    // pixel_avg_pp
-    ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
-    ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon);
-
-    // addAvg
-    ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon);
-    ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon);
-    ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon);
-    ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon);
-    ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
-    ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
-
     // sad
     ALL_LUMA_PU(sad, pixel_sad, neon);
     ALL_LUMA_PU(sad_x3, sad_x3, neon);
@@ -570,17 +449,18 @@
     p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon);
     p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
     p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+    p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_neon);
     p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon);
     p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon);
     p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
     p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+    p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_neon);
 
-    // pixel_var
-    p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
-    p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
-    p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon);
-    p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon);
+#if !HIGH_BIT_DEPTH
+    // pixel_avg_pp
+    ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
+    ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon);
 
     // calc_Residual
     p.cu[BLOCK_4x4].calcresidual[NONALIGNED] = PFX(getResidual4_neon);
@@ -610,38 +490,6 @@
     p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
 
-    // pixel_add_ps
-    p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
-    p.cu[BLOCK_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
-    p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
-    p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
-    p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
-
-    p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
-    p.cu[BLOCK_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
-    p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
-    p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
-    p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_neon);
-
-    // chroma add_ps
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x8_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
-
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED] = PFX(pixel_add_ps_4x8_neon);
x265-4.1.tar/source/common/aarch64/blockcopy8.S -> x265-4.2.tar/source/common/aarch64/blockcopy8.S
Changed
@@ -22,7 +22,6 @@
  *****************************************************************************/
 
 #include "asm.S"
-#include "blockcopy8-common.S"
 
 #ifdef __APPLE__
 .section __RODATA,__rodata
@@ -34,773 +33,6 @@
 
 .text
 
-/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
- *
- * r0 - a
- * r1 - stridea
- * r2 - b
- * r3 - strideb */
-function PFX(blockcopy_sp_4x4_neon)
-    lsl x3, x3, #1
-.rept 2
-    ld1 {v0.8h}, [x2], x3
-    ld1 {v1.8h}, [x2], x3
-    xtn v0.8b, v0.8h
-    xtn v1.8b, v1.8h
-    st1 {v0.s}[0], [x0], x1
-    st1 {v1.s}[0], [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_sp_8x8_neon)
-    lsl x3, x3, #1
-.rept 4
-    ld1 {v0.8h}, [x2], x3
-    ld1 {v1.8h}, [x2], x3
-    xtn v0.8b, v0.8h
-    xtn v1.8b, v1.8h
-    st1 {v0.d}[0], [x0], x1
-    st1 {v1.d}[0], [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_sp_16x16_neon)
-    lsl x3, x3, #1
-    movrel x11, xtn_xtn2_table
-    ld1 {v31.16b}, [x11]
-.rept 8
-    ld1 {v0.8h-v1.8h}, [x2], x3
-    ld1 {v2.8h-v3.8h}, [x2], x3
-    tbl v0.16b, {v0.16b,v1.16b}, v31.16b
-    tbl v1.16b, {v2.16b,v3.16b}, v31.16b
-    st1 {v0.16b}, [x0], x1
-    st1 {v1.16b}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_sp_32x32_neon)
-    mov w12, #4
-    lsl x3, x3, #1
-    movrel x11, xtn_xtn2_table
-    ld1 {v31.16b}, [x11]
-.Loop_csp32:
-    sub w12, w12, #1
-.rept 4
-    ld1 {v0.8h-v3.8h}, [x2], x3
-    ld1 {v4.8h-v7.8h}, [x2], x3
-    tbl v0.16b, {v0.16b,v1.16b}, v31.16b
-    tbl v1.16b, {v2.16b,v3.16b}, v31.16b
-    tbl v2.16b, {v4.16b,v5.16b}, v31.16b
-    tbl v3.16b, {v6.16b,v7.16b}, v31.16b
-    st1 {v0.16b-v1.16b}, [x0], x1
-    st1 {v2.16b-v3.16b}, [x0], x1
-.endr
-    cbnz w12, .Loop_csp32
-    ret
-endfunc
-
-function PFX(blockcopy_sp_64x64_neon)
-    mov w12, #16
-    lsl x3, x3, #1
-    sub x3, x3, #64
-    movrel x11, xtn_xtn2_table
-    ld1 {v31.16b}, [x11]
-.Loop_csp64:
-    sub w12, w12, #1
-.rept 4
-    ld1 {v0.8h-v3.8h}, [x2], #64
-    ld1 {v4.8h-v7.8h}, [x2], x3
-    tbl v0.16b, {v0.16b,v1.16b}, v31.16b
-    tbl v1.16b, {v2.16b,v3.16b}, v31.16b
-    tbl v2.16b, {v4.16b,v5.16b}, v31.16b
-    tbl v3.16b, {v6.16b,v7.16b}, v31.16b
-    st1 {v0.16b-v3.16b}, [x0], x1
-.endr
-    cbnz w12, .Loop_csp64
-    ret
-endfunc
-
-// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
-function PFX(blockcopy_ps_4x4_neon)
-    lsl x1, x1, #1
-.rept 2
-    ld1 {v0.8b}, [x2], x3
-    ld1 {v1.8b}, [x2], x3
-    uxtl v0.8h, v0.8b
-    uxtl v1.8h, v1.8b
-    st1 {v0.4h}, [x0], x1
-    st1 {v1.4h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ps_8x8_neon)
-    lsl x1, x1, #1
-.rept 4
-    ld1 {v0.8b}, [x2], x3
-    ld1 {v1.8b}, [x2], x3
-    uxtl v0.8h, v0.8b
-    uxtl v1.8h, v1.8b
-    st1 {v0.8h}, [x0], x1
-    st1 {v1.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ps_16x16_neon)
-    lsl x1, x1, #1
-.rept 8
-    ld1 {v4.16b}, [x2], x3
-    ld1 {v5.16b}, [x2], x3
-    uxtl v0.8h, v4.8b
-    uxtl2 v1.8h, v4.16b
-    uxtl v2.8h, v5.8b
-    uxtl2 v3.8h, v5.16b
-    st1 {v0.8h-v1.8h}, [x0], x1
-    st1 {v2.8h-v3.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ps_32x32_neon)
-    lsl x1, x1, #1
-    mov w12, #4
-.Loop_cps32:
-    sub w12, w12, #1
-.rept 4
-    ld1 {v16.16b-v17.16b}, [x2], x3
-    ld1 {v18.16b-v19.16b}, [x2], x3
-    uxtl v0.8h, v16.8b
-    uxtl2 v1.8h, v16.16b
-    uxtl v2.8h, v17.8b
-    uxtl2 v3.8h, v17.16b
-    uxtl v4.8h, v18.8b
-    uxtl2 v5.8h, v18.16b
-    uxtl v6.8h, v19.8b
-    uxtl2 v7.8h, v19.16b
-    st1 {v0.8h-v3.8h}, [x0], x1
-    st1 {v4.8h-v7.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_cps32
-    ret
-endfunc
-
-function PFX(blockcopy_ps_64x64_neon)
-    lsl x1, x1, #1
-    sub x1, x1, #64
-    mov w12, #16
-.Loop_cps64:
-    sub w12, w12, #1
-.rept 4
-    ld1 {v16.16b-v19.16b}, [x2], x3
-    uxtl v0.8h, v16.8b
-    uxtl2 v1.8h, v16.16b
-    uxtl v2.8h, v17.8b
-    uxtl2 v3.8h, v17.16b
-    uxtl v4.8h, v18.8b
-    uxtl2 v5.8h, v18.16b
-    uxtl v6.8h, v19.8b
-    uxtl2 v7.8h, v19.16b
-    st1 {v0.8h-v3.8h}, [x0], #64
-    st1 {v4.8h-v7.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_cps64
-    ret
-endfunc
-
-// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
-function PFX(blockcopy_ss_4x4_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 2
-    ld1 {v0.8b}, [x2], x3
-    ld1 {v1.8b}, [x2], x3
-    st1 {v0.8b}, [x0], x1
-    st1 {v1.8b}, [x0], x1
-.endr
-    ret
x265-4.1.tar/source/common/aarch64/cpu.h -> x265-4.2.tar/source/common/aarch64/cpu.h
Changed
@@ -116,6 +116,14 @@
     }
 #endif // defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
 #endif // HAVE_SVE2
+#if HAVE_SVE2_BITPERM
+#if defined(PF_ARM_SVE_BITPERM_INSTRUCTIONS_AVAILABLE)
+    if (IsProcessorFeaturePresent(PF_ARM_SVE_BITPERM_INSTRUCTIONS_AVAILABLE))
+    {
+        flags |= X265_CPU_SVE2_BITPERM;
+    }
+#endif // defined(PF_ARM_SVE_BITPERM_INSTRUCTIONS_AVAILABLE)
+#endif // HAVE_SVE2_BITPERM
 
     return flags;
 }
@@ -126,6 +134,7 @@
 #define X265_AARCH64_HWCAP_ASIMDDP (1 << 20)
 #define X265_AARCH64_HWCAP_SVE (1 << 22)
 #define X265_AARCH64_HWCAP2_SVE2 (1 << 1)
+#define X265_AARCH64_HWCAP2_SVEBITPERM (1 << 4)
 #define X265_AARCH64_HWCAP2_I8MM (1 << 13)
 
 static inline int aarch64_get_cpu_flags()
@@ -135,7 +144,7 @@
 #if HAVE_NEON_DOTPROD || HAVE_SVE
     unsigned long hwcap = getauxval(AT_HWCAP);
 #endif
-#if HAVE_NEON_I8MM || HAVE_SVE2
+#if HAVE_NEON_I8MM || HAVE_SVE2 || HAVE_SVE2_BITPERM
     unsigned long hwcap2 = getauxval(AT_HWCAP2);
 #endif
 
@@ -154,6 +163,9 @@
 #if HAVE_SVE2
     if (hwcap2 & X265_AARCH64_HWCAP2_SVE2) flags |= X265_CPU_SVE2;
 #endif
+#if HAVE_SVE2_BITPERM
+    if (hwcap2 & X265_AARCH64_HWCAP2_SVEBITPERM) flags |= X265_CPU_SVE2_BITPERM;
+#endif
 
     return flags;
 }
@@ -179,6 +191,9 @@
     // Restrict flags: SVE2 assumes that FEAT_SVE is available.
     if (!(flags & X265_CPU_SVE)) flags &= ~X265_CPU_SVE2;
 
+    // Restrict flags: SVE2_BitPerm assumes that FEAT_SVE2 is available.
+    if (!(flags & X265_CPU_SVE2)) flags &= ~X265_CPU_SVE2_BITPERM;
+
     return flags;
 }
 
@@ -203,6 +218,9 @@
 #if HAVE_SVE2
     flags |= X265_CPU_SVE2;
 #endif
+#if HAVE_SVE2_BITPERM
+    flags |= X265_CPU_SVE2_BITPERM;
+#endif
 
     return flags;
 }
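For reference, a minimal standalone sketch of the same Linux HWCAP2 probe this header performs (the macro name and bit value 1 << 4 come from the diff above; the surrounding program is an illustrative assumption):

    #include <stdio.h>
    #include <sys/auxv.h>

    /* Same HWCAP2 bit that cpu.h tests for FEAT_SVE_BitPerm on AArch64 Linux. */
    #define X265_AARCH64_HWCAP2_SVEBITPERM (1 << 4)

    int main(void)
    {
        /* AT_HWCAP2 carries the second set of CPU feature bits from the kernel. */
        unsigned long hwcap2 = getauxval(AT_HWCAP2);
        printf("SVE2 BitPerm: %s\n",
               (hwcap2 & X265_AARCH64_HWCAP2_SVEBITPERM) ? "yes" : "no");
        return 0;
    }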
x265-4.1.tar/source/common/aarch64/dct-prim-sve.cpp -> x265-4.2.tar/source/common/aarch64/dct-prim-sve.cpp
Changed
@@ -135,9 +135,9 @@ } } -template<int shift> -static inline void partialButterfly16_sve(const int16_t *src, int16_t *dst) +static inline void pass1Butterfly16_sve(const int16_t *src, int16_t *dst, intptr_t srcStride) { + const int shift = 3 + X265_DEPTH - 8; const int line = 16; int16x8_t Oline; @@ -147,11 +147,11 @@ for (int i = 0; i < line; i += 2) { - int16x8_t s0_lo = vld1q_s16(src + i * line); - int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8)); + int16x8_t s0_lo = vld1q_s16(src + i * srcStride); + int16x8_t s0_hi = rev16(vld1q_s16(src + i * srcStride + 8)); - int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line); - int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8)); + int16x8_t s1_lo = vld1q_s16(src + (i + 1) * srcStride); + int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * srcStride + 8)); int32x4_t E02; E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi)); @@ -239,9 +239,114 @@ } } -template<int shift> -static inline void partialButterfly32_sve(const int16_t *src, int16_t *dst) +static inline void pass2Butterfly16_sve(const int16_t *src, int16_t *dst) +{ + const int shift = 10; + const int line = 16; + + int16x8_t Oline; + int32x4_t EOline; + int32x4_t EEEline; + int32x4_t EEOline; + + for (int i = 0; i < line; i += 2) + { + int16x8_t s0_lo = vld1q_s16(src + i * line); + int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8)); + + int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line); + int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8)); + + int32x4_t E02; + E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi)); + E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi)); + + int32x4_t E12; + E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi)); + E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi)); + + Oi + 0 = vsubq_s16(s0_lo, s0_hi); + Oi + 1 = vsubq_s16(s1_lo, s1_hi); + + EOi + 0 = vsubq_s32(E00, rev32(E01)); + EOi + 1 = vsubq_s32(E10, rev32(E11)); + + int32x4_t EE0 = vaddq_s32(E00, rev32(E01)); + int32x4_t EE1 = vaddq_s32(E10, rev32(E11)); + + int32x4_t t0 = vreinterpretq_s32_s64( + vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1))); + int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64( + vzip2q_s64(vreinterpretq_s64_s32(EE0), + vreinterpretq_s64_s32(EE1)))); + + EEEi / 2 = vaddq_s32(t0, t1); + EEOi / 2 = vsubq_s32(t0, t1); + } + + for (int i = 0; i < line; i += 4) + { + for (int k = 1; k < 16; k += 2) + { + int16x8_t c0_c4 = vld1q_s16(&g_t16k0); + + int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 0); + int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 1); + int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 2); + int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 3); + + int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1)); + int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3)); + int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift); + vst1_s16(dst + k * line, res); + } + + for (int k = 2; k < 16; k += 4) + { + int32x4_t c0 = x265_vld1sh_s32(&g_t16k0); + + int32x4_t t0 = vmulq_s32(c0, EOi + 0); + int32x4_t t1 = vmulq_s32(c0, EOi + 1); + int32x4_t t2 = vmulq_s32(c0, EOi + 2); + int32x4_t t3 = vmulq_s32(c0, EOi + 3); + int32x4_t t = vpaddq_s32(vpaddq_s32(t0, t1), vpaddq_s32(t2, t3)); + + int16x4_t res = vrshrn_n_s32(t, shift); + vst1_s16(dst + k * line, res); + } + + int32x4_t c0 = vld1q_s32(t8_even0); + int32x4_t c4 = vld1q_s32(t8_even1); + int32x4_t c8 = vld1q_s32(t8_even2); + int32x4_t c12 = vld1q_s32(t8_even3); + + int32x4_t t0 = vpaddq_s32(EEEi / 2 + 0, EEEi / 2 + 1); + 
int32x4_t t1 = vmulq_s32(c0, t0); + int16x4_t res0 = vrshrn_n_s32(t1, shift); + vst1_s16(dst + 0 * line, res0); + + int32x4_t t2 = vmulq_s32(c4, EEOi / 2 + 0); + int32x4_t t3 = vmulq_s32(c4, EEOi / 2 + 1); + int16x4_t res4 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift); + vst1_s16(dst + 4 * line, res4); + + int32x4_t t4 = vmulq_s32(c8, EEEi / 2 + 0); + int32x4_t t5 = vmulq_s32(c8, EEEi / 2 + 1); + int16x4_t res8 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift); + vst1_s16(dst + 8 * line, res8); + + int32x4_t t6 = vmulq_s32(c12, EEOi / 2 + 0); + int32x4_t t7 = vmulq_s32(c12, EEOi / 2 + 1); + int16x4_t res12 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift); + vst1_s16(dst + 12 * line, res12); + + dst += 4; + } +} + +static inline void pass1Butterfly32_sve(const int16_t *src, int16_t *dst, intptr_t srcStride) { + const int shift = 4 + X265_DEPTH - 8; const int line = 32; int16x8_t Oline2; @@ -252,11 +357,11 @@ for (int i = 0; i < line; i += 2) { - int16x8x4_t in_lo = vld1q_s16_x4(src + (i + 0) * line); + int16x8x4_t in_lo = vld1q_s16_x4(src + (i + 0) * srcStride); in_lo.val2 = rev16(in_lo.val2); in_lo.val3 = rev16(in_lo.val3); - int16x8x4_t in_hi = vld1q_s16_x4(src + (i + 1) * line); + int16x8x4_t in_hi = vld1q_s16_x4(src + (i + 1) * srcStride); in_hi.val2 = rev16(in_hi.val2); in_hi.val3 = rev16(in_hi.val3); @@ -424,6 +529,193 @@ } } +static inline void pass2Butterfly32_sve(const int16_t *src, int16_t *dst) +{ + const int shift = 11; + const int line = 32; + + int16x8_t Oline2; + int32x4_t EOline2; + int32x4_t EEOline; + int32x4_t EEEEline / 2; + int32x4_t EEEOline / 2; + + for (int i = 0; i < line; i += 2) + { + int16x8x4_t in_lo = vld1q_s16_x4(src + (i + 0) * line); + in_lo.val2 = rev16(in_lo.val2); + in_lo.val3 = rev16(in_lo.val3); + + int16x8x4_t in_hi = vld1q_s16_x4(src + (i + 1) * line); + in_hi.val2 = rev16(in_hi.val2); + in_hi.val3 = rev16(in_hi.val3); + + int32x4_t E04; + E00 = vaddl_s16(vget_low_s16(in_lo.val0), + vget_low_s16(in_lo.val3)); + E01 = vaddl_s16(vget_high_s16(in_lo.val0), + vget_high_s16(in_lo.val3)); + E02 = vaddl_s16(vget_low_s16(in_lo.val1), + vget_low_s16(in_lo.val2)); + E03 = vaddl_s16(vget_high_s16(in_lo.val1), + vget_high_s16(in_lo.val2)); + + int32x4_t E14; + E10 = vaddl_s16(vget_low_s16(in_hi.val0), + vget_low_s16(in_hi.val3)); + E11 = vaddl_s16(vget_high_s16(in_hi.val0), + vget_high_s16(in_hi.val3)); + E12 = vaddl_s16(vget_low_s16(in_hi.val1),
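The dct-prim-sve.cpp rework above drops the shift template parameter in favour of two explicit passes: pass 1 reads rows straight from the residual block (hence the new srcStride argument, with a bit-depth-dependent shift of 3 + X265_DEPTH - 8 for the 16-point case), while pass 2 runs on the first pass's output with a fixed shift of 10. Both passes share the even/odd butterfly decomposition visible in the E/EO/EEE/EEO temporaries; a scalar sketch of the basic split (simplified, omitting the further EEE/EEO sub-split the SVE code performs):

// For one 16-sample row s[], the even half E[] feeds the even-indexed
// DCT outputs and the odd half O[] the odd-indexed ones, halving the
// number of multiplies against the g_t16 coefficient table.
static void butterfly16_split_sketch(const short s[16],
                                     int E[8], int O[8])
{
    for (int k = 0; k < 8; k++)
    {
        E[k] = s[k] + s[15 - k];
        O[k] = s[k] - s[15 - k];
    }
}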
x265-4.1.tar/source/common/aarch64/dct-prim.cpp -> x265-4.2.tar/source/common/aarch64/dct-prim.cpp
Changed
@@ -21,22 +21,50 @@ { using namespace X265_NS; -static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3) +static inline void transpose_4x4_s16(int16x4_t &s0, int16x4_t &s1, int16x4_t &s2, int16x4_t &s3) { - int32x2_t s0, s1, s2, s3; + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); - s0 = vtrn1_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2)); - s1 = vtrn1_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3)); - s2 = vtrn2_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2)); - s3 = vtrn2_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3)); + int16x8_t s02 = vzip1q_s16(s0q, s2q); + int16x8_t s13 = vzip1q_s16(s1q, s3q); - x0 = vtrn1_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1)); - x1 = vtrn2_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1)); - x2 = vtrn1_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3)); - x3 = vtrn2_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3)); -} + int16x8x2_t s0123 = vzipq_s16(s02, s13); + s0 = vget_low_s16(s0123.val0); + s1 = vget_high_s16(s0123.val0); + s2 = vget_low_s16(s0123.val1); + s3 = vget_high_s16(s0123.val1); +} +static inline void transpose_4x8_s16(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3, + int16x4_t s4, int16x4_t s5, int16x4_t s6, int16x4_t s7, + int16x8_t &d0, int16x8_t &d1, int16x8_t &d2, int16x8_t &d3) +{ + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + int16x8_t s4q = vcombine_s16(s4, vdup_n_s16(0)); + int16x8_t s5q = vcombine_s16(s5, vdup_n_s16(0)); + int16x8_t s6q = vcombine_s16(s6, vdup_n_s16(0)); + int16x8_t s7q = vcombine_s16(s7, vdup_n_s16(0)); + + int16x8_t s04 = vzip1q_s16(s0q, s4q); + int16x8_t s15 = vzip1q_s16(s1q, s5q); + int16x8_t s26 = vzip1q_s16(s2q, s6q); + int16x8_t s37 = vzip1q_s16(s3q, s7q); + + int16x8x2_t s0246 = vzipq_s16(s04, s26); + int16x8x2_t s1357 = vzipq_s16(s15, s37); + + d0 = vzip1q_s16(s0246.val0, s1357.val0); + d1 = vzip2q_s16(s0246.val0, s1357.val0); + d2 = vzip1q_s16(s0246.val1, s1357.val1); + d3 = vzip2q_s16(s0246.val1, s1357.val1); +} static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/) @@ -228,6 +256,123 @@ } template<int shift> +static inline void fastForwardDst4_neon(const int16_t *src, int16_t *dst) +{ + int16x4_t s0 = vld1_s16(src + 0); + int16x4_t s1 = vld1_s16(src + 4); + int16x4_t s2 = vld1_s16(src + 8); + int16x4_t s3 = vld1_s16(src + 12); + + transpose_4x4_s16(s0, s1, s2, s3); + + int32x4_t c0 = vaddl_s16(s0, s3); + int32x4_t c1 = vaddl_s16(s1, s3); + int32x4_t c2 = vsubl_s16(s0, s1); + int32x4_t c3 = vmull_n_s16(s2, 74); + + int32x4_t t0 = vmlaq_n_s32(c3, c0, 29); + t0 = vmlaq_n_s32(t0, c1, 55); + + int32x4_t t1 = vaddl_s16(s0, s1); + t1 = vsubw_s16(t1, s3); + t1 = vmulq_n_s32(t1, 74); + + int32x4_t t2 = vmulq_n_s32(c2, 29); + t2 = vmlaq_n_s32(t2, c0, 55); + t2 = vsubq_s32(t2, c3); + + int32x4_t t3 = vmlaq_n_s32(c3, c2, 55); + t3 = vmlsq_n_s32(t3, c1, 29); + + int16x4_t d0 = vrshrn_n_s32(t0, shift); + int16x4_t d1 = vrshrn_n_s32(t1, shift); + int16x4_t d2 = vrshrn_n_s32(t2, shift); + int16x4_t d3 = vrshrn_n_s32(t3, shift); + + vst1_s16(dst + 0, d0); + vst1_s16(dst + 4, 
d1); + vst1_s16(dst + 8, d2); + vst1_s16(dst + 12, d3); +} + +template<int shift> +static inline void inverseDst4_neon(const int16_t *src, int16_t *dst, intptr_t dstStride) +{ + int16x4_t s0 = vld1_s16(src + 0); + int16x4_t s1 = vld1_s16(src + 4); + int16x4_t s2 = vld1_s16(src + 8); + int16x4_t s3 = vld1_s16(src + 12); + + int32x4_t c0 = vaddl_s16(s0, s2); + int32x4_t c1 = vaddl_s16(s2, s3); + int32x4_t c2 = vsubl_s16(s0, s3); + int32x4_t c3 = vmull_n_s16(s1, 74); + + int32x4_t t0 = vmlaq_n_s32(c3, c0, 29); + t0 = vmlaq_n_s32(t0, c1, 55); + + int32x4_t t1 = vmlaq_n_s32(c3, c2, 55); + t1 = vmlsq_n_s32(t1, c1, 29); + + int32x4_t t2 = vaddl_s16(s0, s3); + t2 = vsubw_s16(t2, s2); + t2 = vmulq_n_s32(t2, 74); + + int32x4_t t3 = vmulq_n_s32(c0, 55); + t3 = vmlaq_n_s32(t3, c2, 29); + t3 = vsubq_s32(t3, c3); + + int16x4_t d0 = vqrshrn_n_s32(t0, shift); + int16x4_t d1 = vqrshrn_n_s32(t1, shift); + int16x4_t d2 = vqrshrn_n_s32(t2, shift); + int16x4_t d3 = vqrshrn_n_s32(t3, shift); + + transpose_4x4_s16(d0, d1, d2, d3); + + vst1_s16(dst + 0 * dstStride, d0); + vst1_s16(dst + 1 * dstStride, d1); + vst1_s16(dst + 2 * dstStride, d2); + vst1_s16(dst + 3 * dstStride, d3); +} + +template<int shift> +static inline void partialButterfly4_neon(const int16_t *src, int16_t *dst) +{ + int16x4_t s0 = vld1_s16(src + 0); + int16x4_t s1 = vld1_s16(src + 4); + int16x4_t s2 = vld1_s16(src + 8); + int16x4_t s3 = vld1_s16(src + 12); + + transpose_4x4_s16(s0, s1, s2, s3); + + int32x4_t E2, O2; + E0 = vaddl_s16(s0, s3); + O0 = vsubl_s16(s0, s3); + E1 = vaddl_s16(s1, s2); + O1 = vsubl_s16(s1, s2); + + // Multiply and accumulate with g_t4 constants. + int32x4_t t0 = vaddq_s32(E0, E1); + t0 = vmulq_n_s32(t0, 64); + int32x4_t t1 = vmulq_n_s32(O0, 83); + t1 = vmlaq_n_s32(t1, O1, 36); + int32x4_t t2 = vsubq_s32(E0, E1); + t2 = vmulq_n_s32(t2, 64); + int32x4_t t3 = vmulq_n_s32(O0, 36); + t3 = vmlaq_n_s32(t3, O1, -83); + + int16x4_t d0 = vrshrn_n_s32(t0, shift); + int16x4_t d1 = vrshrn_n_s32(t1, shift); + int16x4_t d2 = vrshrn_n_s32(t2, shift); + int16x4_t d3 = vrshrn_n_s32(t3, shift); + + vst1_s16(dst + 0, d0); + vst1_s16(dst + 4, d1); + vst1_s16(dst + 8, d2); + vst1_s16(dst + 12, d3); +} + +template<int shift> static inline void partialButterfly16_neon(const int16_t *src, int16_t *dst) { const int line = 16; @@ -620,385 +765,997 @@ } } -static void partialButterflyInverse4(const int16_t *src, int16_t *dst, int shift, int line) +template<int shift> +static inline void partialButterflyInverse4_neon(const int16_t *src, int16_t *dst, + intptr_t dstStride) { - int j; - int E2, O2; - int add = 1 << (shift - 1); + int16x4_t s0 = vld1_s16(src + 0); + int16x4_t s1 = vld1_s16(src + 4);
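Alongside the transpose rewrite, dct-prim.cpp gains Neon ports of the 4-point DST and DCT. The partialButterfly4_neon shown above bakes the g_t4 coefficient rows into immediate multiplies; its per-column core is equivalent to this scalar sketch:

// 4-point forward HEVC DCT core: the even part uses the +/-64 rows of
// g_t4, the odd part the {83, 36} pair (final rounding shift omitted).
static void dct4_core_sketch(const short s[4], int t[4])
{
    int E0 = s[0] + s[3], O0 = s[0] - s[3];
    int E1 = s[1] + s[2], O1 = s[1] - s[2];

    t[0] = 64 * (E0 + E1);
    t[1] = 83 * O0 + 36 * O1;
    t[2] = 64 * (E0 - E1);
    t[3] = 36 * O0 - 83 * O1;
}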
x265-4.1.tar/source/common/aarch64/dct-prim.h -> x265-4.2.tar/source/common/aarch64/dct-prim.h
Changed
@@ -5,7 +5,7 @@
 #include "common.h"
 #include "primitives.h"
 #include "contexts.h" // costCoeffNxN_c
-#include "threading.h" // CLZ
+#include "threading.h" // BSR
 #include <arm_neon.h>

 namespace X265_NS
x265-4.1.tar/source/common/aarch64/filter-neon-dotprod.cpp -> x265-4.2.tar/source/common/aarch64/filter-neon-dotprod.cpp
Changed
@@ -21,6 +21,7 @@ * For more information, contact us at license @ x265.com. *****************************************************************************/ +#include "filter-prim.h" #include "filter-neon-dotprod.h" #if !HIGH_BIT_DEPTH @@ -43,6 +44,13 @@ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; +// This is to use with vtbl2q_s32_s16. +// Extract the middle two bytes from each 32-bit element in a vector, using these byte +// indices. +static const uint8_t vert_shr_tbl16 = { + 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 +}; + uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter, const int32x4_t constant, const uint8x16x3_t tbl) { @@ -972,24 +980,168 @@ } } +template<int coeffIdx, int coeffIdy, int width, int height> +void interp8_hv_pp_dotprod(const pixel *src, intptr_t srcStride, pixel *dst, + intptr_t dstStride) +{ + const int v_shift = IF_FILTER_PREC + IF_INTERNAL_PREC - X265_DEPTH; + // Subtract 8 from shift since we account for that in table lookups. + const int v_shift_offset = v_shift - 8; + const int16x8_t v_filter = vld1q_s16(X265_NS::g_lumaFiltercoeffIdy); + const int32x4_t v_offset = vdupq_n_s32((1 << (v_shift - 1)) + + (IF_INTERNAL_OFFS << IF_FILTER_PREC)); + + src -= 3 * srcStride + 3; + + const uint8x16x3_t tbl = vld1q_u8_x3(dotprod_permute_tbl); + const int8x8_t filter = vmovn_s16(vld1q_s16(g_lumaFiltercoeffIdx)); + const uint8x16_t shr_tbl = vld1q_u8(vert_shr_tbl); + + uint8x16_t h_s11; + load_u8x16xn<7>(src, srcStride, h_s); + + int16x4_t v_s11; + v_s0 = filter8_4_ps(h_s0, filter, tbl); + v_s1 = filter8_4_ps(h_s1, filter, tbl); + v_s2 = filter8_4_ps(h_s2, filter, tbl); + v_s3 = filter8_4_ps(h_s3, filter, tbl); + v_s4 = filter8_4_ps(h_s4, filter, tbl); + v_s5 = filter8_4_ps(h_s5, filter, tbl); + v_s6 = filter8_4_ps(h_s6, filter, tbl); + + src += 7 * srcStride; + + for (int row = 0; row < height; row += 4) + { + load_u8x16xn<4>(src, srcStride, h_s + 7); + v_s7 = filter8_4_ps(h_s7, filter, tbl); + v_s8 = filter8_4_ps(h_s8, filter, tbl); + v_s9 = filter8_4_ps(h_s9, filter, tbl); + v_s10 = filter8_4_ps(h_s10, filter, tbl); + + int32x4_t sum4; + filter8_s16x4<coeffIdy>(v_s + 0, v_filter, v_offset, sum0); + filter8_s16x4<coeffIdy>(v_s + 1, v_filter, v_offset, sum1); + filter8_s16x4<coeffIdy>(v_s + 2, v_filter, v_offset, sum2); + filter8_s16x4<coeffIdy>(v_s + 3, v_filter, v_offset, sum3); + + int16x8_t sum_s164; + sum_s160 = vtbl2q_s32_s16(sum0, sum1, shr_tbl); + sum_s161 = vtbl2q_s32_s16(sum2, sum3, shr_tbl); + + uint8x8_t res2; + res0 = vqshrun_n_s16(sum_s160, v_shift_offset); + res1 = vqshrun_n_s16(sum_s161, v_shift_offset); + + store_u8x4_strided_xN<4>(dst + 0 * dstStride, dstStride, res); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + + src += 4 * srcStride; + dst += 4 * dstStride; + } +} + // Declaration for use in interp_hv_pp_dotprod(). template<int N, int width, int height> void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, uint8_t *dst, intptr_t dstStride, int coeffIdx); -// Implementation of luma_hvpp, using Neon DotProd implementation for the -// horizontal part, and Armv8.0 Neon implementation for the vertical part. template<int width, int height> void interp_hv_pp_dotprod(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) { +// Use the merged hv paths with Clang only as performance with GCC is worse than the +// existing approach of doing horizontal and vertical interpolation separately. 
+#ifdef __clang__ + if (width == 4 && height <= 8) + { + switch (idxX) + { + case 1: + { + switch (idxY) + { + case 1: + return interp8_hv_pp_dotprod<1, 1, width, height>(src, srcStride, dst, + dstStride); + case 2: + return interp8_hv_pp_dotprod<1, 2, width, height>(src, srcStride, dst, + dstStride); + case 3: + return interp8_hv_pp_dotprod<1, 3, width, height>(src, srcStride, dst, + dstStride); + } + + break; + } + case 2: + { + switch (idxY) + { + case 1: + return interp8_hv_pp_dotprod<2, 1, width, height>(src, srcStride, dst, + dstStride); + case 2: + return interp8_hv_pp_dotprod<2, 2, width, height>(src, srcStride, dst, + dstStride); + case 3: + return interp8_hv_pp_dotprod<2, 3, width, height>(src, srcStride, dst, + dstStride); + } + + break; + } + case 3: + { + switch (idxY) + { + case 1: + return interp8_hv_pp_dotprod<3, 1, width, height>(src, srcStride, dst, + dstStride); + case 2: + return interp8_hv_pp_dotprod<3, 2, width, height>(src, srcStride, dst, + dstStride); + case 3: + return interp8_hv_pp_dotprod<3, 3, width, height>(src, srcStride, dst, + dstStride); + } + + break; + } + } + } + else + { + // Implementation of luma_hvpp, using Neon DotProd implementation for the + // horizontal part, and Armv8.0 Neon implementation for the vertical part. + const int N_TAPS = 8; + + ALIGN_VAR_32(int16_t, immedwidth * (height + N_TAPS - 1)); + + interp8_horiz_ps_dotprod<width, height>(src, srcStride, immed, width, idxX, + 1); + interp_vert_sp_neon<N_TAPS, width, height>(immed + (N_TAPS / 2 - 1) * width, + width, dst, dstStride, idxY); + } +#else // __clang__ + // Implementation of luma_hvpp, using Neon DotProd implementation for the + // horizontal part, and Armv8.0 Neon implementation for the vertical part. const int N_TAPS = 8; + ALIGN_VAR_32(int16_t, immedwidth * (height + N_TAPS - 1)); interp8_horiz_ps_dotprod<width, height>(src, srcStride, immed, width, idxX, 1); interp_vert_sp_neon<N_TAPS, width, height>(immed + (N_TAPS / 2 - 1) * width, width, dst, dstStride, idxY); +#endif // __clang__ } #define LUMA_DOTPROD(W, H) \
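Two details in this file are worth a note. First, the merged horizontal+vertical paths are gated on __clang__ because, per the comment added above, GCC generates slower code for them than for the existing split two-stage approach. Second, the new vert_shr_tbl turns a shift-right-by-8-and-narrow into a single TBL lookup: selecting bytes 1 and 2 of each little-endian 32-bit lane is exactly vshrn_n_s32(x, 8), which is why the vertical shift is applied as v_shift - 8 afterwards. A standalone equivalence sketch (mirroring the vtbl2q_s32_s16 helper this revision adds to filter-prim.h):

#include <arm_neon.h>

// Equivalent to vcombine_s16(vshrn_n_s32(a, 8), vshrn_n_s32(b, 8)) on
// little-endian AArch64: byte indices {1, 2} of each 32-bit lane are the
// middle 16 bits, i.e. the lane shifted right by 8 and narrowed.
static int16x8_t shrn8_via_tbl(int32x4_t a, int32x4_t b)
{
    static const uint8_t shr_tbl[16] = {
        1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
    };
    uint8x16x2_t ab;
    ab.val[0] = vreinterpretq_u8_s32(a);
    ab.val[1] = vreinterpretq_u8_s32(b);
    return vreinterpretq_s16_u8(vqtbl2q_u8(ab, vld1q_u8(shr_tbl)));
}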
x265-4.1.tar/source/common/aarch64/filter-neon-i8mm.cpp -> x265-4.2.tar/source/common/aarch64/filter-neon-i8mm.cpp
Changed
@@ -23,6 +23,7 @@ #if defined(HAVE_NEON_I8MM) #include "filter-neon-i8mm.h" +#include "filter-prim.h" #if !HIGH_BIT_DEPTH #include "mem-neon.h" @@ -37,16 +38,17 @@ }; static const uint8_t matmul_permute_tbl232 = { - // Permute for luma filter 3. + // Permute for luma filter 1. { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }, - // Permute for luma filter 1. + // Permute for luma filter 2 and 3. { 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12, 13, 14 } }; -static const int8_t matmul_luma_filter216 = { +static const int8_t matmul_luma_filter316 = { { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 }, + { 4, -11, 40, 40, -11, 4, -1, 0, 0, 4, -11, 40, 40, -11, 4, -1 }, { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 } }; @@ -59,64 +61,14 @@ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter, - const uint8x16x3_t tbl) -{ - // Permute input samples for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val2); - - int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); - dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); - int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); - dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1); - - // Narrow and combine. - int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), - vmovn_s32(dotprod_hi)); - return vqrshrun_n_s16(dotprod, IF_FILTER_PREC); -} - -void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl, - uint8x16_t *d) -{ - // Permute input samples for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - d0 = vqtbl1q_u8(vcombine_u8(samples0, vdup_n_u8(0)), tbl.val0); - d1 = vqtbl1q_u8(vcombine_u8(samples1, vdup_n_u8(0)), tbl.val0); - d2 = vqtbl1q_u8(vcombine_u8(samples2, vdup_n_u8(0)), tbl.val0); - d3 = vqtbl1q_u8(vcombine_u8(samples3, vdup_n_u8(0)), tbl.val0); -} - -uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter, - const uint8x16x3_t tbl, uint8x16_t &perm_s0) -{ - // Permute input samples for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // Already in perm_s0. - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val2); - - int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); - dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); - int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); - dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1); - - // Save for re-use in next iteration. - perm_s0 = perm_s2; - - // Narrow and combine. - int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), - vmovn_s32(dotprod_hi)); - return vqrshrun_n_s16(dotprod, IF_FILTER_PREC); -} +// This is to use with vtbl2q_s32_s16. +// Extract the middle two bytes from each 32-bit element in a vector, using these byte +// indices. 
+static const uint8_t vert_shr_tbl16 = { + 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 +}; +template<bool coeff2> uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter, const uint8x16x2_t tbl) { @@ -129,73 +81,19 @@ // Narrow and combine. int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi)); - return vqrshrun_n_s16(matmul, IF_FILTER_PREC); -} - -int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter, - const int16x8_t constant, const uint8x16x3_t tbl) -{ - // Permute input samples for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); - - int32x4_t dotprod = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); - dotprod = vusdotq_lane_s32(dotprod, perm_s1, filter, 1); - - // Narrow. - return vadd_s16(vmovn_s32(dotprod), vget_low_s16(constant)); -} - -int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter, - const int16x8_t constant, const uint8x16x3_t tbl) -{ - // Permute input samples for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val2); - - int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); - dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); - int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); - dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1); - - // Narrow and combine. - int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), - vmovn_s32(dotprod_hi)); - return vaddq_s16(dotprod, constant); -} -int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter, - const int16x8_t constant, - const uint8x16x3_t tbl, uint8x16_t &perm_s0) -{ - // Permute input samples for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // Already in perm_s0. - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val2); - - int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); - dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); - int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); - dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1); - - // Save for re-use in next iteration. - perm_s0 = perm_s2; + if (coeff2) + { + // Substract the source elements corresponding to filter tap value -1, + // which weren't included in the initial matrix multiplication. + matmul = vreinterpretq_s16_u16(vsubw_u8(vreinterpretq_u16_s16(matmul), + vget_low_u8(samples))); + } - // Narrow and combine. - int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), - vmovn_s32(dotprod_hi)); - return vaddq_s16(dotprod, constant); + return vqrshrun_n_s16(matmul, IF_FILTER_PREC); } +template<bool coeff2> int16x8_t inline filter8_8_ps_matmul(uint8x16_t samples, const int8x16_t filter, const int16x8_t constant, const uint8x16x2_t tbl) @@ -209,9 +107,21 @@ // Narrow and combine. 
     int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi));
-    return vaddq_s16(matmul, constant);
+
+    int16x8_t offset_matmul = constant;
+
+    if (coeff2)
+    {
+        // Subtract the source elements corresponding to filter tap value -1,
+        // which weren't included in the initial matrix multiplication.
+        offset_matmul = vreinterpretq_s16_u16(
+            vsubw_u8(vreinterpretq_u16_s16(offset_matmul), vget_low_u8(samples)));
+    }
+
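The new coeff2 template parameter exists because the middle luma filter { -1, 4, -11, 40, 40, -11, 4, -1 } has eight non-zero taps, one more than the shifted layout shared with filters 1 and 3 can hold: matmul_luma_filter stores it without the leading -1, and the missing -1 * s[0] term is restored afterwards by subtracting the raw source bytes, as the comment above describes. In scalar terms, per output sample (a sketch; the tap layout is taken from the matmul_luma_filter table above):

// What the USMMLA path computes for filter index 2, per output position.
static int luma_filter2_sketch(const unsigned char s[8])
{
    static const signed char taps7[7] = { 4, -11, 40, 40, -11, 4, -1 };
    int acc = 0;
    for (int k = 0; k < 7; k++)
        acc += taps7[k] * s[k + 1];   // matrix-multiply part, samples s[1..7]
    return acc - s[0];                // fix-up: the dropped -1 tap on s[0]
}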
x265-4.2.tar/source/common/aarch64/filter-prim-sve.cpp
Added
@@ -0,0 +1,1057 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.more@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "filter-prim-sve.h" +#include "mem-neon.h" +#include "neon-sve-bridge.h" + +#include <arm_neon.h> + +#if HIGH_BIT_DEPTH +#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH)) + +static const uint16_t dotprod_h_permute_tbl32 = { + // clang-format off + 0, 1, 2, 3, 1, 2, 3, 4, + 2, 3, 4, 5, 3, 4, 5, 6, + 3, 2, 1, 0, 4, 3, 2, 1, + 5, 4, 3, 2, 6, 5, 4, 3, + // clang-format on +}; + +static const uint8_t dotprod_v_permute_tbl80 = { + 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19, + 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23, + 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27, + 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31, +}; + +template<bool coeff2> +void inline filter8_u16x4(const uint16x8_t *s, uint16x4_t &d, int16x8_t filter, + uint16x4_t maxVal) +{ + if (coeff2) + { + int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s0, s1)); + int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s2, s3)); + + int64x2_t sum_lo = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, filter, 0); + int64x2_t sum_hi = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0); + + int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi)); + + d = vqrshrun_n_s32(sum, IF_FILTER_PREC); + d = vmin_u16(d, maxVal); + } + else + { + int64x2_t sum_lo = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s0), filter, 0); + int64x2_t sum_hi = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s2), filter, 0); + + sum_lo = x265_sdotq_lane_s16(sum_lo, vreinterpretq_s16_u16(s1), filter, 1); + sum_hi = x265_sdotq_lane_s16(sum_hi, vreinterpretq_s16_u16(s3), filter, 1); + + int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi)); + + d = vqrshrun_n_s32(sum, IF_FILTER_PREC); + d = vmin_u16(d, maxVal); + } +} + +template<bool coeff2> +void inline filter8_u16x8(uint16x8_t *s, uint16x8_t &d, int16x8_t filter, + uint16x8_t maxVal) +{ + if (coeff2) + { + int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s0, s1)); + int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s2, s3)); + int16x8_t sum45 = vreinterpretq_s16_u16(vaddq_u16(s4, s5)); + int16x8_t sum67 = vreinterpretq_s16_u16(vaddq_u16(s6, s7)); + + int64x2_t sum0 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, filter, 0); + int64x2_t sum1 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0); + int64x2_t sum2 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum45, 
filter, 0); + int64x2_t sum3 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum67, filter, 0); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3)); + + uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC); + uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC); + + d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal); + } + else + { + int64x2_t sum0 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s0), filter, 0); + int64x2_t sum1 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s1), filter, 0); + int64x2_t sum2 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s2), filter, 0); + int64x2_t sum3 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s3), filter, 0); + + sum0 = x265_sdotq_lane_s16(sum0, vreinterpretq_s16_u16(s4), filter, 1); + sum1 = x265_sdotq_lane_s16(sum1, vreinterpretq_s16_u16(s5), filter, 1); + sum2 = x265_sdotq_lane_s16(sum2, vreinterpretq_s16_u16(s6), filter, 1); + sum3 = x265_sdotq_lane_s16(sum3, vreinterpretq_s16_u16(s7), filter, 1); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum2)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum1), vmovn_s64(sum3)); + + uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC); + uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC); + + d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal); + } +} + +template<bool coeff2> +void inline setup_s_hpp_x4(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t *idx) +{ + if (coeff2) + { + d0 = x265_tblq_u16(s0, idx0); + d1 = x265_tblq_u16(s1, idx2); + d2 = x265_tblq_u16(s0, idx1); + d3 = x265_tblq_u16(s1, idx3); + } + else + { + d0 = x265_tblq_u16(s0, idx0); + d1 = x265_tblq_u16(s1, idx0); + d2 = x265_tblq_u16(s0, idx1); + d3 = x265_tblq_u16(s1, idx1); + } +} + +template<bool coeff2> +void inline setup_s_hpp_x8(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t s2, + uint16x8_t *idx) +{ + if (coeff2) + { + d0 = x265_tblq_u16(s0, idx0); + d1 = x265_tblq_u16(s1, idx2); + d2 = x265_tblq_u16(s0, idx1); + d3 = x265_tblq_u16(s1, idx3); + d4 = x265_tblq_u16(s1, idx0); + d5 = x265_tblq_u16(s2, idx2); + d6 = x265_tblq_u16(s1, idx1); + d7 = x265_tblq_u16(s2, idx3); + } + else + { + d0 = x265_tblq_u16(s0, idx0); + d1 = x265_tblq_u16(s1, idx0); + d2 = x265_tblq_u16(s0, idx1); + d3 = x265_tblq_u16(s1, idx1); + d4 = d1; + d5 = x265_tblq_u16(s2, idx0); + d6 = d3; + d7 = x265_tblq_u16(s2, idx1); + } +} + +template<bool coeff2, int width, int height> +void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride, + pixel *dst, intptr_t dstStride, int coeffIdx) +{ + const int N_TAPS = 8; + const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1); + const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFiltercoeffIdx); + uint16x8_t idx4; + + idx0 = vld1q_u16(dotprod_h_permute_tbl + 0); + idx1 = vld1q_u16(dotprod_h_permute_tbl + 8); + if (coeff2) + { + idx2 = vld1q_u16(dotprod_h_permute_tbl + 16); + idx3 = vld1q_u16(dotprod_h_permute_tbl + 24); + } + + src -= N_TAPS / 2 - 1; + + for (int row = 0; row < height; row++)
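This new SVE filter file takes a different route for high bit depth: 16-bit samples are multiplied with the filter via SVE SDOT instructions that accumulate into 64-bit lanes (the x265_sdotq_lane_s16 bridge), and the symmetric coeff2 filter again pre-pairs mirrored samples so a single dot product covers four taps per lane. A scalar model of the 64-bit-accumulator idea (a sketch; the exact lane-indexing semantics of the neon-sve-bridge helper are assumed from the code above):

// Each 64-bit lane accumulates four adjacent int16 products, so an
// 8-tap filter needs two dot-product steps, or one when symmetry has
// already folded the taps down to four.
static void sdot_s16x8_sketch(const short a[8], const short b[8],
                              long long acc[2])
{
    for (int lane = 0; lane < 2; lane++)
        for (int k = 0; k < 4; k++)
            acc[lane] += (long long)a[4 * lane + k] * b[4 * lane + k];
}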
x265-4.2.tar/source/common/aarch64/filter-prim-sve.h
Added
@@ -0,0 +1,37 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.more@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_FILTER_PRIM_SVE_H +#define X265_FILTER_PRIM_SVE_H + +#if defined(HAVE_SVE) + +#include "primitives.h" + +namespace X265_NS { +void setupFilterPrimitives_sve(EncoderPrimitives &p); +} + +#endif // defined(HAVE_SVE) + +#endif // X265_FILTER_PRIM_SVE_H
x265-4.1.tar/source/common/aarch64/filter-prim.cpp -> x265-4.2.tar/source/common/aarch64/filter-prim.cpp
Changed
@@ -1,3 +1,29 @@ +/***************************************************************************** + * Copyright (C) 2021-2025 MulticoreWare, Inc + * + * Authors: Liwei Wang <liwei@multicorewareinc.com> + * Jonathan Swinney <jswinney@amazon.com> + * Hari Limaye <hari.limaye@arm.com> + * Gerda Zsejke More <gerdazsejke.more@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + #if HAVE_NEON #include "filter-prim.h" @@ -6,235 +32,228 @@ #include <arm_neon.h> namespace { -void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f, - const int32x4_t c, int32x4_t &d0, int32x4_t &d1) +#if !HIGH_BIT_DEPTH +// This is to use with vtbl2q_s32_s16. +// Extract the middle two bytes from each 32-bit element in a vector, using these byte +// indices. +static const uint8_t vert_shr_tbl16 = { + 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 +}; +#endif + +#if HIGH_BIT_DEPTH +#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH)) +#endif + +template<bool coeff4> +void inline filter4_s16x4_sum(const int16x4_t *s, const int16x4_t f, + const int32x4_t c, int32x4_t &sum) { - if (coeffIdx == 4) + if (coeff4) { // { -4, 36, 36, -4 } - int16x8_t t0 = vaddq_s16(s1, s2); - int16x8_t t1 = vaddq_s16(s0, s3); - d0 = vmlal_n_s16(c, vget_low_s16(t0), 36); - d0 = vmlsl_n_s16(d0, vget_low_s16(t1), 4); + int16x4_t sum03 = vadd_s16(s0, s3); + int16x4_t sum12 = vadd_s16(s1, s2); - d1 = vmlal_n_s16(c, vget_high_s16(t0), 36); - d1 = vmlsl_n_s16(d1, vget_high_s16(t1), 4); + sum = vmlal_n_s16(c, sum12, 9); + sum = vsubw_s16(sum, sum03); } else { - d0 = vmlal_lane_s16(c, vget_low_s16(s0), f, 0); - d0 = vmlal_lane_s16(d0, vget_low_s16(s1), f, 1); - d0 = vmlal_lane_s16(d0, vget_low_s16(s2), f, 2); - d0 = vmlal_lane_s16(d0, vget_low_s16(s3), f, 3); - - d1 = vmlal_lane_s16(c, vget_high_s16(s0), f, 0); - d1 = vmlal_lane_s16(d1, vget_high_s16(s1), f, 1); - d1 = vmlal_lane_s16(d1, vget_high_s16(s2), f, 2); - d1 = vmlal_lane_s16(d1, vget_high_s16(s3), f, 3); + sum = vmlal_lane_s16(c, s0, f, 0); + sum = vmlal_lane_s16(sum, s1, f, 1); + sum = vmlal_lane_s16(sum, s2, f, 2); + sum = vmlal_lane_s16(sum, s3, f, 3); } } -template<int coeffIdx> -void inline filter8_s16x4(const int16x4_t *s, const int32x4_t c, int32x4_t &d) +template<bool coeff4, int shift> +void inline filter4_ss_s16x4(const int16x4_t *s, const int16x4_t f, + const int32x4_t c, int16x4_t &d) { - if (coeffIdx == 1) - { - // { -1, 4, -10, 58, 17, -5, 1, 0 } - d = vsubl_s16(s6, s0); - d = vaddq_s32(d, c); - d = vmlal_n_s16(d, s1, 4); - d = vmlsl_n_s16(d, s2, 10); - d = vmlal_n_s16(d, s3, 58); - d = vmlal_n_s16(d, s4, 17); - d = vmlsl_n_s16(d, s5, 
5); - } - else if (coeffIdx == 2) - { - // { -1, 4, -11, 40, 40, -11, 4, -1 } - int32x4_t t0 = vaddl_s16(s3, s4); - int32x4_t t1 = vaddl_s16(s2, s5); - int32x4_t t2 = vaddl_s16(s1, s6); - int32x4_t t3 = vaddl_s16(s0, s7); - - d = vmlaq_n_s32(c, t0, 40); - d = vmlaq_n_s32(d, t1, -11); - d = vmlaq_n_s32(d, t2, 4); - d = vmlaq_n_s32(d, t3, -1); - } - else - { - // { 0, 1, -5, 17, 58, -10, 4, -1 } - d = vsubl_s16(s1, s7); - d = vaddq_s32(d, c); - d = vmlal_n_s16(d, s6, 4); - d = vmlsl_n_s16(d, s5, 10); - d = vmlal_n_s16(d, s4, 58); - d = vmlal_n_s16(d, s3, 17); - d = vmlsl_n_s16(d, s2, 5); - } + int32x4_t sum; + + filter4_s16x4_sum<coeff4>(s, f, c, sum); + + // We divided filter values by 4 so subtract 2 from right shift in case of filter + // coefficient 4. + const int shift_offset = coeff4 ? shift - 2 : shift; + + d = vshrn_n_s32(sum, shift_offset); } -template<int coeffIdx> -void inline filter8_s16x8(const int16x8_t *s, const int32x4_t c, int32x4_t &d0, - int32x4_t &d1) +template<bool coeff4, int shift> +void inline filter4x2_sp_s16x4(const int16x4_t *s0, const int16x4_t *s1, + const int16x4_t f, const int32x4_t c, + const uint8x16_t shr_tbl, uint8x8_t &d) { - if (coeffIdx == 1) - { - // { -1, 4, -10, 58, 17, -5, 1, 0 } - d0 = vsubl_s16(vget_low_s16(s6), vget_low_s16(s0)); - d0 = vaddq_s32(d0, c); - d0 = vmlal_n_s16(d0, vget_low_s16(s1), 4); - d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 10); - d0 = vmlal_n_s16(d0, vget_low_s16(s3), 58); - d0 = vmlal_n_s16(d0, vget_low_s16(s4), 17); - d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 5); - - d1 = vsubl_s16(vget_high_s16(s6), vget_high_s16(s0)); - d1 = vaddq_s32(d1, c); - d1 = vmlal_n_s16(d1, vget_high_s16(s1), 4); - d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 10); - d1 = vmlal_n_s16(d1, vget_high_s16(s3), 58); - d1 = vmlal_n_s16(d1, vget_high_s16(s4), 17); - d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 5); - } - else if (coeffIdx == 2) + int32x4_t sum0, sum1; + + filter4_s16x4_sum<coeff4>(s0, f, c, sum0); + filter4_s16x4_sum<coeff4>(s1, f, c, sum1); + int16x8_t sum = vtbl2q_s32_s16(sum0, sum1, shr_tbl); + + // We divided filter values by 4 so subtract 2 from right shift in case of filter + // coefficient 4. + const int shift_offset = coeff4 ? shift - 2 : shift; + + d = vqshrun_n_s16(sum, shift_offset); +} + +template<bool coeff4> +void inline filter4_s16x8_sum(const int16x8_t *s, const int16x4_t f, + const int32x4_t c, int32x4_t &sum_lo, int32x4_t &sum_hi) +{ + if (coeff4) { - // { -1, 4, -11, 40, 40, -11, 4, -1 } - int32x4_t t0 = vaddl_s16(vget_low_s16(s3), vget_low_s16(s4)); - int32x4_t t1 = vaddl_s16(vget_low_s16(s2), vget_low_s16(s5)); - int32x4_t t2 = vaddl_s16(vget_low_s16(s1), vget_low_s16(s6)); - int32x4_t t3 = vaddl_s16(vget_low_s16(s0), vget_low_s16(s7)); - - d0 = vmlaq_n_s32(c, t0, 40); - d0 = vmlaq_n_s32(d0, t1, -11); - d0 = vmlaq_n_s32(d0, t2, 4); - d0 = vmlaq_n_s32(d0, t3, -1); - - int32x4_t t4 = vaddl_s16(vget_high_s16(s3), vget_high_s16(s4)); - int32x4_t t5 = vaddl_s16(vget_high_s16(s2), vget_high_s16(s5)); - int32x4_t t6 = vaddl_s16(vget_high_s16(s1), vget_high_s16(s6));
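The refactored 4-tap helpers carry a neat trick: for the chroma filter whose taps are { -4, 36, 36, -4 }, every tap is divided by 4 (giving -1, 9, 9, -1, i.e. one multiply by 9 plus a subtraction), and the lost factor of four is recovered by shrinking the final right shift by 2, exactly as the "subtract 2 from right shift" comments state. The identity, as a scalar sketch (rounding offset omitted for clarity):

// (-4*s0 + 36*s1 + 36*s2 - 4*s3) >> shift
//     == (-s0 + 9*s1 + 9*s2 - s3) >> (shift - 2),
// since dividing each tap by 4 divides the whole sum by 1 << 2.
static int filter4_coeff4_sketch(const short s[4], int shift)
{
    int sum = 9 * (s[1] + s[2]) - (s[0] + s[3]);
    return sum >> (shift - 2);
}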
x265-4.1.tar/source/common/aarch64/filter-prim.h -> x265-4.2.tar/source/common/aarch64/filter-prim.h
Changed
@@ -7,6 +7,116 @@ #include "primitives.h" #include "x265.h" +#include <arm_neon.h> + +static inline int16x8_t vtbl2q_s32_s16(int32x4_t a, int32x4_t b, uint8x16_t index) +{ + uint8x16x2_t ab; + + ab.val0 = vreinterpretq_u8_s32(a); + ab.val1 = vreinterpretq_u8_s32(b); + + return vreinterpretq_s16_u8(vqtbl2q_u8(ab, index)); +} + +template<int coeffIdx> +void inline filter8_s16x4(const int16x4_t *s, const int16x8_t filter, + const int32x4_t c, int32x4_t &d) +{ + if (coeffIdx == 1) + { + d = vsubl_s16(s6, s0); + d = vaddq_s32(d, c); + d = vmlal_laneq_s16(d, s1, filter, 1); + d = vmlal_laneq_s16(d, s2, filter, 2); + d = vmlal_laneq_s16(d, s3, filter, 3); + d = vmlal_laneq_s16(d, s4, filter, 4); + d = vmlal_laneq_s16(d, s5, filter, 5); + } + else if (coeffIdx == 2) + { + int16x4_t sum07 = vadd_s16(s0, s7); + int16x4_t sum16 = vadd_s16(s1, s6); + int16x4_t sum25 = vadd_s16(s2, s5); + int16x4_t sum34 = vadd_s16(s3, s4); + + int32x4_t sum12356 = vmlal_laneq_s16(c, sum16, filter, 1); + sum12356 = vmlal_laneq_s16(sum12356, sum25, filter, 2); + sum12356 = vmlal_laneq_s16(sum12356, sum34, filter, 3); + + d = vsubw_s16(sum12356, sum07); + } + else + { + d = vsubl_s16(s1, s7); + d = vaddq_s32(d, c); + d = vmlal_laneq_s16(d, s2, filter, 2); + d = vmlal_laneq_s16(d, s3, filter, 3); + d = vmlal_laneq_s16(d, s4, filter, 4); + d = vmlal_laneq_s16(d, s5, filter, 5); + d = vmlal_laneq_s16(d, s6, filter, 6); + } +} + +template<int coeffIdx> +void inline filter8_s16x8(const int16x8_t *s, const int16x8_t filter, + const int32x4_t c, int32x4_t &d0, int32x4_t &d1) +{ + if (coeffIdx == 1) + { + d0 = vsubl_s16(vget_low_s16(s6), vget_low_s16(s0)); + d0 = vaddq_s32(d0, c); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s1), filter, 1); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s2), filter, 2); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s3), filter, 3); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s4), filter, 4); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s5), filter, 5); + + d1 = vsubl_s16(vget_high_s16(s6), vget_high_s16(s0)); + d1 = vaddq_s32(d1, c); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s1), filter, 1); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s2), filter, 2); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s3), filter, 3); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s4), filter, 4); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s5), filter, 5); + } + else if (coeffIdx == 2) + { + int16x8_t sum07 = vaddq_s16(s0, s7); + int16x8_t sum16 = vaddq_s16(s1, s6); + int16x8_t sum25 = vaddq_s16(s2, s5); + int16x8_t sum34 = vaddq_s16(s3, s4); + + int32x4_t sum123456_lo = vmlal_laneq_s16(c, vget_low_s16(sum16), filter, 1); + sum123456_lo = vmlal_laneq_s16(sum123456_lo, vget_low_s16(sum25), filter, 2); + sum123456_lo = vmlal_laneq_s16(sum123456_lo, vget_low_s16(sum34), filter, 3); + + int32x4_t sum123456_hi = vmlal_laneq_s16(c, vget_high_s16(sum16), filter, 1); + sum123456_hi = vmlal_laneq_s16(sum123456_hi, vget_high_s16(sum25), filter, 2); + sum123456_hi = vmlal_laneq_s16(sum123456_hi, vget_high_s16(sum34), filter, 3); + + d0 = vsubw_s16(sum123456_lo, vget_low_s16(sum07)); + d1 = vsubw_s16(sum123456_hi, vget_high_s16(sum07)); + } + else + { + int16x8_t sum17 = vsubq_s16(s1, s7); + d0 = vaddw_s16(c, vget_low_s16(sum17)); + d1 = vaddw_s16(c, vget_high_s16(sum17)); + + d0 = vmlal_laneq_s16(d0, vget_low_s16(s2), filter, 2); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s3), filter, 3); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s4), filter, 4); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s5), filter, 5); + d0 = vmlal_laneq_s16(d0, vget_low_s16(s6), filter, 6); + + d1 = 
vmlal_laneq_s16(d1, vget_high_s16(s2), filter, 2); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s3), filter, 3); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s4), filter, 4); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s5), filter, 5); + d1 = vmlal_laneq_s16(d1, vget_high_s16(s6), filter, 6); + } +} namespace X265_NS {
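The filter8_s16x4/filter8_s16x8 helpers moved into this header also exploit coefficient structure: for coeffIdx == 2 the filter is symmetric, so mirrored samples are summed before multiplying (three multiplies instead of six) and both -1 end taps collapse into a single widening subtraction, which is what the sum07/sum16/sum25/sum34 names above reflect. Scalar equivalent (a sketch):

// Symmetric evaluation of { -1, 4, -11, 40, 40, -11, 4, -1 }.
static int filter8_idx2_sketch(const short s[8])
{
    return  4 * (s[1] + s[6])
         - 11 * (s[2] + s[5])
         + 40 * (s[3] + s[4])
         -      (s[0] + s[7]);
}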
x265-4.1.tar/source/common/aarch64/fun-decls.h -> x265-4.2.tar/source/common/aarch64/fun-decls.h
Changed
@@ -162,13 +162,6 @@ FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t); -void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)); - -uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride)); -uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride)); -uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride)); -uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride)); - void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); @@ -228,16 +221,10 @@ int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); -void PFX(weight_pp_neon)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize); uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase); -uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride)); -uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride)); -uint64_t PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride)); -uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride)); - void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); @@ -258,4 +245,5 @@ int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); -int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize); + +int PFX(scanPosLast_sve2_bitperm)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
x265-4.1.tar/source/common/aarch64/intrapred-prim.cpp -> x265-4.2.tar/source/common/aarch64/intrapred-prim.cpp
Changed
@@ -4,6 +4,7 @@ #if HAVE_NEON #include "arm64-utils.h" +#include "mem-neon.h" #include <arm_neon.h> using namespace X265_NS; @@ -399,6 +400,136 @@ } } +#if !HIGH_BIT_DEPTH +void intra_pred_planar4_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix, + int /*dirMode*/, int /*bFilter*/) +{ + const int log2Size = 2; + const int blkSize = 1 << log2Size; + + uint8x16_t src = vld1q_u8(srcPix + 1); + + uint8x8_t above = + vreinterpret_u8_u32(vdup_laneq_u32(vreinterpretq_u32_u8(src), 0)); + + uint8x8_t topRight = vdup_laneq_u8(src, blkSize); + uint8x8_t bottomLeft = vdup_laneq_u8(src, 3 * blkSize); + + const uint8_t c216 = + { + {3, 2, 1, 0, 3, 2, 1, 0, 1, 2, 3, 4, 1, 2, 3, 4}, + {3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3} + }; + + const uint8x16_t c0 = vld1q_u8(c0); + const uint8x16_t c1 = vld1q_u8(c1); + + uint16x8_t t = vmull_u8(topRight, vget_high_u8(c0)); + t = vmlal_u8(t, above, vget_low_u8(c1)); + t = vmlal_u8(t, bottomLeft, vget_high_u8(c1)); + + uint8x8_t index02 = vcreate_u8(0x0A0A0A0A08080808); + uint8x8_t left02 = vqtbl1_u8(src, index02); + uint16x8_t t02 = vmlal_u8(t, left02, vget_low_u8(c0)); + uint8x8_t d02 = vrshrn_n_u16(t02, log2Size + 1); + + uint8x8_t index13 = vcreate_u8(0x0B0B0B0B09090909); + uint8x8_t left13 = vqtbl1_u8(src, index13); + uint16x8_t t13 = vmlal_u8(t, left13, vget_low_u8(c0)); + uint16x8_t sub_bottomLeft_above = vsubl_u8(bottomLeft, above); + t13 = vaddq_u16(t13, sub_bottomLeft_above); + uint8x8_t d13 = vrshrn_n_u16(t13, log2Size + 1); + + store_u8x4_strided_xN<2>(dst + 0 * dstStride, 2 * dstStride, &d02); + store_u8x4_strided_xN<2>(dst + 1 * dstStride, 2 * dstStride, &d13); +} +#endif + +#if !HIGH_BIT_DEPTH +void intra_pred_planar32_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix, + int /*dirMode*/, int /*bFilter*/) +{ + const int log2Size = 5; + const int blkSize = 1 << log2Size; + + const pixel *src0 = srcPix + 1; + const pixel *src1 = srcPix + 2 * blkSize + 1; + + uint8x8_t above0 = vld1_u8(src0 + 0 * 8); + uint8x8_t above1 = vld1_u8(src0 + 1 * 8); + uint8x8_t above2 = vld1_u8(src0 + 2 * 8); + uint8x8_t above3 = vld1_u8(src0 + 3 * 8); + + uint8x8_t topRight = vdup_n_u8(src0blkSize); + uint8x8_t bottomLeft = vdup_n_u8(src1blkSize); + + const uint8_t c232 = + { + {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + }; + + // left constant + const uint8x8_t l0 = vld1_u8(c0 + 0 * 8); + const uint8x8_t l1 = vld1_u8(c0 + 1 * 8); + const uint8x8_t l2 = vld1_u8(c0 + 2 * 8); + const uint8x8_t l3 = vld1_u8(c0 + 3 * 8); + + // topRight constant + const uint8x8_t tR0 = vld1_u8(c1 + 0 * 8); + const uint8x8_t tR1 = vld1_u8(c1 + 1 * 8); + const uint8x8_t tR2 = vld1_u8(c1 + 2 * 8); + const uint8x8_t tR3 = vld1_u8(c1 + 3 * 8); + + const uint16x8_t offset = vdupq_n_u16(blkSize); + const uint16x8_t offset_bottomLeft = vaddw_u8(offset, bottomLeft); + + const uint8x8_t c31 = vdup_n_u8(31); + + uint16x8_t t0 = vmlal_u8(offset_bottomLeft, topRight, tR0); + t0 = vmlal_u8(t0, above0, c31); + + uint16x8_t t1 = vmlal_u8(offset_bottomLeft, topRight, tR1); + t1 = vmlal_u8(t1, above1, c31); + + uint16x8_t t2 = vmlal_u8(offset_bottomLeft, topRight, tR2); + t2 = vmlal_u8(t2, above2, c31); + + uint16x8_t t3 = vmlal_u8(offset_bottomLeft, topRight, tR3); + t3 = vmlal_u8(t3, above3, c31); + + uint16x8_t sub_bottomLeft_above0 = vsubl_u8(bottomLeft, above0); + uint16x8_t 
sub_bottomLeft_above1 = vsubl_u8(bottomLeft, above1); + uint16x8_t sub_bottomLeft_above2 = vsubl_u8(bottomLeft, above2); + uint16x8_t sub_bottomLeft_above3 = vsubl_u8(bottomLeft, above3); + + for (int y = 0; y < 32; y++) + { + uint8x8_t left = vdup_n_u8(src1y); + + uint16x8_t r0 = vmlal_u8(t0, left, l0); + uint16x8_t r1 = vmlal_u8(t1, left, l1); + uint16x8_t r2 = vmlal_u8(t2, left, l2); + uint16x8_t r3 = vmlal_u8(t3, left, l3); + + uint8x8_t d4; + d0 = vshrn_n_u16(r0, log2Size + 1); + d1 = vshrn_n_u16(r1, log2Size + 1); + d2 = vshrn_n_u16(r2, log2Size + 1); + d3 = vshrn_n_u16(r3, log2Size + 1); + + store_u8x8xn<4>(dst + y * dstStride, 8, d); + + t0 = vaddq_u16(t0, sub_bottomLeft_above0); + t1 = vaddq_u16(t1, sub_bottomLeft_above1); + t2 = vaddq_u16(t2, sub_bottomLeft_above2); + t3 = vaddq_u16(t3, sub_bottomLeft_above3); + } +} +#endif + static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size) { // boundary pixels processing @@ -576,8 +707,10 @@ p.cuBLOCK_32x32.intra_pred_allangs = all_angs_pred_neon<5>; #if !HIGH_BIT_DEPTH + p.cuBLOCK_4x4.intra_predPLANAR_IDX = intra_pred_planar4_neon; p.cuBLOCK_8x8.intra_predPLANAR_IDX = PFX(intra_pred_planar8_neon); p.cuBLOCK_16x16.intra_predPLANAR_IDX = PFX(intra_pred_planar16_neon); + p.cuBLOCK_32x32.intra_predPLANAR_IDX = intra_pred_planar32_neon; #endif p.cuBLOCK_4x4.intra_predDC_IDX = intra_pred_dc_neon<4>;
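Both new planar kernels implement standard HEVC planar interpolation; the constant tables are just the (x + 1), (blkSize - 1 - x), (y + 1) and (blkSize - 1 - y) weights pre-baked per lane, and the per-row update adds (bottomLeft - above[x]) rather than recomputing the vertical weights from scratch. A scalar reference of what is being vectorized (a sketch for 8-bit pixels; above and left follow the srcPix layout used above, with above at srcPix + 1 and left at srcPix + 2 * blkSize + 1):

#include <cstdint>
#include <cstddef>

static void intra_planar_ref(uint8_t *dst, std::ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left,
                             int log2Size)
{
    const int size = 1 << log2Size;
    const int topRight = above[size];
    const int bottomLeft = left[size];

    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            dst[y * stride + x] = (uint8_t)(
                ((size - 1 - x) * left[y] + (x + 1) * topRight +
                 (size - 1 - y) * above[x] + (y + 1) * bottomLeft +
                 size) >> (log2Size + 1));
}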
x265-4.1.tar/source/common/aarch64/loopfilter-prim.cpp -> x265-4.2.tar/source/common/aarch64/loopfilter-prim.cpp
Changed
@@ -1,9 +1,10 @@
 #include "common.h"
 #include "loopfilter-prim.h"
+#include "mem-neon.h"
 
 #define PIXEL_MIN 0
-
+using namespace X265_NS;
 
 #if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
 #include<arm_neon.h>
@@ -255,9 +256,138 @@
     }
 }
 
+void pelFilterLumaStrong_V_neon(pixel *src, intptr_t srcStep, intptr_t offset,
+                                int32_t tcP, int32_t tcQ)
+{
+    X265_CHECK(offset == 1, "Offset value must be 1 for LumaStrong Vertical\n");
+
+    src -= offset * 4;
+
+    const int16x8_t tc_vec = vcombine_s16(vdup_n_s16(tcP), vdup_n_s16(tcQ));
+    const int16x8_t neg_tc_vec = vnegq_s16(tc_vec);
+
+    static const uint8_t filter[3][8] =
+    {
+        { 0, 2, 1, 2, 2, 1, 1, 0 },
+        { 0, 3, 1, 2, 2, 1, 3, 0 },
+        { 0, 1, 1, 2, 2, 1, 2, 0 },
+    };
+
+    const uint8x8_t f0 = vld1_u8(filter[0]);
+    const uint8x8_t f1 = vld1_u8(filter[1]);
+    const uint8x8_t f2 = vld1_u8(filter[2]);
+
+    // -1 index means value is zero because TBL instructions
+    // zero elements that have out of range indices.
+    const uint8x8_t idx0 = { 255, 0, 1, 2, 3, 4, 5, 255 };
+    const uint8x8_t idx1 = { 255, 1, 2, 3, 4, 5, 6, 255 };
+    const uint8x8_t idx2 = { 255, 2, 3, 4, 5, 6, 7, 255 };
+    const uint8x8_t idx3 = { 255, 3, 4, 5, 6, 255, 3, 255 };
+    const uint8x8_t idx4 = { 255, 4, 255, 1, 2, 3, 4, 255 };
+
+    const int16x8_t neg_shift = { 0, -3, -2, -3, -3, -2, -3, 0 };
+
+    for (int i = 0; i < UNIT_SIZE; i++, src += srcStep)
+    {
+        uint8x8_t s = vld1_u8(src);
+        uint8x8_t s0 = vtbl1_u8(s, idx0);
+        uint8x8_t s1 = vtbl1_u8(s, idx1);
+        uint8x8_t s2 = vtbl1_u8(s, idx2);
+        uint8x8_t s3 = vtbl1_u8(s, idx3);
+        uint8x8_t s4 = vtbl1_u8(s, idx4);
+
+        uint16x8_t s34 = vaddl_u8(s3, s4);
+        uint16x8_t sum = vmlal_u8(s34, s0, f0);
+        sum = vmlal_u8(sum, s1, f1);
+        sum = vmlal_u8(sum, s2, f2);
+
+        sum = vrshlq_u16(sum, neg_shift);
+        sum = vsubw_u8(sum, s1);
+        sum = vreinterpretq_u16_s16(
+            vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(sum))));
+
+        uint8x8_t d = vmovn_u16(sum);
+        d = vadd_u8(d, s);
+        vst1_u8(src, d);
+    }
+}
+
+void pelFilterLumaStrong_H_neon(pixel *src, intptr_t srcStep, intptr_t offset,
+                                int32_t tcP, int32_t tcQ)
+{
+    X265_CHECK(UNIT_SIZE == 4 && srcStep == 1,
+               "UNIT_SIZE must be 4 and srcStep must be 1 for LumaStrong Horizontal\n");
+
+    (void)srcStep;
+
+    const int16x8_t tc_vec = vcombine_s16(vdup_n_s16(tcP), vdup_n_s16(tcQ));
+    const int16x8_t neg_tc_vec = vnegq_s16(tc_vec);
+
+    uint8x8_t m0 = vld1_u8(src - 4 * offset);
+    uint8x8_t m1 = vld1_u8(src - 3 * offset);
+    uint8x8_t m2 = vld1_u8(src - 2 * offset);
+    uint8x8_t m3 = vld1_u8(src - 1 * offset);
+    uint8x8_t m4 = vld1_u8(src - 0 * offset);
+    uint8x8_t m5 = vld1_u8(src + 1 * offset);
+    uint8x8_t m6 = vld1_u8(src + 2 * offset);
+    uint8x8_t m7 = vld1_u8(src + 3 * offset);
+
+    uint8x8_t m12 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), vreinterpret_u32_u8(m2)));
+    uint8x8_t m23 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), vreinterpret_u32_u8(m3)));
+    uint8x8_t m34 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m3), vreinterpret_u32_u8(m4)));
+    uint8x8_t m45 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m4), vreinterpret_u32_u8(m5)));
+    uint8x8_t m56 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m5), vreinterpret_u32_u8(m6)));
+
+    // src[-1 * offset], src[0 * offset]
+    uint16x8_t p0 = vaddl_u8(m23, m34);
+    p0 = vaddw_u8(p0, m45);
+    uint16x8_t t0 = vshlq_n_u16(p0, 1);
+    uint16x8_t t1 = vaddl_u8(m12, m56);
+    uint16x8_t t01 = vaddq_u16(t0, t1);
+    t01 = vrshrq_n_u16(t01, 3);
+    t01 = vsubw_u8(t01, m34);
+    t01 = vreinterpretq_u16_s16(
+        vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t01))));
+    uint8x8_t d01 = vmovn_u16(t01);
+    d01 = vadd_u8(d01, m34);
+    store_u8x4_strided_xN<2>(src - 1 * offset, 1 * offset, &d01);
+
+    uint8x8_t m16 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), vreinterpret_u32_u8(m6)));
+    uint8x8_t m25 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), vreinterpret_u32_u8(m5)));
+
+    // src[-2 * offset], src[1 * offset]
+    uint16x8_t p1 = vaddw_u8(p0, m16);
+    uint16x8_t t23 = vrshrq_n_u16(p1, 2);
+    t23 = vsubw_u8(t23, m25);
+    t23 = vreinterpretq_u16_s16(
+        vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t23))));
+    uint8x8_t d23 = vmovn_u16(t23);
+    d23 = vadd_u8(d23, m25);
+    store_u8x4_strided_xN<2>(src - 2 * offset, 3 * offset, &d23);
+
+    uint8x8_t m07 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m0), vreinterpret_u32_u8(m7)));
+
+    // src[-3 * offset], src[2 * offset]
+    uint16x8_t p2 = vaddl_u8(m07, m16);
+    uint16x8_t t45 = vmlaq_n_u16(p1, p2, 2);
+    t45 = vrshrq_n_u16(t45, 3);
+    t45 = vsubw_u8(t45, m16);
+    t45 = vreinterpretq_u16_s16(
+        vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t45))));
+    uint8x8_t d45 = vmovn_u16(t45);
+    d45 = vadd_u8(d45, m16);
+    store_u8x4_strided_xN<2>(src - 3 * offset, 5 * offset, &d45);
+}
+} // namespace
 
 namespace X265_NS
 {
@@ -273,6 +403,8 @@
         p.saoCuOrgB0 = processSaoCUB0_neon;
         p.sign = calSign_neon;
 
+        p.pelFilterLumaStrong[0] = pelFilterLumaStrong_V_neon;
+        p.pelFilterLumaStrong[1] = pelFilterLumaStrong_H_neon;
     }
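Both kernels above lean on an AArch64 TBL property: table-lookup indices that are out of range (the 255 entries) produce zero lanes rather than faulting, so taps that must not contribute at the block borders are masked out for free. A minimal, self-contained sketch of that behaviour (illustrative only, not taken from the patch):

    #include <arm_neon.h>
    #include <cstdio>

    int main()
    {
        const uint8x8_t table = { 10, 20, 30, 40, 50, 60, 70, 80 };
        // Indices 0..7 select table bytes; 255 is out of range, so TBL
        // writes zero to those lanes instead of faulting.
        const uint8x8_t idx = { 255, 0, 1, 2, 3, 4, 5, 255 };
        uint8x8_t r = vtbl1_u8(table, idx);

        uint8_t out[8];
        vst1_u8(out, r);
        for (int i = 0; i < 8; i++)
            printf("%u ", out[i]);      // prints: 0 10 20 30 40 50 60 0
        printf("\n");
        return 0;
    }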
View file
x265-4.1.tar/source/common/aarch64/mc-a-sve2.S -> x265-4.2.tar/source/common/aarch64/mc-a-sve2.S
Changed
@@ -298,627 +298,3 @@ pixel_avg_pp_64xN_sve2 32 pixel_avg_pp_64xN_sve2 48 pixel_avg_pp_64xN_sve2 64 - -// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) - -.macro addAvg_2xN_sve2 h -function PFX(addAvg_2x\h\()_sve2) - ptrue p0.s, vl2 - ptrue p1.h, vl4 - ptrue p2.h, vl2 -.rept \h / 2 - ld1rw {z0.s}, p0/z, x0 - ld1rw {z1.s}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - ld1rw {z2.s}, p0/z, x0 - ld1rw {z3.s}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - add z0.h, p1/m, z0.h, z1.h - add z2.h, p1/m, z2.h, z3.h - sqrshrnb z0.b, z0.h, #7 - add z0.b, z0.b, #0x80 - sqrshrnb z2.b, z2.h, #7 - add z2.b, z2.b, #0x80 - st1b {z0.h}, p2, x2 - add x2, x2, x5 - st1b {z2.h}, p2, x2 - add x2, x2, x5 -.endr - ret -endfunc -.endm - -addAvg_2xN_sve2 4 -addAvg_2xN_sve2 8 -addAvg_2xN_sve2 16 - -.macro addAvg_6xN_sve2 h -function PFX(addAvg_6x\h\()_sve2) - mov w12, #\h / 2 - ptrue p0.b, vl16 - ptrue p2.h, vl6 -.Loop_sve2_addavg_6x\h\(): - sub w12, w12, #1 - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - ld1b {z2.b}, p0/z, x0 - ld1b {z3.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - add z0.h, p0/m, z0.h, z1.h - add z2.h, p0/m, z2.h, z3.h - sqrshrnb z0.b, z0.h, #7 - sqrshrnb z2.b, z2.h, #7 - add z0.b, z0.b, #0x80 - add z2.b, z2.b, #0x80 - st1b {z0.h}, p2, x2 - add x2, x2, x5 - st1b {z2.h}, p2, x2 - add x2, x2, x5 - cbnz w12, .Loop_sve2_addavg_6x\h - ret -endfunc -.endm - -addAvg_6xN_sve2 8 -addAvg_6xN_sve2 16 - -.macro addAvg_8xN_sve2 h -function PFX(addAvg_8x\h\()_sve2) - ptrue p0.b, vl16 -.rept \h / 2 - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - ld1b {z2.b}, p0/z, x0 - ld1b {z3.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - add z0.h, p0/m, z0.h, z1.h - add z2.h, p0/m, z2.h, z3.h - sqrshrnb z0.b, z0.h, #7 - add z0.b, z0.b, #0x80 - sqrshrnb z2.b, z2.h, #7 - add z2.b, z2.b, #0x80 - st1b {z0.h}, p0, x2 - add x2, x2, x5 - st1b {z2.h}, p0, x2 - add x2, x2, x5 -.endr - ret -endfunc -.endm - -.macro addAvg_8xN1_sve2 h -function PFX(addAvg_8x\h\()_sve2) - mov w12, #\h / 2 - ptrue p0.b, vl16 -.Loop_sve2_addavg_8x\h\(): - sub w12, w12, #1 - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - ld1b {z2.b}, p0/z, x0 - ld1b {z3.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - add z0.h, p0/m, z0.h, z1.h - add z2.h, p0/m, z2.h, z3.h - sqrshrnb z0.b, z0.h, #7 - add z0.b, z0.b, #0x80 - sqrshrnb z2.b, z2.h, #7 - add z2.b, z2.b, #0x80 - st1b {z0.h}, p0, x2 - add x2, x2, x5 - st1b {z2.h}, p0, x2 - add x2, x2, x5 - cbnz w12, .Loop_sve2_addavg_8x\h - ret -endfunc -.endm - -addAvg_8xN_sve2 2 -addAvg_8xN_sve2 4 -addAvg_8xN_sve2 6 -addAvg_8xN_sve2 8 -addAvg_8xN_sve2 12 -addAvg_8xN_sve2 16 -addAvg_8xN1_sve2 32 -addAvg_8xN1_sve2 64 - -.macro addAvg_12xN_sve2 h -function PFX(addAvg_12x\h\()_sve2) - mov w12, #\h - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_addAvg_12x\h - ptrue p0.b, vl16 - ptrue p1.b, vl8 -.Loop_sve2_addavg_12x\h\(): - sub w12, w12, #1 - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p0/z, x1 - ld1b {z2.b}, p1/z, x0, #1, mul vl - ld1b {z3.b}, p1/z, x1, #1, mul vl - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - add z0.h, p0/m, z0.h, z1.h - add z2.h, p1/m, z2.h, z3.h - sqrshrnb z0.b, z0.h, #7 - add z0.b, z0.b, #0x80 - sqrshrnb z2.b, z2.h, #7 - add z2.b, z2.b, #0x80 - st1b {z0.h}, p0, x2 - st1b {z2.h}, p1, x2, #1, mul vl - add 
x2, x2, x5 - cbnz w12, .Loop_sve2_addavg_12x\h - ret -.vl_gt_16_addAvg_12x\h\(): - mov x10, #24 - mov x11, #0 - whilelt p0.b, x11, x10 -.Loop_sve2_gt_16_addavg_12x\h\(): - sub w12, w12, #1 - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p0/z, x1 - add x0, x0, x3, lsl #1 - add x1, x1, x4, lsl #1 - add z0.h, p0/m, z0.h, z1.h - sqrshrnb z0.b, z0.h, #7 - add z0.b, z0.b, #0x80 - sqrshrnb z2.b, z2.h, #7 - add z2.b, z2.b, #0x80 - st1b {z0.h}, p0, x2 - add x2, x2, x5 - cbnz w12, .Loop_sve2_gt_16_addavg_12x\h - ret -endfunc -.endm - -addAvg_12xN_sve2 16 -addAvg_12xN_sve2 32 - -.macro addAvg_16xN_sve2 h -function PFX(addAvg_16x\h\()_sve2) - mov w12, #\h - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_addAvg_16x\h - ptrue p0.b, vl16 -.Loop_eq_16_sve2_addavg_16x\h\(): - sub w12, w12, #1 - ld1b {z0.b}, p0/z, x0
View file
x265-4.1.tar/source/common/aarch64/mc-a.S -> x265-4.2.tar/source/common/aarch64/mc-a.S
Changed
@@ -38,10 +38,13 @@ .macro pixel_avg_pp_4xN_neon h function PFX(pixel_avg_pp_4x\h\()_neon) .rept \h - ld1 {v0.s}0, x2, x3 - ld1 {v1.s}0, x4, x5 + ldr s0, x2 + ldr s1, x4 + add x2, x2, x3 + add x4, x4, x5 urhadd v2.8b, v0.8b, v1.8b - st1 {v2.s}0, x0, x1 + str s2, x0 + add x0, x0, x1 .endr ret endfunc @@ -73,13 +76,13 @@ sub x3, x3, #4 sub x5, x5, #4 .rept 16 - ld1 {v0.s}0, x2, #4 + ldr s0, x2, #4 ld1 {v1.8b}, x2, x3 - ld1 {v2.s}0, x4, #4 + ldr s2, x4, #4 ld1 {v3.8b}, x4, x5 urhadd v4.8b, v0.8b, v2.8b urhadd v5.8b, v1.8b, v3.8b - st1 {v4.s}0, x0, #4 + str s4, x0, #4 st1 {v5.8b}, x0, x1 .endr ret @@ -214,344 +217,3 @@ pixel_avg_pp_64xN_neon 32 pixel_avg_pp_64xN_neon 48 pixel_avg_pp_64xN_neon 64 - -// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) -.macro addAvg_2xN h -function PFX(addAvg_2x\h\()_neon) - addAvg_start -.rept \h / 2 - ldr w10, x0 - ldr w11, x1 - add x0, x0, x3 - add x1, x1, x4 - ldr w12, x0 - ldr w13, x1 - add x0, x0, x3 - add x1, x1, x4 - dup v0.2s, w10 - dup v1.2s, w11 - dup v2.2s, w12 - dup v3.2s, w13 - add v0.4h, v0.4h, v1.4h - add v2.4h, v2.4h, v3.4h - saddl v0.4s, v0.4h, v30.4h - saddl v2.4s, v2.4h, v30.4h - shrn v0.4h, v0.4s, #7 - shrn2 v0.8h, v2.4s, #7 - sqxtun v0.8b, v0.8h - st1 {v0.h}0, x2, x5 - st1 {v0.h}2, x2, x5 -.endr - ret -endfunc -.endm - -addAvg_2xN 4 -addAvg_2xN 8 -addAvg_2xN 16 - -.macro addAvg_4xN h -function PFX(addAvg_4x\h\()_neon) - addAvg_start -.rept \h / 2 - ld1 {v0.8b}, x0, x3 - ld1 {v1.8b}, x1, x4 - ld1 {v2.8b}, x0, x3 - ld1 {v3.8b}, x1, x4 - add v0.4h, v0.4h, v1.4h - add v2.4h, v2.4h, v3.4h - saddl v0.4s, v0.4h, v30.4h - saddl v2.4s, v2.4h, v30.4h - shrn v0.4h, v0.4s, #7 - shrn2 v0.8h, v2.4s, #7 - sqxtun v0.8b, v0.8h - st1 {v0.s}0, x2, x5 - st1 {v0.s}1, x2, x5 -.endr - ret -endfunc -.endm - -addAvg_4xN 2 -addAvg_4xN 4 -addAvg_4xN 8 -addAvg_4xN 16 -addAvg_4xN 32 - -.macro addAvg_6xN h -function PFX(addAvg_6x\h\()_neon) - addAvg_start - mov w12, #\h / 2 - sub x5, x5, #4 -.Loop_addavg_6x\h: - sub w12, w12, #1 - ld1 {v0.16b}, x0, x3 - ld1 {v1.16b}, x1, x4 - ld1 {v2.16b}, x0, x3 - ld1 {v3.16b}, x1, x4 - add v0.8h, v0.8h, v1.8h - add v2.8h, v2.8h, v3.8h - saddl v16.4s, v0.4h, v30.4h - saddl2 v17.4s, v0.8h, v30.8h - saddl v18.4s, v2.4h, v30.4h - saddl2 v19.4s, v2.8h, v30.8h - shrn v0.4h, v16.4s, #7 - shrn2 v0.8h, v17.4s, #7 - shrn v1.4h, v18.4s, #7 - shrn2 v1.8h, v19.4s, #7 - sqxtun v0.8b, v0.8h - sqxtun v1.8b, v1.8h - str s0, x2, #4 - st1 {v0.h}2, x2, x5 - str s1, x2, #4 - st1 {v1.h}2, x2, x5 - cbnz w12, .Loop_addavg_6x\h - ret -endfunc -.endm - -addAvg_6xN 8 -addAvg_6xN 16 - -.macro addAvg_8xN h -function PFX(addAvg_8x\h\()_neon) - addAvg_start -.rept \h / 2 - ld1 {v0.16b}, x0, x3 - ld1 {v1.16b}, x1, x4 - ld1 {v2.16b}, x0, x3 - ld1 {v3.16b}, x1, x4 - add v0.8h, v0.8h, v1.8h - add v2.8h, v2.8h, v3.8h - saddl v16.4s, v0.4h, v30.4h - saddl2 v17.4s, v0.8h, v30.8h - saddl v18.4s, v2.4h, v30.4h - saddl2 v19.4s, v2.8h, v30.8h - shrn v0.4h, v16.4s, #7 - shrn2 v0.8h, v17.4s, #7 - shrn v1.4h, v18.4s, #7 - shrn2 v1.8h, v19.4s, #7 - sqxtun v0.8b, v0.8h - sqxtun v1.8b, v1.8h - st1 {v0.8b}, x2, x5 - st1 {v1.8b}, x2, x5 -.endr - ret -endfunc -.endm - -.macro addAvg_8xN1 h -function PFX(addAvg_8x\h\()_neon) - addAvg_start - mov w12, #\h / 2 -.Loop_addavg_8x\h: - sub w12, w12, #1 - ld1 {v0.16b}, x0, x3 - ld1 {v1.16b}, x1, x4 - ld1 {v2.16b}, x0, x3 - ld1 {v3.16b}, x1, x4 - add v0.8h, v0.8h, v1.8h - add v2.8h, v2.8h, v3.8h - saddl v16.4s, v0.4h, v30.4h - saddl2 v17.4s, v0.8h, v30.8h - saddl 
v18.4s, v2.4h, v30.4h - saddl2 v19.4s, v2.8h, v30.8h - shrn v0.4h, v16.4s, #7 - shrn2 v0.8h, v17.4s, #7 - shrn v1.4h, v18.4s, #7 - shrn2 v1.8h, v19.4s, #7 - sqxtun v0.8b, v0.8h - sqxtun v1.8b, v1.8h - st1 {v0.8b}, x2, x5 - st1 {v1.8b}, x2, x5 - cbnz w12, .Loop_addavg_8x\h - ret -endfunc -.endm - -addAvg_8xN 2 -addAvg_8xN 4 -addAvg_8xN 6 -addAvg_8xN 8 -addAvg_8xN 12 -addAvg_8xN 16 -addAvg_8xN1 32
View file
x265-4.1.tar/source/common/aarch64/mem-neon.h -> x265-4.2.tar/source/common/aarch64/mem-neon.h
Changed
@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2024 MulticoreWare, Inc
+ * Copyright (C) 2024-2025 MulticoreWare, Inc
 *
 * Authors: Hari Limaye <hari.limaye@arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more@arm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -28,6 +29,22 @@
 #include <cassert>
 #include <stdint.h>
 
+using namespace X265_NS;
+
+template<int N>
+static void inline store_u8x2_strided_xN(uint8_t *d, intptr_t stride,
+                                         const uint8x8_t *s)
+{
+    X265_CHECK(N % 2 == 0, "N should be divisible by 2");
+    for (int i = 0; i < N / 2; ++i)
+    {
+        vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s[i]), 0);
+        d += stride;
+        vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s[i]), 2);
+        d += stride;
+    }
+}
+
 // Load 4 bytes into the low half of a uint8x8_t, zero the upper half.
 static uint8x8_t inline load_u8x4x1(const uint8_t *s)
 {
@@ -57,6 +74,26 @@
     vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
 }
 
+// Store 2 bytes from the low half of a uint8x8_t.
+static void inline store_u8x2x1(uint8_t *d, const uint8x8_t s)
+{
+    vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s), 0);
+}
+
+// Load 2 int16_t into a int16x8_t.
+static inline int16x8_t load_s16x2x1(const int16_t *p)
+{
+    int32x4_t ret = vld1q_lane_s32((const int32_t *)p, vdupq_n_s32(0), 0);
+
+    return vreinterpretq_s16_s32(ret);
+}
+
+// Store 2 uint16_t from the low half of a uint16x8_t.
+static inline void store_u16x2x1(const uint16_t *d, const uint16x8_t s)
+{
+    vst1q_lane_u32((uint32_t *)d, vreinterpretq_u32_u16(s), 0);
+}
+
 // Store N blocks of 32-bits from (N / 2) D-Registers.
 template<int N>
 static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride,
@@ -89,8 +126,7 @@
 {
     for (int i = 0; i < N; ++i)
     {
-        dst[i] = vld1q_u8(src);
-        src += stride;
+        dst[i] = vld1q_u8(src + i * stride);
     }
 }
 
@@ -152,6 +188,17 @@
     }
 }
 
+template<int N, int M>
+static void inline store_u8xnxm_strided(uint8_t *dst, intptr_t dst_stride,
+                                        const uint8x8_t *src)
+{
+    switch (N)
+    {
+    case 2: return store_u8x2_strided_xN<M>(dst, dst_stride, src);
+    case 4: return store_u8x4_strided_xN<M>(dst, dst_stride, src);
+    }
+}
+
 template<int N>
 static void inline store_u8x16xn(uint8_t *dst, intptr_t dst_stride,
                                  const uint8x16_t *src)
@@ -186,6 +233,94 @@
 }
 
 template<int N>
+static void inline load_u16x4xn(const uint16_t *src, const intptr_t stride,
+                                uint16x4_t *dst)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        dst[i] = vld1_u16(src);
+        src += stride;
+    }
+}
+
+template<int N>
+static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
+                                uint16x8_t *dst)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        dst[i] = vld1q_u16(src + i * stride);
+    }
+}
+
+template<int N>
+static void inline store_u16x2xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x4_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u16(src[i]), 0);
+        dst += dst_stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x2xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x8_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1q_lane_u32((uint32_t *)dst, vreinterpretq_u32_u16(src[i]), 0);
+        dst += dst_stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x4xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x4_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_u16(dst, src[i]);
+        dst += dst_stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x4xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x8_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_u16(dst, vget_low_u16(src[i]));
+        dst += dst_stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x6xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x8_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_u16(dst, vget_low_u16(src[i]));
+        vst1q_lane_u32((uint32_t *)(dst + 4), vreinterpretq_u32_u16(src[i]), 2);
+        dst += dst_stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x8xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x8_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1q_u16(dst, src[i]);
+        dst += dst_stride;
+    }
+}
+
+template<int N>
 static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
                                  const int16x4_t *src)
 {
@@ -265,4 +400,39 @@
     }
 }
 
+template<int N, int M>
+static void inline store_s16xnxm(const int16x4_t *src, int16_t *dst,
+                                 intptr_t dst_stride)
+{
+    switch (N)
+    {
+    case 2: return store_s16x2xn<M>(dst, dst_stride, src);
+    case 4: return store_s16x4xn<M>(dst, dst_stride, src);
+    }
+}
+
+template<int N, int M>
+static void inline store_u16xnxm(uint16_t *dst, intptr_t dst_stride,
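The strided-store templates added here let a kernel keep several short rows packed in one D register and scatter them to memory in a single call. A small usage sketch; the helper body is restated so the example compiles standalone, and the buffer contents are illustrative:

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    // Same shape as the store_u8x4_strided_xN<N> helper above: store N
    // 4-byte blocks, one per row, taken from consecutive 32-bit lanes.
    template<int N>
    static inline void store_u8x4_strided(uint8_t *d, intptr_t stride,
                                          const uint8x8_t *s)
    {
        for (int i = 0; i < N / 2; i++)
        {
            vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 0);
            d += stride;
            vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 1);
            d += stride;
        }
    }

    int main()
    {
        uint8_t dst[2][8] = {};
        const uint8x8_t rows = { 1, 2, 3, 4, 5, 6, 7, 8 }; // rows 0 and 1, packed
        store_u8x4_strided<2>(dst[0], 8, &rows);
        printf("%u %u\n", dst[0][0], dst[1][3]);           // prints: 1 8
        return 0;
    }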
View file
x265-4.1.tar/source/common/aarch64/neon-sve-bridge.h -> x265-4.2.tar/source/common/aarch64/neon-sve-bridge.h
Changed
@@ -3,6 +3,7 @@
 *
 * Authors: Hari Limaye <hari.limaye@arm.com>
 *          Jonathan Wright <jonathan.wright@arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more@arm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -52,6 +53,24 @@
                                           svset_neonq_s16(svundef_s16(), y)));
 }
 
+#define x265_sdotq_lane_s16(sum, s0, f, lane)                                \
+    svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum),      \
+                                   svset_neonq_s16(svundef_s16(), s0),       \
+                                   svset_neonq_s16(svundef_s16(), f), lane))
+
+static inline uint64x2_t x265_udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y)
+{
+    return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+                                     svset_neonq_u16(svundef_u16(), x),
+                                     svset_neonq_u16(svundef_u16(), y)));
+}
+
+static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx)
+{
+    return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x),
+                                     svset_neonq_u16(svundef_u16(), idx)));
+}
+
 static inline int8x16_t x265_sve_mask(const int x, const int endX,
                                       const int8x16_t in)
 {
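All three additions follow the same NEON-SVE bridge pattern: place a 128-bit NEON vector in the low bits of an SVE register with svset_neonq, run the SVE-only instruction, and pull the NEON view back out with svget_neonq. That lets fixed-width NEON code borrow instructions like the 64-bit-accumulating UDOT. A standalone sketch of the idea, assuming a compiler targeting SVE (e.g. -march=armv8.2-a+sve) with arm_neon_sve_bridge.h available; the helper and function names here are ours:

    #include <arm_neon.h>
    #include <arm_sve.h>
    #include <arm_neon_sve_bridge.h>

    // Widening dot product of 16-bit lanes into 64-bit accumulators,
    // mirroring x265_udotq_u16 above.
    static inline uint64x2_t udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y)
    {
        return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
                                         svset_neonq_u16(svundef_u16(), x),
                                         svset_neonq_u16(svundef_u16(), y)));
    }

    // Sum of squares of 8 pixels in one dot-product instruction.
    uint64_t sum_of_squares_8(const uint16_t *p)
    {
        uint16x8_t v = vld1q_u16(p);
        uint64x2_t sq = udotq_u16(vdupq_n_u64(0), v, v);
        return vaddvq_u64(sq);   // p[0]^2 + ... + p[7]^2
    }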
View file
x265-4.1.tar/source/common/aarch64/p2s-sve.S -> x265-4.2.tar/source/common/aarch64/p2s-sve.S
Changed
@@ -401,10 +401,12 @@
     ret
 .vl_gt_16_filterPixelToShort_high_48x64:
     ptrue           p0.h, vl16
+    mov             x4, #16
+    mov             x5, #32
 .rept 64
     ld1h            {z0.h}, p0/z, [x0]
-    ld1h            {z1.h}, p0/z, [x0, #1, mul vl]
-    ld1h            {z2.h}, p0/z, [x0, #2, mul vl]
+    ld1h            {z1.h}, p0/z, [x0, x4, lsl #1]
+    ld1h            {z2.h}, p0/z, [x0, x5, lsl #1]
     add             x0, x0, x1
     lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
     lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
@@ -413,8 +415,8 @@
     add             z1.h, p0/m, z1.h, z31.h
     add             z2.h, p0/m, z2.h, z31.h
     st1h            {z0.h}, p0, [x2]
-    st1h            {z1.h}, p0, [x2, #1, mul vl]
-    st1h            {z2.h}, p0, [x2, #2, mul vl]
+    st1h            {z1.h}, p0, [x2, x4, lsl #1]
+    st1h            {z2.h}, p0, [x2, x5, lsl #1]
     add             x2, x2, x3
 .endr
     ret
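A note on the addressing change above: the immediate `#1, mul vl` and `#2, mul vl` forms scale the offset by the hardware vector length, while `ptrue p0.h, vl16` pins the active element count at 16, so the second and third loads only land on elements 16 and 32 of the fixed 48-wide row when the vector length happens to be 256 bits. Moving the element offsets into x4/x5 and using register-offset addressing (`[x0, x4, lsl #1]`) keeps the byte offsets independent of the vector length, which appears to be the point of this fix.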
View file
x265-4.2.tar/source/common/aarch64/pixel-prim-neon-dotprod.cpp
Added
@@ -0,0 +1,115 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Li Zhang <li.zhang2@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "pixel-prim.h"
+#include "mem-neon.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#if !HIGH_BIT_DEPTH
+template<int size>
+uint64_t pixel_var_neon_dotprod(const uint8_t *pix, intptr_t i_stride)
+{
+    if (size >= 16)
+    {
+        uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+        uint32x4_t sqr[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+        for (int h = 0; h < size; h += 2)
+        {
+            for (int w = 0; w + 16 <= size; w += 16)
+            {
+                uint8x16_t s[2];
+                load_u8x16xn<2>(pix + w, i_stride, s);
+
+                sum[0] = vdotq_u32(sum[0], s[0], vdupq_n_u8(1));
+                sum[1] = vdotq_u32(sum[1], s[1], vdupq_n_u8(1));
+
+                sqr[0] = vdotq_u32(sqr[0], s[0], s[0]);
+                sqr[1] = vdotq_u32(sqr[1], s[1], s[1]);
+            }
+
+            pix += 2 * i_stride;
+        }
+
+        sum[0] = vaddq_u32(sum[0], sum[1]);
+        sqr[0] = vaddq_u32(sqr[0], sqr[1]);
+
+        return vaddvq_u32(sum[0]) + (vaddlvq_u32(sqr[0]) << 32);
+    }
+    if (size == 8)
+    {
+        uint16x8_t sum = vdupq_n_u16(0);
+        uint32x2_t sqr = vdup_n_u32(0);
+
+        for (int h = 0; h < size; ++h)
+        {
+            uint8x8_t s = vld1_u8(pix);
+
+            sum = vaddw_u8(sum, s);
+            sqr = vdot_u32(sqr, s, s);
+
+            pix += i_stride;
+        }
+
+        return vaddvq_u16(sum) + (vaddlv_u32(sqr) << 32);
+    }
+    if (size == 4)
+    {
+        uint16x8_t sum = vdupq_n_u16(0);
+        uint32x2_t sqr = vdup_n_u32(0);
+
+        for (int h = 0; h < size; h += 2)
+        {
+            uint8x8_t s = load_u8x4x2(pix, i_stride);
+
+            sum = vaddw_u8(sum, s);
+            sqr = vdot_u32(sqr, s, s);
+
+            pix += 2 * i_stride;
+        }
+
+        return vaddvq_u16(sum) + (vaddlv_u32(sqr) << 32);
+    }
+}
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+#if HIGH_BIT_DEPTH
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &)
+{
+}
+#else // !HIGH_BIT_DEPTH
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p)
+{
+    p.cu[BLOCK_4x4].var = pixel_var_neon_dotprod<4>;
+    p.cu[BLOCK_8x8].var = pixel_var_neon_dotprod<8>;
+    p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>;
+    p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>;
+    p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>;
+}
+#endif // HIGH_BIT_DEPTH
+}
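All pixel_var kernels share one return convention: the pixel sum in the low 32 bits and the sum of squares in the high 32 bits of a single uint64_t, which is why each reduction above ends in `+ (sqr << 32)`. For a 64x64 8-bit block both quantities fit comfortably (64 * 64 * 255 and 64 * 64 * 255^2 are both below 2^32). A scalar reference with the same contract, useful for cross-checking the vector kernels (the name is ours, not from the patch):

    #include <cstdint>

    // Scalar model of pixel_var: low 32 bits of the result hold the pixel
    // sum, high 32 bits hold the sum of squared pixels.
    template<int size>
    uint64_t pixelVarRef(const uint8_t *pix, intptr_t i_stride)
    {
        uint32_t sum = 0, sqr = 0;
        for (int h = 0; h < size; h++, pix += i_stride)
            for (int w = 0; w < size; w++)
            {
                sum += pix[w];
                sqr += pix[w] * pix[w];
            }
        return sum + ((uint64_t)sqr << 32);
    }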
View file
x265-4.2.tar/source/common/aarch64/pixel-prim-sve.cpp
Added
@@ -0,0 +1,141 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Li Zhang <li.zhang2@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "pixel-prim.h"
+#include "mem-neon.h"
+#include "neon-sve-bridge.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#if HIGH_BIT_DEPTH
+template<int size>
+uint64_t pixel_var_sve(const uint16_t *pix, intptr_t i_stride)
+{
+    if (size > 16)
+    {
+        uint64x2_t sum[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+        uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+        for (int h = 0; h < size; ++h)
+        {
+            for (int w = 0; w + 16 <= size; w += 16)
+            {
+                uint16x8_t s[2];
+                load_u16x8xn<2>(pix + w, 8, s);
+
+                sum[0] = x265_udotq_u16(sum[0], s[0], vdupq_n_u16(1));
+                sum[1] = x265_udotq_u16(sum[1], s[1], vdupq_n_u16(1));
+
+                sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]);
+                sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]);
+            }
+
+            pix += i_stride;
+        }
+
+        sum[0] = vaddq_u64(sum[0], sum[1]);
+        sqr[0] = vaddq_u64(sqr[0], sqr[1]);
+
+        return vaddvq_u64(sum[0]) + (vaddvq_u64(sqr[0]) << 32);
+    }
+    if (size == 16)
+    {
+        uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+        uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+        for (int h = 0; h < size; ++h)
+        {
+            uint16x8_t s[2];
+            load_u16x8xn<2>(pix, 8, s);
+
+            sum[0] = vaddq_u16(sum[0], s[0]);
+            sum[1] = vaddq_u16(sum[1], s[1]);
+
+            sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]);
+            sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]);
+
+            pix += i_stride;
+        }
+
+        uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+        sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+        sqr[0] = vaddq_u64(sqr[0], sqr[1]);
+
+        return vaddvq_u32(sum_u32) + (vaddvq_u64(sqr[0]) << 32);
+    }
+    if (size == 8)
+    {
+        uint16x8_t sum = vdupq_n_u16(0);
+        uint64x2_t sqr = vdupq_n_u64(0);
+
+        for (int h = 0; h < size; ++h)
+        {
+            uint16x8_t s = vld1q_u16(pix);
+
+            sum = vaddq_u16(sum, s);
+            sqr = x265_udotq_u16(sqr, s, s);
+
+            pix += i_stride;
+        }
+
+        return vaddlvq_u16(sum) + (vaddvq_u64(sqr) << 32);
+    }
+    if (size == 4)
+    {
+        uint16x4_t sum = vdup_n_u16(0);
+        uint32x4_t sqr = vdupq_n_u32(0);
+
+        for (int h = 0; h < size; ++h)
+        {
+            uint16x4_t s = vld1_u16(pix);
+
+            sum = vadd_u16(sum, s);
+            sqr = vmlal_u16(sqr, s, s);
+
+            pix += i_stride;
+        }
+
+        return vaddv_u16(sum) + (vaddlvq_u32(sqr) << 32);
+    }
+}
+#endif // HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+#if HIGH_BIT_DEPTH
+void setupPixelPrimitives_sve(EncoderPrimitives &p)
+{
+    p.cu[BLOCK_4x4].var = pixel_var_sve<4>;
+    p.cu[BLOCK_8x8].var = pixel_var_sve<8>;
+    p.cu[BLOCK_16x16].var = pixel_var_sve<16>;
+    p.cu[BLOCK_32x32].var = pixel_var_sve<32>;
+    p.cu[BLOCK_64x64].var = pixel_var_sve<64>;
+}
+#else // !HIGH_BIT_DEPTH
+void setupPixelPrimitives_sve(EncoderPrimitives &)
+{
+}
+#endif // HIGH_BIT_DEPTH
+}
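A note on the accumulator widths chosen above: in the size == 16 path each 16-bit sum lane accumulates one pixel per row over 16 rows, and 16 * 4095 = 65520 still fits in a uint16_t even at 12-bit depth, so the sums can stay narrow until the final widening. The squares (up to 4095^2 per term) go straight into 64-bit lanes via the SVE UDOT, which is exactly the instruction the NEON-SVE bridge exists to reach. The size > 16 paths cannot make that overflow argument for the sums, hence the dot product against a vector of ones into 64-bit accumulators there.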
View file
x265-4.1.tar/source/common/aarch64/pixel-prim.cpp -> x265-4.2.tar/source/common/aarch64/pixel-prim.cpp
Changed
@@ -19,818 +19,806 @@ { -/* SATD SA8D variants - based on x264 */ -static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b) +static inline void sumsubq_s16(int16x8_t *sum, int16x8_t *sub, const int16x8_t a, const int16x8_t b) { - sum = vaddq_s16(a, b); - sub = vsubq_s16(a, b); + *sum = vaddq_s16(a, b); + *sub = vsubq_s16(a, b); } -static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2, - const int16x8_t s1, const int16x8_t s2) +static inline void transpose_s16_s16x2(int16x8_t *t1, int16x8_t *t2, + const int16x8_t s1, const int16x8_t s2) { - t1 = vtrn1q_s16(s1, s2); - t2 = vtrn2q_s16(s1, s2); + *t1 = vtrn1q_s16(s1, s2); + *t2 = vtrn2q_s16(s1, s2); } -static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2, - const int16x8_t s1, const int16x8_t s2) +static inline void transpose_s16_s32x2(int16x8_t *t1, int16x8_t *t2, + const int16x8_t s1, const int16x8_t s2) { int32x4_t tmp1 = vreinterpretq_s32_s16(s1); int32x4_t tmp2 = vreinterpretq_s32_s16(s2); - t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2)); - t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2)); + *t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2)); + *t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2)); } -static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2, - const int16x8_t s1, const int16x8_t s2) +static inline void transpose_s16_s64x2(int16x8_t *t1, int16x8_t *t2, + const int16x8_t s1, const int16x8_t s2) { int64x2_t tmp1 = vreinterpretq_s64_s16(s1); int64x2_t tmp2 = vreinterpretq_s64_s16(s2); - t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2)); - t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2)); + *t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2)); + *t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2)); } -static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2, - int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d) +static inline uint16x8_t max_abs_s16(const int16x8_t a, const int16x8_t b) { - SUMSUB_AB(s1, d1, a, b); - SUMSUB_AB(s2, d2, c, d); + uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(a)); + uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(b)); + + return vmaxq_u16(abs0, abs1); } -static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4, - int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4) +#if X265_DEPTH == 12 +static inline void sumsubq_s32(int32x4_t *sum, int32x4_t *sub, const int32x4_t a, const int32x4_t b) { - SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4); - SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4); + *sum = vaddq_s32(a, b); + *sub = vsubq_s32(a, b); } - -static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3) - +static inline void sumsublq_s16(int32x4_t *sum_lo, int32x4_t *sum_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi, + const int16x8_t a, const int16x8_t b) { + *sum_lo = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + *sub_lo = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + *sum_hi = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + *sub_hi = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); +} - int16x8_t v4, v5, v6, v7, v16, v17, v18, v19; - - - SUMSUB_AB(v16, v17, v0, v1); - SUMSUB_AB(v18, v19, v2, v3); - - SUMSUB_AB(v4 , v6 , v16, v18); - SUMSUB_AB(v5 , v7 , v17, v19); - - transpose_8h_8h(v0, v1, v4, v5); - transpose_8h_8h(v2, v3, v6, v7); +static inline void transpose_inplace_s32_s64x2(int32x4_t *t1, int32x4_t *t2) +{ + int64x2_t tmp1 = vreinterpretq_s64_s32(*t1); + int64x2_t tmp2 = vreinterpretq_s64_s32(*t2); - SUMSUB_AB(v16, 
v17, v0, v1); - SUMSUB_AB(v18, v19, v2, v3); + *t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2)); + *t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2)); +} - transpose_4s_8h(v0, v1, v16, v18); - transpose_4s_8h(v2, v3, v17, v19); +static inline uint32x4_t max_abs_s32(int32x4_t a, int32x4_t b) +{ + uint32x4_t abs0 = vreinterpretq_u32_s32(vabsq_s32(a)); + uint32x4_t abs1 = vreinterpretq_u32_s32(vabsq_s32(b)); - uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0)); - uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1)); - uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2)); - uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3)); + return vmaxq_u32(abs0, abs1); +} - uint16x8_t max0 = vmaxq_u16(abs0, abs1); - uint16x8_t max1 = vmaxq_u16(abs2, abs3); +#endif // X265_DEPTH == 12 - uint16x8_t sum = vaddq_u16(max0, max1); - return vaddlvq_u16(sum); +#if HIGH_BIT_DEPTH +static inline void load_diff_u16x8x4(const uint16_t *pix1, intptr_t stride_pix1, + const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff4) +{ + uint16x8_t r4, t4; + load_u16x8xn<4>(pix1, stride_pix1, r); + load_u16x8xn<4>(pix2, stride_pix2, t); + + diff0 = vreinterpretq_s16_u16(vsubq_u16(r0, t0)); + diff1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1)); + diff2 = vreinterpretq_s16_u16(vsubq_u16(r2, t2)); + diff3 = vreinterpretq_s16_u16(vsubq_u16(r3, t3)); } -static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1) +static inline void load_diff_u16x8x4_dual(const uint16_t *pix1, intptr_t stride_pix1, + const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff8) { - int16x8_t v2, v3; - SUMSUB_AB(v2, v3, v0, v1); - - transpose_2d_8h(v0, v1, v2, v3); - SUMSUB_AB(v2, v3, v0, v1); - - transpose_8h_8h(v0, v1, v2, v3); - SUMSUB_AB(v2, v3, v0, v1); + load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff); + load_diff_u16x8x4(pix1 + 4 * stride_pix1, stride_pix1, + pix2 + 4 * stride_pix2, stride_pix2, diff + 4); +} - transpose_4s_8h(v0, v1, v2, v3); +static inline void load_diff_u16x8x8(const uint16_t *pix1, intptr_t stride_pix1, + const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff8) +{ + uint16x8_t r8, t8; + load_u16x8xn<8>(pix1, stride_pix1, r); + load_u16x8xn<8>(pix2, stride_pix2, t); + + diff0 = vreinterpretq_s16_u16(vsubq_u16(r0, t0)); + diff1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1)); + diff2 = vreinterpretq_s16_u16(vsubq_u16(r2, t2)); + diff3 = vreinterpretq_s16_u16(vsubq_u16(r3, t3)); + diff4 = vreinterpretq_s16_u16(vsubq_u16(r4, t4)); + diff5 = vreinterpretq_s16_u16(vsubq_u16(r5, t5)); + diff6 = vreinterpretq_s16_u16(vsubq_u16(r6, t6)); + diff7 = vreinterpretq_s16_u16(vsubq_u16(r7, t7)); +} - uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0)); - uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1)); - uint16x8_t max = vmaxq_u16(abs0, abs1); +#else // !HIGH_BIT_DEPTH +static inline void load_diff_u8x8x4(const uint8_t *pix1, intptr_t stride_pix1, + const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff4) +{ + uint8x8_t r4, t4; + load_u8x8xn<4>(pix1, stride_pix1, r); + load_u8x8xn<4>(pix2, stride_pix2, t); + + diff0 = vreinterpretq_s16_u16(vsubl_u8(r0, t0)); + diff1 = vreinterpretq_s16_u16(vsubl_u8(r1, t1)); + diff2 = vreinterpretq_s16_u16(vsubl_u8(r2, t2)); + diff3 = vreinterpretq_s16_u16(vsubl_u8(r3, t3)); +} - return vaddlvq_u16(max); +static inline void load_diff_u8x8x8(const uint8_t *pix1, intptr_t stride_pix1, + const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff8)
View file
x265-4.1.tar/source/common/aarch64/pixel-prim.h -> x265-4.2.tar/source/common/aarch64/pixel-prim.h
Changed
@@ -15,7 +15,13 @@
 
 void setupPixelPrimitives_neon(EncoderPrimitives &p);
 
+#if defined(HAVE_NEON_DOTPROD)
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p);
+#endif
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
+void setupPixelPrimitives_sve(EncoderPrimitives &p);
+#endif
 
 }
View file
x265-4.1.tar/source/common/aarch64/pixel-util-common.S -> x265-4.2.tar/source/common/aarch64/pixel-util-common.S
Changed
@@ -34,33 +34,6 @@
 
 .align 4
 
-.macro pixel_var_start
-    movi            v0.16b, #0
-    movi            v1.16b, #0
-    movi            v2.16b, #0
-    movi            v3.16b, #0
-.endm
-
-.macro pixel_var_1 v
-    uaddw           v0.8h, v0.8h, \v\().8b
-    umull           v30.8h, \v\().8b, \v\().8b
-    uaddw2          v1.8h, v1.8h, \v\().16b
-    umull2          v31.8h, \v\().16b, \v\().16b
-    uadalp          v2.4s, v30.8h
-    uadalp          v3.4s, v31.8h
-.endm
-
-.macro pixel_var_end
-    uaddlv          s0, v0.8h
-    uaddlv          s1, v1.8h
-    add             v2.4s, v2.4s, v3.4s
-    fadd            s0, s0, s1
-    uaddlv          d2, v2.4s
-    fmov            w0, s0
-    fmov            x2, d2
-    orr             x0, x0, x2, lsl #32
-.endm
-
 .macro ssimDist_start
     movi            v0.16b, #0
     movi            v1.16b, #0
View file
x265-4.1.tar/source/common/aarch64/pixel-util-sve.S -> x265-4.2.tar/source/common/aarch64/pixel-util-sve.S
Changed
@@ -56,261 +56,3 @@ ret endfunc -//******* satd ******* -.macro satd_4x4_sve - ld1b {z0.h}, p0/z, x0 - ld1b {z2.h}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - ld1b {z1.h}, p0/z, x0 - ld1b {z3.h}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - ld1b {z4.h}, p0/z, x0 - ld1b {z6.h}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - ld1b {z5.h}, p0/z, x0 - ld1b {z7.h}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - - sub z0.h, z0.h, z2.h - sub z1.h, z1.h, z3.h - sub z2.h, z4.h, z6.h - sub z3.h, z5.h, z7.h - - add z4.h, z0.h, z2.h - add z5.h, z1.h, z3.h - sub z6.h, z0.h, z2.h - sub z7.h, z1.h, z3.h - - add z0.h, z4.h, z5.h - sub z1.h, z4.h, z5.h - - add z2.h, z6.h, z7.h - sub z3.h, z6.h, z7.h - - trn1 z4.h, z0.h, z2.h - trn2 z5.h, z0.h, z2.h - - trn1 z6.h, z1.h, z3.h - trn2 z7.h, z1.h, z3.h - - add z0.h, z4.h, z5.h - sub z1.h, z4.h, z5.h - - add z2.h, z6.h, z7.h - sub z3.h, z6.h, z7.h - - trn1 z4.s, z0.s, z1.s - trn2 z5.s, z0.s, z1.s - - trn1 z6.s, z2.s, z3.s - trn2 z7.s, z2.s, z3.s - - abs z4.h, p0/m, z4.h - abs z5.h, p0/m, z5.h - abs z6.h, p0/m, z6.h - abs z7.h, p0/m, z7.h - - smax z4.h, p0/m, z4.h, z5.h - smax z6.h, p0/m, z6.h, z7.h - - add z0.h, z4.h, z6.h - - uaddlp v0.2s, v0.4h - uaddlp v0.1d, v0.2s -.endm - -// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) -function PFX(pixel_satd_4x4_sve) - ptrue p0.h, vl4 - satd_4x4_sve - fmov x0, d0 - ret -endfunc - -function PFX(pixel_satd_8x4_sve) - ptrue p0.h, vl4 - mov x4, x0 - mov x5, x2 - satd_4x4_sve - add x0, x4, #4 - add x2, x5, #4 - umov x6, v0.d0 - satd_4x4_sve - umov x0, v0.d0 - add x0, x0, x6 - ret -endfunc - -function PFX(pixel_satd_8x12_sve) - ptrue p0.h, vl4 - mov x4, x0 - mov x5, x2 - mov x7, #0 - satd_4x4_sve - umov x6, v0.d0 - add x7, x7, x6 - add x0, x4, #4 - add x2, x5, #4 - satd_4x4_sve - umov x6, v0.d0 - add x7, x7, x6 -.rept 2 - sub x0, x0, #4 - sub x2, x2, #4 - mov x4, x0 - mov x5, x2 - satd_4x4_sve - umov x6, v0.d0 - add x7, x7, x6 - add x0, x4, #4 - add x2, x5, #4 - satd_4x4_sve - umov x6, v0.d0 - add x7, x7, x6 -.endr - mov x0, x7 - ret -endfunc - -.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7 - mov x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits - ld1b {z0.h}, p0/z, x0 - ld1b {z1.h}, p0/z, x0, x11 - ld1b {z2.h}, p0/z, x2 - ld1b {z3.h}, p0/z, x2, x11 - add x0, x0, x1 - add x2, x2, x3 - ld1b {z4.h}, p0/z, x0 - ld1b {z5.h}, p0/z, x0, x11 - ld1b {z6.h}, p0/z, x2 - ld1b {z7.h}, p0/z, x2, x11 - add x0, x0, x1 - add x2, x2, x3 - sub \v0\().h, z0.h, z2.h - sub \v4\().h, z1.h, z3.h - sub \v1\().h, z4.h, z6.h - sub \v5\().h, z5.h, z7.h - - ld1b {z0.h}, p0/z, x0 - ld1b {z1.h}, p0/z, x0, x11 - ld1b {z2.h}, p0/z, x2 - ld1b {z3.h}, p0/z, x2, x11 - add x0, x0, x1 - add x2, x2, x3 - ld1b {z4.h}, p0/z, x0 - ld1b {z5.h}, p0/z, x0, x11 - ld1b {z6.h}, p0/z, x2 - ld1b {z7.h}, p0/z, x2, x11 - add x0, x0, x1 - add x2, x2, x3 - sub \v2\().h, z0.h, z2.h - sub \v6\().h, z1.h, z3.h - sub \v3\().h, z4.h, z6.h - sub \v7\().h, z5.h, z7.h -.endm - -// one vertical hadamard pass and two horizontal -function PFX(satd_8x4v_8x8h_sve), export=0 - HADAMARD4_V z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h - HADAMARD4_V z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h - trn4 z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h - trn4 z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h - SUMSUB_ABCD z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h - SUMSUB_ABCD z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h - trn4 z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s - trn4 z4.s, 
z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s - ABS8_SVE z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0 - smax z0.h, p0/m, z0.h, z2.h - smax z1.h, p0/m, z1.h, z3.h - smax z4.h, p0/m, z4.h, z6.h - smax z5.h, p0/m, z5.h, z7.h - ret -endfunc - -function PFX(satd_16x4_sve), export=0 - LOAD_DIFF_16x4_sve z16, z17, z18, z19, z20, z21, z22, z23 - b PFX(satd_8x4v_8x8h_sve) -endfunc - -.macro pixel_satd_32x8_sve - mov x4, x0 - mov x5, x2 -.rept 2 - bl PFX(satd_16x4_sve) - add z30.h, z30.h, z0.h - add z31.h, z31.h, z1.h - add z30.h, z30.h, z4.h - add z31.h, z31.h, z5.h -.endr - add x0, x4, #16 - add x2, x5, #16 -.rept 2 - bl PFX(satd_16x4_sve) - add z30.h, z30.h, z0.h - add z31.h, z31.h, z1.h
View file
x265-4.2.tar/source/common/aarch64/pixel-util-sve2-bitperm.S
Added
@@ -0,0 +1,125 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: George Steed <george.steed@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve2+sve2-bitperm
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// int scanPosLast(
+//     const uint16_t *scan,      // x0
+//     const coeff_t *coeff,      // x1
+//     uint16_t *coeffSign,       // x2
+//     uint16_t *coeffFlag,       // x3
+//     uint8_t *coeffNum,         // x4
+//     int numSig,                // x5
+//     const uint16_t* scanCG4x4, // x6
+//     const int trSize)          // x7
+function PFX(scanPosLast_sve2_bitperm)
+    // Convert unit of trSize stride from elements (int16) to bytes.
+    add             x7, x7, x7
+
+    // Load scan table and convert to bytes.
+    ldp             q0, q1, [x6]
+    uzp1            v0.16b, v0.16b, v1.16b  // v0 - Zigzag scan table.
+
+    movrel          x10, g_SPL_and_mask
+    ldr             q28, [x10]              // v28 = mask for pmovmskb.
+    add             x10, x7, x7             // 2*x7
+    add             x11, x7, x7, lsl #1     // 3*x7
+    add             x9, x4, #1              // CG count
+
+1:
+    // Position of current CG.
+    ldrh            w6, [x0], #32
+    add             x6, x1, x6, lsl #1
+
+    // Loading current CG and saturate to bytes.
+    ldr             d2, [x6]
+    ldr             d3, [x6, x7]
+    ldr             d4, [x6, x10]
+    ldr             d5, [x6, x11]
+    mov             v2.d[1], v3.d[0]
+    mov             v4.d[1], v5.d[0]
+    sqxtn           v2.8b, v2.8h
+    sqxtn2          v2.16b, v4.8h
+
+    // Apply zigzag.
+    tbl             v3.16b, {v2.16b}, v0.16b
+
+    // Get zero/sign.
+    cmeq            v5.16b, v3.16b, #0      // v5 = zero
+    cmlt            v3.16b, v3.16b, #0      // v3 = negative
+
+    // val:  v3.h[0] = pmovmskb(v3).
+    // mask: v3.h[1] = pmovmskb(v4).
+    and             v3.16b, v3.16b, v28.16b
+    bic             v4.16b, v28.16b, v5.16b
+    addp            v3.16b, v3.16b, v4.16b
+    addp            v3.16b, v3.16b, v3.16b
+    addp            v3.16b, v3.16b, v3.16b
+    fmov            w15, s3
+
+    // coeffNum = addv(v3 != 0) = 16 - addv(v5).
+    addv            b5, v5.16b
+    smov            w6, v5.b[0]
+    add             w6, w6, #16
+    sub             x5, x5, x6
+    strb            w6, [x4], #1
+
+    // coeffFlag = reverse_bit(w15) in 16-bit.
+    rbit            w12, w15
+    strh            w12, [x3], #2
+
+    // Pack bits from z3.h[0] into z30.h[0], based on z3.h[1] mask.
+    mov             h31, v3.h[1]
+    bext            z30.h, z3.h, z31.h
+    str             h30, [x2], #2
+
+    cbnz            x5, 1b
+
+    // Count trailing zeros in (reversed) coeffFlag.
+    clz             w13, w15
+    lsr             w12, w12, w13
+    strh            w12, [x3, #-2]
+
+    // Get last pos.
+    sub             x9, x4, x9
+    eor             w13, w13, #15
+    add             x0, x13, x9, lsl #4
+    ret
+endfunc
+
+const g_SPL_and_mask, align=8
+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+endconst
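The one SVE2-bitperm instruction doing real work here is BEXT, which compresses the bits of its second operand selected by the mask in its third operand down to the least-significant end. Applied to the negative-coefficient mask (val) and the nonzero-coefficient mask, it yields the packed coeffSign word in a single instruction where the C version builds it bit by bit. A scalar model of that step (the function name is ours):

    #include <cstdint>

    // Scalar model of the SVE2 BEXT step above: compress the bits of `val`
    // that are selected by `mask` into the least-significant positions.
    static uint16_t bext16(uint16_t val, uint16_t mask)
    {
        uint16_t out = 0;
        for (int i = 0, j = 0; i < 16; i++)
        {
            if (mask & (1u << i))
            {
                if (val & (1u << i))
                    out |= (uint16_t)(1u << j);
                j++;
            }
        }
        return out;
    }
    // bext16(0b1010, 0b1110) == 0b101: val bits 1, 2, 3 packed into bits 0, 1, 2.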
View file
x265-4.1.tar/source/common/aarch64/pixel-util-sve2.S -> x265-4.2.tar/source/common/aarch64/pixel-util-sve2.S
Changed
@@ -36,200 +36,6 @@ .text -// uint64_t pixel_var(const pixel* pix, intptr_t i_stride) -function PFX(pixel_var_8x8_sve2) - ptrue p0.h, vl8 - ld1b {z0.h}, p0/z, x0 - add x0, x0, x1 - mul z31.h, z0.h, z0.h - uaddlp v1.4s, v31.8h -.rept 7 - ld1b {z4.h}, p0/z, x0 - add x0, x0, x1 - add z0.h, z0.h, z4.h - mul z31.h, z4.h, z4.h - uadalp z1.s, p0/m, z31.h -.endr - uaddlv s0, v0.8h - uaddlv d1, v1.4s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -endfunc - -function PFX(pixel_var_16x16_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_var_16x16 - pixel_var_start - mov w12, #16 -.Loop_var_16_sve2: - sub w12, w12, #1 - ld1 {v4.16b}, x0, x1 - pixel_var_1 v4 - cbnz w12, .Loop_var_16_sve2 - pixel_var_end - ret -.vl_gt_16_pixel_var_16x16: - ptrue p0.h, vl16 - mov z0.d, #0 -.rept 16 - ld1b {z4.h}, p0/z, x0 - add x0, x0, x1 - add z0.h, z0.h, z4.h - mul z30.h, z4.h, z4.h - uadalp z1.s, p0/m, z30.h -.endr - uaddv d0, p0, z0.h - uaddv d1, p0, z1.s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -endfunc - -function PFX(pixel_var_32x32_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_var_32x32 - pixel_var_start - mov w12, #32 -.Loop_var_32_sve2: - sub w12, w12, #1 - ld1 {v4.16b-v5.16b}, x0, x1 - pixel_var_1 v4 - pixel_var_1 v5 - cbnz w12, .Loop_var_32_sve2 - pixel_var_end - ret -.vl_gt_16_pixel_var_32x32: - cmp x9, #48 - bgt .vl_gt_48_pixel_var_32x32 - ptrue p0.b, vl32 - mov z0.d, #0 - mov z1.d, #0 -.rept 32 - ld1b {z4.b}, p0/z, x0 - add x0, x0, x1 - uaddwb z0.h, z0.h, z4.b - uaddwt z0.h, z0.h, z4.b - umullb z28.h, z4.b, z4.b - umullt z29.h, z4.b, z4.b - uadalp z1.s, p0/m, z28.h - uadalp z1.s, p0/m, z29.h -.endr - uaddv d0, p0, z0.h - uaddv d1, p0, z1.s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -.vl_gt_48_pixel_var_32x32: - ptrue p0.h, vl32 - mov z0.d, #0 - mov z1.d, #0 -.rept 32 - ld1b {z4.h}, p0/z, x0 - add x0, x0, x1 - add z0.h, z0.h, z4.h - mul z28.h, z4.h, z4.h - uadalp z1.s, p0/m, z28.h -.endr - uaddv d0, p0, z0.h - uaddv d1, p0, z1.s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -endfunc - -function PFX(pixel_var_64x64_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_var_64x64 - pixel_var_start - mov w12, #64 -.Loop_var_64_sve2: - sub w12, w12, #1 - ld1 {v4.16b-v7.16b}, x0, x1 - pixel_var_1 v4 - pixel_var_1 v5 - pixel_var_1 v6 - pixel_var_1 v7 - cbnz w12, .Loop_var_64_sve2 - pixel_var_end - ret -.vl_gt_16_pixel_var_64x64: - cmp x9, #48 - bgt .vl_gt_48_pixel_var_64x64 - ptrue p0.b, vl32 - mov z0.d, #0 - mov z2.d, #0 -.rept 64 - ld1b {z4.b}, p0/z, x0 - ld1b {z5.b}, p0/z, x0, #1, mul vl - add x0, x0, x1 - uaddwb z0.h, z0.h, z4.b - uaddwt z0.h, z0.h, z4.b - uaddwb z0.h, z0.h, z5.b - uaddwt z0.h, z0.h, z5.b - umullb z24.h, z4.b, z4.b - umullt z25.h, z4.b, z4.b - umullb z26.h, z5.b, z5.b - umullt z27.h, z5.b, z5.b - uadalp z2.s, p0/m, z24.h - uadalp z2.s, p0/m, z25.h - uadalp z2.s, p0/m, z26.h - uadalp z2.s, p0/m, z27.h -.endr - uaddv d0, p0, z0.h - uaddv d1, p0, z2.s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -.vl_gt_48_pixel_var_64x64: - cmp x9, #112 - bgt .vl_gt_112_pixel_var_64x64 - ptrue p0.b, vl64 - mov z0.d, #0 - mov z1.d, #0 -.rept 64 - ld1b {z4.b}, p0/z, x0 - add x0, x0, x1 - uaddwb z0.h, z0.h, z4.b - uaddwt z0.h, z0.h, z4.b - umullb z24.h, z4.b, z4.b - umullt z25.h, z4.b, z4.b - uadalp z2.s, p0/m, z24.h - uadalp z2.s, p0/m, z25.h -.endr - uaddv d0, p0, z0.h - uaddv d1, p0, z2.s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -.vl_gt_112_pixel_var_64x64: - ptrue p0.h, vl64 - mov z0.d, #0 - mov 
z1.d, #0 -.rept 64 - ld1b {z4.h}, p0/z, x0 - add x0, x0, x1 - add z0.h, z0.h, z4.h - mul z24.h, z4.h, z4.h - uadalp z1.s, p0/m, z24.h -.endr - uaddv d0, p0, z0.h - uaddv d1, p0, z1.s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 - ret -endfunc - function PFX(getResidual16_sve2) rdvl x9, #1
View file
x265-4.1.tar/source/common/aarch64/pixel-util.S -> x265-4.2.tar/source/common/aarch64/pixel-util.S
Changed
@@ -36,67 +36,6 @@ .text -// uint64_t pixel_var(const pixel* pix, intptr_t i_stride) -function PFX(pixel_var_8x8_neon) - ld1 {v4.8b}, x0, x1 // pixx - uxtl v0.8h, v4.8b // sum = pixx - umull v1.8h, v4.8b, v4.8b - uaddlp v1.4s, v1.8h // sqr = pixx * pixx - -.rept 7 - ld1 {v4.8b}, x0, x1 // pixx - umull v31.8h, v4.8b, v4.8b - uaddw v0.8h, v0.8h, v4.8b // sum += pixx - uadalp v1.4s, v31.8h // sqr += pixx * pixx -.endr - uaddlv s0, v0.8h - uaddlv d1, v1.4s - fmov w0, s0 - fmov x1, d1 - orr x0, x0, x1, lsl #32 // return sum + ((uint64_t)sqr << 32); - ret -endfunc - -function PFX(pixel_var_16x16_neon) - pixel_var_start - mov w12, #16 -.Loop_var_16: - sub w12, w12, #1 - ld1 {v4.16b}, x0, x1 - pixel_var_1 v4 - cbnz w12, .Loop_var_16 - pixel_var_end - ret -endfunc - -function PFX(pixel_var_32x32_neon) - pixel_var_start - mov w12, #32 -.Loop_var_32: - sub w12, w12, #1 - ld1 {v4.16b-v5.16b}, x0, x1 - pixel_var_1 v4 - pixel_var_1 v5 - cbnz w12, .Loop_var_32 - pixel_var_end - ret -endfunc - -function PFX(pixel_var_64x64_neon) - pixel_var_start - mov w12, #64 -.Loop_var_64: - sub w12, w12, #1 - ld1 {v4.16b-v7.16b}, x0, x1 - pixel_var_1 v4 - pixel_var_1 v5 - pixel_var_1 v6 - pixel_var_1 v7 - cbnz w12, .Loop_var_64 - pixel_var_end - ret -endfunc - // void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride) function PFX(getResidual4_neon) lsl x4, x3, #1 @@ -340,189 +279,6 @@ ret endfunc -// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); -function PFX(pixel_add_ps_4x4_neon) - lsl x5, x5, #1 -.rept 2 - ld1 {v0.8b}, x2, x4 - ld1 {v1.8b}, x2, x4 - ld1 {v2.4h}, x3, x5 - ld1 {v3.4h}, x3, x5 - uxtl v0.8h, v0.8b - uxtl v1.8h, v1.8b - add v4.8h, v0.8h, v2.8h - add v5.8h, v1.8h, v3.8h - sqxtun v4.8b, v4.8h - sqxtun v5.8b, v5.8h - st1 {v4.s}0, x0, x1 - st1 {v5.s}0, x0, x1 -.endr - ret -endfunc - -function PFX(pixel_add_ps_8x8_neon) - lsl x5, x5, #1 -.rept 4 - ld1 {v0.8b}, x2, x4 - ld1 {v1.8b}, x2, x4 - ld1 {v2.8h}, x3, x5 - ld1 {v3.8h}, x3, x5 - uxtl v0.8h, v0.8b - uxtl v1.8h, v1.8b - add v4.8h, v0.8h, v2.8h - add v5.8h, v1.8h, v3.8h - sqxtun v4.8b, v4.8h - sqxtun v5.8b, v5.8h - st1 {v4.8b}, x0, x1 - st1 {v5.8b}, x0, x1 -.endr - ret -endfunc - -.macro pixel_add_ps_16xN_neon h -function PFX(pixel_add_ps_16x\h\()_neon) - lsl x5, x5, #1 - mov w12, #\h / 8 -.Loop_add_ps_16x\h\(): - sub w12, w12, #1 -.rept 4 - ld1 {v0.16b}, x2, x4 - ld1 {v1.16b}, x2, x4 - ld1 {v16.8h-v17.8h}, x3, x5 - ld1 {v18.8h-v19.8h}, x3, x5 - uxtl v4.8h, v0.8b - uxtl2 v5.8h, v0.16b - uxtl v6.8h, v1.8b - uxtl2 v7.8h, v1.16b - add v24.8h, v4.8h, v16.8h - add v25.8h, v5.8h, v17.8h - add v26.8h, v6.8h, v18.8h - add v27.8h, v7.8h, v19.8h - sqxtun v4.8b, v24.8h - sqxtun2 v4.16b, v25.8h - sqxtun v5.8b, v26.8h - sqxtun2 v5.16b, v27.8h - st1 {v4.16b}, x0, x1 - st1 {v5.16b}, x0, x1 -.endr - cbnz w12, .Loop_add_ps_16x\h - ret -endfunc -.endm - -pixel_add_ps_16xN_neon 16 -pixel_add_ps_16xN_neon 32 - -.macro pixel_add_ps_32xN_neon h - function PFX(pixel_add_ps_32x\h\()_neon) - lsl x5, x5, #1 - mov w12, #\h / 4 -.Loop_add_ps_32x\h\(): - sub w12, w12, #1 -.rept 4 - ld1 {v0.16b-v1.16b}, x2, x4 - ld1 {v16.8h-v19.8h}, x3, x5 - uxtl v4.8h, v0.8b - uxtl2 v5.8h, v0.16b - uxtl v6.8h, v1.8b - uxtl2 v7.8h, v1.16b - add v24.8h, v4.8h, v16.8h - add v25.8h, v5.8h, v17.8h - add v26.8h, v6.8h, v18.8h - add v27.8h, v7.8h, v19.8h - sqxtun v4.8b, v24.8h - sqxtun2 v4.16b, v25.8h - sqxtun v5.8b, v26.8h - sqxtun2 v5.16b, v27.8h - st1 {v4.16b-v5.16b}, x0, x1 
-.endr - cbnz w12, .Loop_add_ps_32x\h - ret -endfunc -.endm - -pixel_add_ps_32xN_neon 32 -pixel_add_ps_32xN_neon 64 - -function PFX(pixel_add_ps_64x64_neon) - lsl x5, x5, #1 - sub x5, x5, #64 - mov w12, #32 -.Loop_add_ps_64x64: - sub w12, w12, #1 -.rept 2 - ld1 {v0.16b-v3.16b}, x2, x4 - ld1 {v16.8h-v19.8h}, x3, #64 - ld1 {v20.8h-v23.8h}, x3, x5 - uxtl v4.8h, v0.8b - uxtl2 v5.8h, v0.16b - uxtl v6.8h, v1.8b - uxtl2 v7.8h, v1.16b - uxtl v24.8h, v2.8b - uxtl2 v25.8h, v2.16b - uxtl v26.8h, v3.8b - uxtl2 v27.8h, v3.16b - add v0.8h, v4.8h, v16.8h - add v1.8h, v5.8h, v17.8h - add v2.8h, v6.8h, v18.8h - add v3.8h, v7.8h, v19.8h - add v4.8h, v24.8h, v20.8h - add v5.8h, v25.8h, v21.8h
View file
x265-4.1.tar/source/common/aarch64/sad-a.S -> x265-4.2.tar/source/common/aarch64/sad-a.S
Changed
@@ -3,7 +3,8 @@ * * Authors: Hongbin Liu <liuhongbin1@huawei.com> * Sebastian Pop <spop@amazon.com> - Hari Limaye <hari.limaye@arm.com> + * Hari Limaye <hari.limaye@arm.com> + * Gerda Zsejke More <gerdazsejke.more@arm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,6 +36,7 @@ .text +#if !HIGH_BIT_DEPTH .macro SAD_START_4 f ldr s0, x0 ldr s1, x2 @@ -653,3 +655,814 @@ const sad12_mask, align=8 .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0 endconst + +#else // HIGH_BIT_DEPTH + +// int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2) +.macro SAD_4 f + ld1 {v0.4h}, x0, x1 + ld1 {v1.4h}, x2, x3 + \f v16.4s, v0.4h, v1.4h +.endm + +.macro SAD_4xH h + SAD_4 uabdl +.rept \h - 1 + SAD_4 uabal +.endr + addv s0, v16.4s +.endm + +.macro SAD_8x2 f + ld1 {v0.8h}, x0, x1 + ld1 {v1.8h}, x2, x3 + \f v16.8h, v0.8h, v1.8h + + ld1 {v0.8h}, x0, x1 + ld1 {v1.8h}, x2, x3 + \f v17.8h, v0.8h, v1.8h +.endm + +.macro SAD_8xH h + SAD_8x2 uabd +.rept \h / 2 - 1 + SAD_8x2 uaba +.endr + uaddlp v16.4s, v16.8h + uadalp v16.4s, v17.8h + addv s0, v16.4s +.endm + +.macro SAD_FUNC w, h +function PFX(pixel_sad_\w\()x\h\()_neon) + // Stride is given in terms of pixel channel size, so double to get number of bytes. + add x1, x1, x1 + add x3, x3, x3 + + SAD_\w\()xH \h + + fmov w0, s0 + ret +endfunc +.endm + +.macro SAD_12 f + ldr q0, x0 + ldr q1, x2 + ldr d2, x0, #16 + ldr d3, x2, #16 + \f v16.8h, v0.8h, v1.8h + \f v17.4h, v2.4h, v3.4h + add x0, x0, x1 + add x2, x2, x3 +.endm + +.macro SAD_16 f + ld1 {v0.8h-v1.8h}, x0, x1 + ld1 {v2.8h-v3.8h}, x2, x3 + \f v16.8h, v0.8h, v2.8h + \f v17.8h, v1.8h, v3.8h +.endm + +.macro SAD_32 f + ld1 {v0.8h-v3.8h}, x0, x1 + ld1 {v4.8h-v7.8h}, x2, x3 + \f v16.8h, v0.8h, v4.8h + \f v17.8h, v1.8h, v5.8h + \f v18.8h, v2.8h, v6.8h + \f v19.8h, v3.8h, v7.8h +.endm + +.macro SAD_END_2_ACCUM + add v16.8h, v16.8h, v17.8h + uaddlv s0, v16.8h +.endm + +.macro SAD_END_2_ACCUM_WIDEN + uaddlp v16.4s, v16.8h + uadalp v16.4s, v17.8h + addv s0, v16.4s +.endm + +.macro SAD_END_4_ACCUM_WIDEN + add v16.8h, v16.8h, v17.8h + add v18.8h, v18.8h, v19.8h + uaddlp v16.4s, v16.8h + uadalp v16.4s, v18.8h + addv s0, v16.4s +.endm + +.macro SAD_FUNC_LOOP w, h end_type +function PFX(pixel_sad_\w\()x\h\()_neon) + // Stride is given in terms of pixel channel size, so double to get number of bytes. + add x1, x1, x1 + add x3, x3, x3 + + SAD_\w uabd + SAD_\w uaba + + mov w9, #(\h - 2)/2 + +.Loop_\w\()x\h: + sub w9, w9, #1 +.rept 2 + SAD_\w uaba +.endr + cbnz w9, .Loop_\w\()x\h + + SAD_\end_type + + fmov w0, s0 + ret +endfunc +.endm + +// SAD_<w>_WIDEN kernels widen into 32-bit accumulators. 
+.macro SAD_16_WIDEN f + ld1 {v0.8h-v1.8h}, x0, x1 + ld1 {v2.8h-v3.8h}, x2, x3 + uabd v18.8h, v0.8h, v2.8h + \f v16.4s, v18.8h + uabd v19.8h, v1.8h, v3.8h + \f v17.4s, v19.8h +.endm + +.macro SAD_24_WIDEN f + ld1 {v0.8h-v2.8h}, x0, x1 + ld1 {v3.8h-v5.8h}, x2, x3 + uabd v19.8h, v0.8h, v3.8h + \f v16.4s, v19.8h + uabd v20.8h, v1.8h, v4.8h + \f v17.4s, v20.8h + uabd v21.8h, v2.8h, v5.8h + \f v18.4s, v21.8h +.endm + +.macro SAD_32_WIDEN f + ld1 {v0.8h-v3.8h}, x0, x1 + ld1 {v4.8h-v7.8h}, x2, x3 + uabd v20.8h, v0.8h, v4.8h + \f v16.4s, v20.8h + uabd v21.8h, v1.8h, v5.8h + \f v17.4s, v21.8h + uabd v22.8h, v2.8h, v6.8h + \f v18.4s, v22.8h + uabd v23.8h, v3.8h, v7.8h + \f v19.4s, v23.8h +.endm + +.macro SAD_48_WIDEN f + ld1 {v0.8h-v3.8h}, x0 + ld1 {v4.8h-v7.8h}, x2 + uabd v20.8h, v0.8h, v4.8h + \f v16.4s, v20.8h + uabd v21.8h, v1.8h, v5.8h + \f v17.4s, v21.8h + uabd v22.8h, v2.8h, v6.8h + \f v18.4s, v22.8h + uabd v23.8h, v3.8h, v7.8h + \f v19.4s, v23.8h + + ldp q0, q1, x0, #64 + ldp q4, q5, x2, #64 + uabd v20.8h, v0.8h, v4.8h + uadalp v16.4s, v20.8h + uabd v21.8h, v1.8h, v5.8h + uadalp v17.4s, v21.8h + + add x0, x0, x1 + add x2, x2, x3 +.endm
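For reference, the behaviour these new high-bit-depth SAD kernels implement, in scalar form. Strides are given in pixels, which is why each kernel doubles x1 and x3 on entry; the function name below is ours:

    #include <cstdint>
    #include <cstdlib>

    // Scalar model of the HBD sad primitives: sum of absolute differences
    // over a w x h block of 16-bit pixels, strides in pixels.
    static int sadRef(const uint16_t *pix1, intptr_t stride1,
                      const uint16_t *pix2, intptr_t stride2, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2)
            for (int x = 0; x < w; x++)
                sum += std::abs(pix1[x] - pix2[x]);
        return sum;
    }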
View file
x265-4.2.tar/source/common/aarch64/ssd-a-sve.S
Added
@@ -0,0 +1,483 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.more@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" +#include "ssd-a-common.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text + +.arch armv8-a+sve + +#if HIGH_BIT_DEPTH +.macro SSE_PP_4x2 + ldr d16, x0 + ldr d17, x2 + ldr d18, x0, x1 + ldr d19, x2, x3 + uabd v2.4h, v16.4h, v17.4h + uabd v3.4h, v18.4h, v19.4h + udot z0.d, z2.h, z2.h + udot z0.d, z3.h, z3.h +.endm + +.macro SSE_PP_4xN h +function PFX(pixel_sse_pp_4x\h\()_sve) + movi v0.4s, #0 + add x1, x1, x1 + add x3, x3, x3 + +.rept (\h / 2) - 1 + SSE_PP_4x2 + add x0, x0, x1, lsl #1 + add x2, x2, x3, lsl #1 +.endr + SSE_PP_4x2 + + fmov w0, s0 + ret +endfunc +.endm + +SSE_PP_4xN 4 +SSE_PP_4xN 8 + +.macro SSE_PP_8xN h +function PFX(pixel_sse_pp_8x\h\()_sve) + movi v0.4s, #0 + movi v1.4s, #0 + add x1, x1, x1 + add x3, x3, x3 + +.rept \h / 2 + ld1 {v16.8h}, x0, x1 + ld1 {v17.8h}, x2, x3 + uabd v2.8h, v16.8h, v17.8h + udot z0.d, z2.h, z2.h + ld1 {v18.8h}, x0, x1 + ld1 {v19.8h}, x2, x3 + uabd v3.8h, v18.8h, v19.8h + udot z1.d, z3.h, z3.h +.endr + + add v0.2d, v0.2d, v1.2d + addp d0, v0.2d + fmov w0, s0 + ret +endfunc +.endm + +SSE_PP_8xN 8 +SSE_PP_8xN 16 + +.macro SSE_PP_16xN h +function PFX(pixel_sse_pp_16x\h\()_sve) + movi v0.4s, #0 + movi v1.4s, #0 + add x1, x1, x1 + add x3, x3, x3 + + mov w12, \h +.Loop_sse_pp_16x\h: + sub w12, w12, #1 + + ld1 {v16.8h-v17.8h}, x0, x1 + ld1 {v18.8h-v19.8h}, x2, x3 + uabd v2.8h, v16.8h, v18.8h + udot z0.d, z2.h, z2.h + uabd v3.8h, v17.8h, v19.8h + udot z1.d, z3.h, z3.h + cbnz w12, .Loop_sse_pp_16x\h + + add v0.2d, v0.2d, v1.2d + addp d0, v0.2d + fmov x0, d0 + ret +endfunc +.endm + +SSE_PP_16xN 16 +SSE_PP_16xN 32 + +.macro SSE_PP_32xN h +function PFX(pixel_sse_pp_32x\h\()_sve) + movi v0.4s, #0 + movi v1.4s, #0 + add x1, x1, x1 + add x3, x3, x3 + + mov w12, \h +.Loop_sse_pp_32x\h: + sub w12, w12, #1 + + ld1 {v16.8h-v17.8h}, x0 + ld1 {v20.8h-v21.8h}, x2 + uabd v2.8h, v16.8h, v20.8h + udot z0.d, z2.h, z2.h + uabd v3.8h, v17.8h, v21.8h + udot z1.d, z3.h, z3.h + + ldp q18, q19, x0, #32 + ldp q22, q23, x2, #32 + uabd v2.8h, v18.8h, v22.8h + udot z0.d, z2.h, z2.h + uabd v3.8h, v19.8h, v23.8h + udot z1.d, z3.h, z3.h + + add x0, x0, x1 + add x2, x2, x3 + cbnz w12, .Loop_sse_pp_32x\h + + add v0.2d, v0.2d, v1.2d + addp d0, v0.2d + fmov x0, d0 + ret +endfunc +.endm + +SSE_PP_32xN 32 +SSE_PP_32xN 64 + +function PFX(pixel_sse_pp_64x64_sve) + movi v0.4s, #0 + movi v1.4s, #0 
+ add x1, x1, x1 + add x3, x3, x3 + + mov w12, #64 +.Loop_sse_pp_64x1: + sub w12, w12, #1 + + ld1 {v16.8h-v17.8h}, x0 + ld1 {v20.8h-v21.8h}, x2 + uabd v2.8h, v16.8h, v20.8h + udot z0.d, z2.h, z2.h + uabd v3.8h, v17.8h, v21.8h + udot z1.d, z3.h, z3.h + + ldp q18, q19, x0, #32 + ldp q22, q23, x2, #32 + uabd v2.8h, v18.8h, v22.8h + udot z0.d, z2.h, z2.h + uabd v3.8h, v19.8h, v23.8h + udot z1.d, z3.h, z3.h + + ldp q16, q17, x0, #64 + ldp q20, q21, x2, #64 + uabd v2.8h, v16.8h, v20.8h + udot z0.d, z2.h, z2.h + uabd v3.8h, v17.8h, v21.8h + udot z1.d, z3.h, z3.h + + ldp q18, q19, x0, #96
View file
x265-4.1.tar/source/common/aarch64/ssd-a.S -> x265-4.2.tar/source/common/aarch64/ssd-a.S
Changed
@@ -3,6 +3,7 @@
  *
  * Authors: Sebastian Pop <spop@amazon.com>
  *          Hari Limaye <hari.limaye@arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more@arm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -35,6 +36,7 @@

 .text

+#if !HIGH_BIT_DEPTH
 // Fully unrolled.
 .macro SSE_PP_4xN h
 function PFX(pixel_sse_pp_4x\h\()_neon)
@@ -403,3 +405,561 @@
     add     v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
+
+function PFX(pixel_ssd_s_64x64_neon)
+    add     x1, x1, x1
+    sub     x1, x1, #64
+    sub     x3, x3, #64
+
+    mov     w12, #32
+    movi    v0.16b, #0
+    movi    v1.16b, #0
+.Loop_ssd_ss_64:
+    sub     w12, w12, #1
+.rept 2
+    ld1     {v16.16b-v19.16b}, [x0], #64
+    ld1     {v20.16b-v23.16b}, [x0], x1
+    smlal   v0.4s, v16.4h, v16.4h
+    smlal2  v1.4s, v16.8h, v16.8h
+    smlal   v0.4s, v17.4h, v17.4h
+    smlal2  v1.4s, v17.8h, v17.8h
+    smlal   v0.4s, v18.4h, v18.4h
+    smlal2  v1.4s, v18.8h, v18.8h
+    smlal   v0.4s, v19.4h, v19.4h
+    smlal2  v1.4s, v19.8h, v19.8h
+    smlal   v0.4s, v20.4h, v20.4h
+    smlal2  v1.4s, v20.8h, v20.8h
+    smlal   v0.4s, v21.4h, v21.4h
+    smlal2  v1.4s, v21.8h, v21.8h
+    smlal   v0.4s, v22.4h, v22.4h
+    smlal2  v1.4s, v22.8h, v22.8h
+    smlal   v0.4s, v23.4h, v23.4h
+    smlal2  v1.4s, v23.8h, v23.8h
+.endr
+    cbnz    w12, .Loop_ssd_ss_64
+    add     v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+#else // HIGH_BIT_DEPTH
+
+.macro SSE_PP_4x2
+    ldr     d16, [x0]
+    ldr     d17, [x2]
+    ldr     d18, [x0, x1]
+    ldr     d19, [x2, x3]
+    uabd    v2.4h, v16.4h, v17.4h
+    uabd    v3.4h, v18.4h, v19.4h
+    umlal   v0.4s, v2.4h, v2.4h
+    umlal   v0.4s, v3.4h, v3.4h
+.endm
+
+.macro SSE_PP_4xN h
+function PFX(pixel_sse_pp_4x\h\()_neon)
+    movi    v0.4s, #0
+    add     x1, x1, x1
+    add     x3, x3, x3
+
+.rept (\h / 2) - 1
+    SSE_PP_4x2
+    add     x0, x0, x1, lsl #1
+    add     x2, x2, x3, lsl #1
+.endr
+    SSE_PP_4x2
+
+    ret_v0_w0
+endfunc
+.endm
+
+SSE_PP_4xN 4
+SSE_PP_4xN 8
+
+.macro SSE_PP_8xN h
+function PFX(pixel_sse_pp_8x\h\()_neon)
+    movi    v0.4s, #0
+    movi    v1.4s, #0
+    add     x1, x1, x1
+    add     x3, x3, x3
+.rept \h
+    ld1     {v16.8h}, [x0], x1
+    ld1     {v17.8h}, [x2], x3
+    uabd    v2.8h, v16.8h, v17.8h
+    umlal   v0.4s, v2.4h, v2.4h
+    umlal2  v1.4s, v2.8h, v2.8h
+.endr
+    add     v0.4s, v0.4s, v1.4s
+
+    ret_v0_w0
+endfunc
+.endm
+
+SSE_PP_8xN 8
+SSE_PP_8xN 16
+
+.macro SSE_PP_16xN h
+function PFX(pixel_sse_pp_16x\h\()_neon)
+    movi    v0.4s, #0
+    movi    v1.4s, #0
+    add     x1, x1, x1
+    add     x3, x3, x3
+.rept \h
+    ld1     {v16.8h-v17.8h}, [x0], x1
+    ld1     {v18.8h-v19.8h}, [x2], x3
+    uabd    v2.8h, v16.8h, v18.8h
+    umlal   v0.4s, v2.4h, v2.4h
+    umlal2  v1.4s, v2.8h, v2.8h
+    uabd    v3.8h, v17.8h, v19.8h
+    umlal   v0.4s, v3.4h, v3.4h
+    umlal2  v1.4s, v3.8h, v3.8h
+.endr
+
+.if \h == 16
+    add     v0.4s, v0.4s, v1.4s
+    addv    s0, v0.4s
+    fmov    w0, s0
+.else
+    uaddlv  d0, v0.4s
+    uaddlv  d1, v1.4s
+    add     d0, d0, d1
+    fmov    x0, d0
+.endif
+
+    ret
+endfunc
+.endm
+
+SSE_PP_16xN 16
+SSE_PP_16xN 32
+
+.macro SSE_PP_32xN h
+function PFX(pixel_sse_pp_32x\h\()_neon)
+    movi    v0.4s, #0
+    movi    v1.4s, #0
+    add     x1, x1, x1
+    add     x3, x3, x3
+
+    mov     w12, \h
+.Loop_sse_pp_32x\h:
+    sub     w12, w12, #1
+
+    ld1     {v16.8h-v17.8h}, [x0]
+    ld1     {v20.8h-v21.8h}, [x2]
+    uabd    v2.8h, v16.8h, v20.8h
+    umlal   v0.4s, v2.4h, v2.4h
+    umlal2  v1.4s, v2.8h, v2.8h
+    uabd    v3.8h, v17.8h, v21.8h
+    umlal   v0.4s, v3.4h, v3.4h
+    umlal2  v1.4s, v3.8h, v3.8h
+
+    ldp     q18, q19, [x0, #32]
+    ldp     q22, q23, [x2, #32]
+    uabd    v2.8h, v18.8h, v22.8h
+    umlal   v0.4s, v2.4h, v2.4h
+    umlal2  v1.4s, v2.8h, v2.8h
+    uabd    v3.8h, v19.8h, v23.8h
+    umlal   v0.4s, v3.4h, v3.4h
+    umlal2  v1.4s, v3.8h, v3.8h
+
+    add     x0, x0, x1
+    add     x2, x2, x3
+    cbnz    w12, .Loop_sse_pp_32x\h
+
+    uaddlv  d0, v0.4s
+    uaddlv  d1, v1.4s
+    add     d0, d0, d1
+    fmov    x0, d0
+    ret
+endfunc
+.endm
+
+SSE_PP_32xN 32
+SSE_PP_32xN 64
+
+function PFX(pixel_sse_pp_64x64_neon)
+    mov     w12, #64
+
+    movi    v0.4s, #0
+    movi    v1.4s, #0
+    movi    v2.4s, #0
+    movi    v3.4s, #0
+
+    add     x1, x1, x1
+    add     x3, x3, x3
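For orientation while reviewing these kernels, a scalar sketch of what sse_pp computes (illustrative only; it uses element strides, whereas the assembly above doubles the strides to bytes for 16-bit pixels):

    #include <cstdint>

    // Sum of squared differences between two pixel blocks (scalar sketch).
    static uint64_t sse_pp_ref(const uint16_t* a, intptr_t strideA,
                               const uint16_t* b, intptr_t strideB,
                               int width, int height)
    {
        uint64_t sum = 0;
        for (int y = 0; y < height; y++, a += strideA, b += strideB)
            for (int x = 0; x < width; x++)
            {
                int d = (int)a[x] - (int)b[x];
                sum += (uint64_t)((int64_t)d * d);  // 64x64 @ 10-bit overflows 32 bits
            }
        return sum;
    }

The 64-bit accumulation mirrors the uaddlv/fmov x0 tail of the wider block sizes.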
View file
x265-4.1.tar/source/common/bitstream.cpp -> x265-4.2.tar/source/common/bitstream.cpp
Changed
@@ -118,7 +118,7 @@ X265_CHECK(code, "writing -1 code, will cause infinite loop\n"); unsigned long idx; - CLZ(idx, code); + BSR(idx, code); uint32_t length = (uint32_t)idx * 2 + 1; // Take care of cases where length > 32
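The rename reflects what the macro actually does: it returns the index of the highest set bit (x86 BSR semantics), from which the Exp-Golomb codeword length follows. A minimal sketch, assuming GCC/Clang builtins (not part of the patch):

    #include <cstdint>

    // Index of the most significant set bit; 'code' must be nonzero,
    // matching the X265_CHECK in the hunk above.
    static inline unsigned long bsr32(uint32_t code)
    {
        return 31 - (unsigned long)__builtin_clz(code);
    }

    // ue(v) codeword length as computed above: 2 * floor(log2(code)) + 1.
    static inline uint32_t ueCodeLength(uint32_t code)
    {
        return (uint32_t)bsr32(code) * 2 + 1;
    }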
View file
x265-4.1.tar/source/common/common.h -> x265-4.2.tar/source/common/common.h
Changed
@@ -39,6 +39,7 @@ #include <stdint.h> #include <memory.h> #include <assert.h> +#include <stdlib.h> #include "x265.h" @@ -150,6 +151,7 @@ #endif #define MAX_UINT 0xFFFFFFFFU // max. value of unsigned 32-bit integer +#define MAX_UINT64 0xFFFFFFFFFFFFFFFFULL // max. value of unsigned 64-bit integer #define MAX_INT 2147483647 // max. value of signed 32-bit integer #define MAX_INT64 0x7FFFFFFFFFFFFFFFLL // max. value of signed 64-bit integer #define MAX_DOUBLE 1.7e+308 // max. value of double-type value @@ -231,7 +233,7 @@ var = (type*)x265_malloc(sizeof(type) * (count)); \ if (!var) \ { \ - x265_log(NULL, X265_LOG_ERROR, "malloc of size %d failed\n", sizeof(type) * (count)); \ + x265_log(NULL, X265_LOG_ERROR, "malloc of size %llu failed\n", sizeof(type) * (count)); \ goto fail; \ } \ } @@ -242,7 +244,7 @@ memset((void*)var, 0, sizeof(type) * (count)); \ else \ { \ - x265_log(NULL, X265_LOG_ERROR, "malloc of size %d failed\n", sizeof(type) * (count)); \ + x265_log(NULL, X265_LOG_ERROR, "malloc of size %llu failed\n", sizeof(type) * (count)); \ goto fail; \ } \ } @@ -350,6 +352,10 @@ #define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8 +#define MAX_NUM_PUS_PER_CTU 593 // Maximum number of PUs in a 64x64 CTU +#define MAX_NUM_PU_SIZES 24 // Number of distinct PU sizes in a 64x64 CTU +#define MIN_TME_THREADS 32 // Recommended number of threads for ThreadedME + namespace X265_NS { enum { SAO_NUM_OFFSET = 4 };
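The format-string change matters because sizeof(type) * (count) is a size_t; a portable variant of the same checked-allocation pattern casts explicitly so %llu is well-defined on every ABI. A sketch of the pattern (illustrative, not the x265 macro; the goto target mirrors the x265 convention):

    #include <cstdio>
    #include <cstdlib>

    #define CHECKED_MALLOC_SKETCH(var, type, count)                        \
        {                                                                  \
            var = (type*)malloc(sizeof(type) * (count));                   \
            if (!var)                                                      \
            {                                                              \
                fprintf(stderr, "malloc of size %llu failed\n",            \
                        (unsigned long long)(sizeof(type) * (count)));     \
                goto fail; /* caller provides the fail: label */           \
            }                                                              \
        }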
View file
x265-4.1.tar/source/common/cpu.cpp -> x265-4.2.tar/source/common/cpu.cpp
Changed
@@ -62,7 +62,9 @@
 #endif // if X265_ARCH_ARM

 namespace X265_NS {
+#if X265_ARCH_X86
 static bool enable512 = false;
+#endif
 const cpu_name_t cpu_names[] =
 {
 #if X265_ARCH_X86
@@ -121,9 +123,16 @@
 #if defined(HAVE_NEON_I8MM)
     { "Neon_I8MM",    X265_CPU_NEON_I8MM },
 #endif
+#if defined(HAVE_SVE2_BITPERM)
+    { "SVE2_BitPerm", X265_CPU_SVE2_BITPERM },
+#endif

 #elif X265_ARCH_POWER8
     { "Altivec",      X265_CPU_ALTIVEC },

+#elif X265_ARCH_RISCV64
+    { "RVV",          X265_CPU_RVV },
+    { "Zbb",          X265_CPU_ZBB },
+
 #endif // if X265_ARCH_X86
     { "", 0 },
 };
@@ -352,7 +361,7 @@
 {
     int flags = 0;

-#if HAVE_ARMV6
+#if HAVE_ARMV6 && ENABLE_ASSEMBLY
     flags |= X265_CPU_ARMV6;

     // don't do this hack if compiled with -mfpu=neon
@@ -404,6 +413,21 @@
     return flags;
 }

+#elif X265_ARCH_RISCV64
+#include "riscv64/cpu.h"
+
+uint32_t cpu_detect(bool benableavx512)
+{
+    (void)benableavx512;
+    uint32_t flags = 0;
+
+#ifdef ENABLE_ASSEMBLY
+    flags = riscv64_cpu_detect();
+#endif
+
+    return flags;
+}
+
 #elif X265_ARCH_POWER8

 uint32_t cpu_detect(bool benableavx512)
View file
x265-4.1.tar/source/common/cudata.cpp -> x265-4.2.tar/source/common/cudata.cpp
Changed
@@ -1740,6 +1740,64 @@
     return count;
 }

+bool CUData::getMedianColMV(const CUData* colCU, const Frame* colPic, int list, int ref, MV& outMV) const
+{
+    int mvCount = 0;
+    int mvX[MAX_NUM_PARTITIONS], mvY[MAX_NUM_PARTITIONS];
+
+    for (uint32_t partIdx = 0; partIdx < colCU->m_numPartitions; partIdx++)
+    {
+        uint32_t absPartAddr = partIdx & TMVP_UNIT_MASK;
+        if (colCU->m_predMode[partIdx] == MODE_NONE || colCU->isIntra(absPartAddr))
+            continue;
+
+        int8_t refIdx = colCU->m_refIdx[list][partIdx];
+        if (refIdx < 0)
+            continue;
+
+        MV rawMv = colCU->m_mv[list][partIdx];
+
+        int colPOC = colPic->m_encData->m_slice->m_poc;
+        int colRefPOC = colPic->m_encData->m_slice->m_refPOCList[list][refIdx];
+
+        int curPOC = m_slice->m_poc;
+        int curRefPOC = this->m_slice->m_refPOCList[list][ref];
+
+        MV scaledMv = scaleMvByPOCDist(rawMv, curPOC, curRefPOC, colPOC, colRefPOC);
+
+        if (mvCount >= MAX_NUM_PARTITIONS)
+            break;
+
+        mvX[mvCount] = scaledMv.x;
+        mvY[mvCount] = scaledMv.y;
+        mvCount++;
+    }
+
+    if (mvCount == 0)
+        return false;
+
+    size_t mid = mvCount >> 1;
+
+    std::nth_element(mvX, mvX + mid, mvX + mvCount);
+    std::nth_element(mvY, mvY + mid, mvY + mvCount);
+
+    if (mvCount & 1)
+    {
+        outMV.x = mvX[mid];
+        outMV.y = mvY[mid];
+    }
+    else
+    {
+        int lowerMaxX = *std::max_element(mvX, mvX + mid);
+        int lowerMaxY = *std::max_element(mvY, mvY + mid);
+
+        outMV.x = (lowerMaxX + mvX[mid]) >> 1;
+        outMV.y = (lowerMaxY + mvY[mid]) >> 1;
+    }
+
+    return true;
+}
+
 // Create the PMV list. Called for each reference index.
 #if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
 int CUData::getPMV(InterNeighbourMV* neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx, uint32_t absPartIdx) const
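The even-count branch above is a standard trick: after nth_element places the upper-middle value at mid, the lower-middle value is the maximum of the left partition, so no full sort is needed. A self-contained sketch of the same median (illustrative names, not part of the patch):

    #include <algorithm>

    // Median of v[0..n), averaging the two middle values when n is even.
    // v is reordered in place, matching the nth_element use above.
    static int medianReordering(int* v, int n)   // requires n > 0
    {
        int mid = n >> 1;
        std::nth_element(v, v + mid, v + n);     // v[mid] = upper-middle value
        if (n & 1)
            return v[mid];
        int lowerMax = *std::max_element(v, v + mid);  // lower-middle value
        return (lowerMax + v[mid]) >> 1;
    }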
View file
x265-4.1.tar/source/common/cudata.h -> x265-4.2.tar/source/common/cudata.h
Changed
@@ -332,6 +332,8 @@
     const CUData* getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
     const CUData* getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;

+    bool getMedianColMV(const CUData* colCU, const Frame* colPic, int list, int ref, MV& mv) const;
+
 #if ENABLE_SCC_EXT
     void initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv[2] = 0);
View file
x265-4.1.tar/source/common/dct.cpp -> x265-4.2.tar/source/common/dct.cpp
Changed
@@ -30,7 +30,7 @@ #include "common.h" #include "primitives.h" #include "contexts.h" // costCoeffNxN_c -#include "threading.h" // CLZ +#include "threading.h" // BSR using namespace X265_NS; @@ -911,7 +911,7 @@ { { unsigned long cidx; - CLZ(cidx, codeNumber + 1); + BSR(cidx, codeNumber + 1); length = cidx; } X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
View file
x265-4.1.tar/source/common/frame.cpp -> x265-4.2.tar/source/common/frame.cpp
Changed
@@ -36,6 +36,7 @@
     m_reconRowFlag = NULL;
     m_reconColCount = NULL;
     m_countRefEncoders = 0;
+    m_ctuMEFlags = NULL;
     m_encData = NULL;
     for (int i = 0; i < NUM_RECON_VERSION; i++)
         m_reconPic[i] = NULL;
@@ -179,9 +180,10 @@
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
         m_numRows = (m_fencPic->m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
+        int32_t m_numCols = (m_fencPic->m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
         m_reconRowFlag = new ThreadSafeInteger[m_numRows];
         m_reconColCount = new ThreadSafeInteger[m_numRows];
-
+        m_ctuMEFlags = new ThreadSafeInteger[m_numRows * m_numCols];
     if (quantOffsets)
     {
         int32_t cuCount = (param->rc.qgSize == 8) ? m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes :
@@ -358,6 +360,12 @@
         m_reconColCount = NULL;
     }

+    if (m_ctuMEFlags)
+    {
+        delete[] m_ctuMEFlags;
+        m_ctuMEFlags = NULL;
+    }
+
     if (m_quantOffsets)
     {
         delete[] m_quantOffsets;
View file
x265-4.1.tar/source/common/frame.h -> x265-4.2.tar/source/common/frame.h
Changed
@@ -113,6 +113,7 @@
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger*      m_reconRowFlag;   // flag of CTU rows completely reconstructed and extended for motion reference
     ThreadSafeInteger*      m_reconColCount;  // count of CTU cols completely reconstructed and extended for motion reference
+    ThreadSafeInteger*      m_ctuMEFlags;     // flag to indicate whether threaded ME has completed processing for each CTU
     int32_t                 m_numRows;
     volatile uint32_t       m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
View file
x265-4.1.tar/source/common/framedata.cpp -> x265-4.2.tar/source/common/framedata.cpp
Changed
@@ -23,6 +23,8 @@

 #include "framedata.h"
 #include "picyuv.h"
+#include "search.h"
+#include "threadedme.h"

 using namespace X265_NS;

@@ -35,6 +37,13 @@
 {
     m_param = &param;
     m_slice = new Slice;
+    if (m_param->bThreadedME)
+    {
+        uint32_t numCUs = sps.numCuInWidth * sps.numCuInHeight;
+        uint32_t totalPUs = numCUs * MAX_NUM_PUS_PER_CTU;
+        m_slice->m_ctuMV = X265_MALLOC(MEData, totalPUs);
+    }
+
     m_picCTU = new CUData[sps.numCUsInFrame];
     m_picCsp = csp;
     m_spsrpsIdx = -1;
@@ -92,6 +101,8 @@
 void FrameData::destroy()
 {
     delete[] m_picCTU;
+
+    X265_FREE(m_slice->m_ctuMV);
     delete m_slice;

     delete m_saoParam;
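To put the allocation above in scale, some illustrative arithmetic (not from the patch): a 1920x1080 frame with 64x64 CTUs has 30 * 17 = 510 CTUs, so the threaded-ME buffer holds 510 * 593 MEData entries.

    // Illustrative sizing check for a 1080p frame with 64x64 CTUs:
    static_assert(((1920 + 63) / 64) * ((1080 + 63) / 64) == 510, "CTUs per 1080p frame");
    static_assert(510 * 593 == 302430, "MEData entries with MAX_NUM_PUS_PER_CTU = 593");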
View file
x265-4.1.tar/source/common/loopfilter.cpp -> x265-4.2.tar/source/common/loopfilter.cpp
Changed
@@ -28,6 +28,8 @@

 #define PIXEL_MIN 0

+using namespace X265_NS;
+
 namespace {

 static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
@@ -152,26 +154,94 @@
     }
 }

-/* Deblocking of one line/column for the chrominance component
-* \param src     pointer to picture data
-* \param offset  offset value for picture data
-* \param tc      tc value
-* \param maskP   indicator to disable filtering on partP
-* \param maskQ   indicator to disable filtering on partQ */
-static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
+void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
+                         int32_t maskP, int32_t maskQ)
 {
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
-        src[-offset] = x265_clip(m3 + (delta & maskP));
-        src[0] = x265_clip(m4 - (delta & maskQ));
-    }
+    X265_CHECK(offset == 1, "Offset value must be 1 for Chroma Vertical\n");
+
+    (void)offset;
+
+    int16_t m2 = (int16_t)src[0 * srcStep - 2];
+    int16_t m3 = (int16_t)src[0 * srcStep - 1];
+    int16_t m4 = (int16_t)src[0 * srcStep + 0];
+    int16_t m5 = (int16_t)src[0 * srcStep + 1];
+
+    int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[0 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+    src[0 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[1 * srcStep - 2];
+    m3 = (int16_t)src[1 * srcStep - 1];
+    m4 = (int16_t)src[1 * srcStep + 0];
+    m5 = (int16_t)src[1 * srcStep + 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[1 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+    src[1 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[2 * srcStep - 2];
+    m3 = (int16_t)src[2 * srcStep - 1];
+    m4 = (int16_t)src[2 * srcStep + 0];
+    m5 = (int16_t)src[2 * srcStep + 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[2 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+    src[2 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[3 * srcStep - 2];
+    m3 = (int16_t)src[3 * srcStep - 1];
+    m4 = (int16_t)src[3 * srcStep + 0];
+    m5 = (int16_t)src[3 * srcStep + 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[3 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+    src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
 }
+
+void pelFilterChroma_H_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
+                         int32_t maskP, int32_t maskQ)
+{
+    X265_CHECK(srcStep == 1, "srcStep value must be 1 for Chroma Horizontal\n");
+
+    (void)srcStep;
+
+    int16_t m2 = (int16_t)src[0 - offset * 2];
+    int16_t m3 = (int16_t)src[0 - offset * 1];
+    int16_t m4 = (int16_t)src[0 + offset * 0];
+    int16_t m5 = (int16_t)src[0 + offset * 1];
+
+    int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[0 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[0 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[1 - offset * 2];
+    m3 = (int16_t)src[1 - offset * 1];
+    m4 = (int16_t)src[1 + offset * 0];
+    m5 = (int16_t)src[1 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[1 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[1 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[2 - offset * 2];
+    m3 = (int16_t)src[2 - offset * 1];
+    m4 = (int16_t)src[2 + offset * 0];
+    m5 = (int16_t)src[2 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[2 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[2 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[3 - offset * 2];
+    m3 = (int16_t)src[3 - offset * 1];
+    m4 = (int16_t)src[3 + offset * 0];
+    m5 = (int16_t)src[3 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[3 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[3 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+}
+
 }

 namespace X265_NS {

@@ -190,7 +260,7 @@
     // C code is same for EDGE_VER and EDGE_HOR only asm code is different
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
     p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
-    p.pelFilterChroma[0] = pelFilterChroma_c;
-    p.pelFilterChroma[1] = pelFilterChroma_c;
+    p.pelFilterChroma[0] = pelFilterChroma_V_c;
+    p.pelFilterChroma[1] = pelFilterChroma_H_c;
 }
 }
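Both new specializations unroll the same per-sample HEVC chroma deblocking filter that the removed generic loop applied; for review, the per-sample operation in compact form (a sketch, reusing the file's own pixel/x265_clip helpers):

    // One chroma deblock step: p0/q0 straddle the edge at distance 'off'.
    static inline void chromaDeblockOne(pixel* s, intptr_t off, int32_t tc,
                                        int32_t maskP, int32_t maskQ)
    {
        int16_t m2 = (int16_t)s[-2 * off], m3 = (int16_t)s[-off];
        int16_t m4 = (int16_t)s[0],        m5 = (int16_t)s[off];
        int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
        s[-off] = x265_clip(m3 + (delta & maskP));  // adjust p0 unless masked off
        s[0]    = x265_clip(m4 - (delta & maskQ));  // adjust q0 unless masked off
    }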
View file
x265-4.1.tar/source/common/lowres.cpp -> x265-4.2.tar/source/common/lowres.cpp
Changed
@@ -415,4 +415,11 @@ quarterSampleLowResStrideY, widthFullRes / 4, heightFullRes / 4); } + if (origPic->m_param->bAQMotion && !origPic->m_param->rc.aqMode && !origPic->m_param->rc.cuTree && !origPic->m_param->rc.hevcAq) + { + int cuCount = maxBlocksInRow * maxBlocksInCol; + int cuCountFullRes = (origPic->m_param->rc.qgSize > 8) ? cuCount : cuCount << 2; + memset(qpAqOffset, 0, sizeof(double) * cuCountFullRes); + memset(qpAqMotionOffset, 0, sizeof(double) * cuCountFullRes); + } }
View file
x265-4.1.tar/source/common/param.cpp -> x265-4.2.tar/source/common/param.cpp
Changed
@@ -423,6 +423,10 @@
     param->searchRangeForLayer1 = 3;
     param->searchRangeForLayer2 = 3;

+    /* Threaded ME */
+    param->tmeTaskBlockSize = 1;
+    param->tmeNumBufferRows = 10;
+
     /*Alpha Channel Encoding*/
     param->bEnableAlpha = 0;
     param->numScalableLayers = 1;
@@ -641,6 +645,7 @@
     }
     if (tune)
     {
+        param->tune = tune;
         if (!strcmp(tune, "psnr"))
         {
             param->rc.aqStrength = 0.0;
@@ -904,12 +909,16 @@
     bool bValueWasNull = !value;
     bool bExtraParams = false;
     char nameBuf[64];
+#ifdef SVT_HEVC
     static int count;
+#endif

     if (!name)
         return X265_PARAM_BAD_NAME;
+#ifdef SVT_HEVC
     count++;
+#endif

     // skip -- prefix if provided
     if (name[0] == '-' && name[1] == '-')
         name += 2;
@@ -1518,6 +1527,7 @@
     }
 #endif
     OPT("frame-rc") p->bConfigRCFrame = atobool(value);
+    OPT("threaded-me") p->bThreadedME = atobool(value);
     else
         return X265_PARAM_BAD_NAME;
 }
@@ -1667,6 +1677,8 @@
 {
 #define CHECK(expr, msg) check_failed |= _confirm(param, expr, msg)
     int check_failed = 0; /* abort if there is a fatal configuration problem */
+    CHECK((uint64_t)param->sourceWidth * param->sourceHeight > 142606336ULL && !param->bAllowNonConformance,
+          "Input video resolution exceeds the maximum supported luma samples 142,606,336 (16384x8704) of Level 7.2.");
     CHECK(param->uhdBluray == 1 && (X265_DEPTH != 10 || param->internalCsp != 1 || param->interlaceMode != 0),
           "uhd-bd: bit depth, chroma subsample, source picture type must be 10, 4:2:0, progressive");
     CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16,
@@ -1803,10 +1815,10 @@
           " smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, bt1361e,"
           " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428 or arib-std-b67");
     CHECK(param->vui.matrixCoeffs < 0
-          || param->vui.matrixCoeffs > 14
+          || param->vui.matrixCoeffs > 15
           || param->vui.matrixCoeffs == 3,
           "Matrix Coefficients must be unknown, bt709, fcc, bt470bg, smpte170m,"
-          " smpte240m, gbr, ycgco, bt2020nc, bt2020c, smpte-st-2085, chroma-nc, chroma-c or ictcp");
+          " smpte240m, gbr, ycgco, bt2020nc, bt2020c, smpte-st-2085, chroma-nc, chroma-c, ictcp or ipt-pq-c2");
     CHECK(param->vui.chromaSampleLocTypeTopField < 0
           || param->vui.chromaSampleLocTypeTopField > 5,
           "Chroma Sample Location Type Top Field must be 0-5");
@@ -1855,6 +1867,11 @@
           "Valid final VBV buffer emptiness must be a fraction 0 - 1, or size in kbits");
     CHECK(param->vbvEndFrameAdjust < 0,
           "Valid vbv-end-fr-adj must be a fraction 0 - 1");
+    if ((param->rc.vbvBufferSize > 0 || param->rc.vbvMaxBitrate > 0) && param->bThreadedME)
+    {
+        param->bThreadedME = 0;
+        x265_log(param, X265_LOG_WARNING, "VBV and threaded-me both enabled. Disabling threaded-me\n");
+    }
     CHECK(param->minVbvFullness < 0 && param->minVbvFullness > 100,
           "min-vbv-fullness must be a fraction 0 - 100");
     CHECK(param->maxVbvFullness < 0 && param->maxVbvFullness > 100,
@@ -1870,7 +1887,7 @@
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
           "Constant QP is incompatible with 2pass");
     CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0),
-          "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
+          "Strict-cbr cannot be applied without specifying both target bitrate and vbv bufsize");
     CHECK(strlen(param->analysisSave) && (param->analysisSaveReuseLevel < 0 || param->analysisSaveReuseLevel > 10),
           "Invalid analysis save refine level. Value must be between 1 and 10 (inclusive)");
     CHECK(strlen(param->analysisLoad) && (param->analysisLoadReuseLevel < 0 || param->analysisLoadReuseLevel > 10),
@@ -1948,7 +1965,7 @@
         CHECK(param->hmeRange[level] < 0 || param->hmeRange[level] >= 32768,
               "Search Range for HME levels must be between 0 and 32768");
     }
-#if !X86_64
+#if !X86_64 && !X265_ARCH_ARM64 && !X265_ARCH_RISCV64
     CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
           "SEA motion search does not support resolutions greater than 480p in 32 bit build");
 #endif
@@ -1971,11 +1988,6 @@
         param->bSingleSeiNal = 0;
         x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
     }
-    if (param->bEnableTemporalFilter && (param->frameNumThreads != 1))
-    {
-        param->bEnableTemporalFilter = 0;
-        x265_log(param, X265_LOG_WARNING, "MCSTF can be enabled with frame thread = 1 only. Disabling MCSTF\n");
-    }
     CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
     CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
     CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
@@ -2024,6 +2036,19 @@
         if (checkValid)
             param->bEnableSCC = 0;
     }
+    if (!!param->bEnableSCC)
+    {
+        if (param->bEnableRdRefine && param->bDynamicRefine)
+        {
+            param->bEnableRdRefine = 0;
+            x265_log(param, X265_LOG_WARNING, "Disabling rd-refine as it can not be used with scc and dynamic-refine\n");
+        }
+        if (param->bEnableRdRefine && param->interRefine > 0)
+        {
+            param->bEnableRdRefine = 0;
+            x265_log(param, X265_LOG_WARNING, "Disabling rd-refine as it can not be used with scc and inter-refine\n");
+        }
+    }
     CHECK(!!param->bEnableSCC && param->rdLevel != 6, "Enabling scc extension in x265 requires rdlevel of 6 ");
 #endif
@@ -2073,6 +2098,9 @@
     x265_log(param, X265_LOG_INFO, "Coding QT: max CU size, min CU size : %d / %d\n",
              param->maxCUSize, param->minCUSize);

+    if (param->bThreadedME)
+        x265_log(param, X265_LOG_INFO, "ThreadedME: task block / buf rows    : %d / %d\n", param->tmeTaskBlockSize, param->tmeNumBufferRows);
+
     x265_log(param, X265_LOG_INFO, "Residual QT: max TU size, max depth : %d / %d inter / %d intra\n",
              param->maxTUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth);
@@ -2200,6 +2228,8 @@
 #if ENABLE_HDR10_PLUS
         TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
 #endif
+        if (param->bEnableTemporalFilter)
+            TOOLOPT(param->bEnableTemporalFilter, "mcstf");
         x265_log(param, X265_LOG_INFO, "tools:%s\n", buf);
         fflush(stderr);
     }
@@ -2702,6 +2732,7 @@
     if (strlen(src->numaPools)) snprintf(dst->numaPools, X265_MAX_STRING_SIZE, "%s", src->numaPools);
     else dst->numaPools[0] = 0;
+    dst->tune = src->tune;
     dst->bEnableWavefront = src->bEnableWavefront;
     dst->bDistributeModeAnalysis = src->bDistributeModeAnalysis;
     dst->bDistributeMotionEstimation = src->bDistributeMotionEstimation;
@@ -2976,6 +3007,9 @@
     dst->bEnableHRDConcatFlag = src->bEnableHRDConcatFlag;
     dst->dolbyProfile = src->dolbyProfile;
     dst->bEnableSvtHevc = src->bEnableSvtHevc;
+    dst->bThreadedME = src->bThreadedME;
+    dst->tmeTaskBlockSize = src->tmeTaskBlockSize;
+    dst->tmeNumBufferRows = src->tmeNumBufferRows;
     dst->bEnableFades = src->bEnableFades;
     dst->bEnableSceneCutAwareQp = src->bEnableSceneCutAwareQp;
     dst->fwdMaxScenecutWindow = src->fwdMaxScenecutWindow;
@@ -3017,11 +3051,9 @@
         memcpy(dst->svtHevcParam, src->svtHevcParam, sizeof(EB_H265_ENC_CONFIGURATION));
 #endif
     /* Film grain */
-    if (src->filmGrain)
-        dst->filmGrain = src->filmGrain;
+    dst->filmGrain = src->filmGrain;
     /* Aom Film grain*/
-    if (src->aomFilmGrain)
-        dst->aomFilmGrain = src->aomFilmGrain;
+    dst->aomFilmGrain = src->aomFilmGrain;
     dst->bEnableSBRC = src->bEnableSBRC;
     dst->bConfigRCFrame = src->bConfigRCFrame;
     dst->isAbrLadderEnable = src->isAbrLadderEnable;
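The new resolution cap corresponds to the Level 7.2 maximum luma picture size; the constant checks out as 16384 * 8704 (illustrative verification, not part of the patch):

    static_assert(16384ULL * 8704ULL == 142606336ULL,
                  "Level 7.2 max luma samples used in the CHECK above");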
View file
x265-4.1.tar/source/common/picyuv.cpp -> x265-4.2.tar/source/common/picyuv.cpp
Changed
@@ -122,6 +122,7 @@
     return true;

 fail:
+    this->destroy();
     return false;
 }

@@ -255,9 +256,14 @@

 void PicYuv::destroy()
 {
-    X265_FREE(m_picBuf[0]);
-    X265_FREE(m_picBuf[1]);
-    X265_FREE(m_picBuf[2]);
+    for (int i = 0; i < MAX_NUM_COMPONENT; i++)
+    {
+        if (m_picBuf[i])
+        {
+            x265_free(m_picBuf[i]);
+            m_picBuf[i] = NULL;
+        }
+    }
 }

 /* Copy pixels from an x265_picture into internal PicYuv instance.
View file
x265-4.1.tar/source/common/primitives.cpp -> x265-4.2.tar/source/common/primitives.cpp
Changed
@@ -91,7 +91,9 @@
     /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
     for (int i = 0; i < NUM_CU_SIZES; i++)
     {
+#if !defined(X265_ARCH_ARM64) && !defined(X265_ARCH_RISCV64)
         p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;
+#endif

         p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
         p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
@@ -258,7 +260,7 @@
         primitives.cu[i].intra_pred_allangs = NULL;

 #if ENABLE_ASSEMBLY
-#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64) || defined(X265_ARCH_RISCV64)
     setupIntrinsicPrimitives(primitives, param->cpuid);
 #endif
     setupAssemblyPrimitives(primitives, param->cpuid);
View file
x265-4.1.tar/source/common/quant.cpp -> x265-4.2.tar/source/common/quant.cpp
Changed
@@ -80,7 +80,7 @@
         // NOTE: mapping to x86 hardware instruction BSR
         unsigned long size;
-        CLZ(size, absLevel);
+        BSR(size, absLevel);
         int egs = size * 2 + 1;
         rate += egs << 15;

@@ -164,7 +164,7 @@
     if (symbol)
     {
         unsigned long idx;
-        CLZ(idx, symbol + 1);
+        BSR(idx, symbol + 1);
         length = idx;
     }

@@ -293,10 +293,10 @@
         int firstNZPosInCG0 = n;
 #endif

-        CLZ(tmp, coeffFlag[cg]);
+        BSR(tmp, coeffFlag[cg]);
         const int firstNZPosInCG = (15 ^ tmp);

-        CTZ(tmp, coeffFlag[cg]);
+        BSF(tmp, coeffFlag[cg]);
         const int lastNZPosInCG = (15 ^ tmp);

         X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n");
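The 15 ^ tmp trick above works because coeffFlag[cg] keeps one bit per scan position with scan position 0 stored in bit 15: BSR (highest set bit) therefore yields the first nonzero coefficient in scan order and BSF (lowest set bit) the last. A sketch with GCC/Clang builtins (illustrative, not the x265 macros):

    #include <cstdint>

    static inline int firstNZPos(uint16_t coeffFlag)  // coeffFlag != 0
    {
        int bsr = 31 - __builtin_clz(coeffFlag);      // highest set bit index
        return 15 ^ bsr;
    }

    static inline int lastNZPos(uint16_t coeffFlag)   // coeffFlag != 0
    {
        int bsf = __builtin_ctz(coeffFlag);           // lowest set bit index
        return 15 ^ bsf;
    }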
View file
x265-4.2.tar/source/common/riscv64
Added
+(directory)
View file
x265-4.2.tar/source/common/riscv64/asm-primitives.cpp
Added
@@ -0,0 +1,495 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+extern "C" {
+#include "fun-decls.h"
+}
+
+#define ALL_LUMA_TU_TYPED_L(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _2_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _3_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _4_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _5_ ## cpu);
+#define ALL_LUMA_TU_L(prim, fname, cpu) ALL_LUMA_TU_TYPED_L(prim, , fname, cpu)
+
+#define ALL_LUMA_TU_TYPED_S(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32_ ## cpu);
+#define ALL_LUMA_TU_S(prim, fname, cpu) ALL_LUMA_TU_TYPED_S(prim, , fname, cpu)
+
+#define ALL_LUMA_BLOCKS_TYPED_S(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64_ ## cpu);
+#define ALL_LUMA_BLOCKS_S(prim, fname, cpu) ALL_LUMA_BLOCKS_TYPED_S(prim, , fname, cpu)
+
+#define ALL_LUMA_BLOCKS_TYPED(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu);
+#define ALL_LUMA_BLOCKS(prim, fname, cpu) ALL_LUMA_BLOCKS_TYPED(prim, , fname, cpu)
+
+#define ALL_LUMA_BLOCKS_TYPED_B(prim, fncdef, fname, cpu) \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu);
+#define ALL_LUMA_BLOCKS_B(prim, fname, cpu) ALL_LUMA_BLOCKS_TYPED_B(prim, , fname, cpu)
+
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.pu[LUMA_4x4].prim   = fncdef PFX(fname##_4x4_##cpu); \
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname##_8x8_##cpu); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname##_16x16_##cpu); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname##_32x32_##cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname##_64x64_##cpu); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname##_8x4_##cpu); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname##_4x8_##cpu); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname##_16x8_##cpu); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname##_8x16_##cpu); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname##_16x32_##cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname##_32x16_##cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname##_64x32_##cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname##_32x64_##cpu); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname##_16x12_##cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname##_12x16_##cpu); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname##_16x4_##cpu); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname##_4x16_##cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname##_32x24_##cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname##_24x32_##cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname##_32x8_##cpu); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname##_8x32_##cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname##_64x48_##cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname##_48x64_##cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname##_64x16_##cpu); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname##_16x64_##cpu)
+#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_420_CU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
+#define ALL_CHROMA_420_CU(prim, fname, cpu) ALL_CHROMA_420_CU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_420_CU_TYPED_B(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
+#define ALL_CHROMA_420_CU_B(prim, fname, cpu) ALL_CHROMA_420_CU_TYPED_B(prim, , fname, cpu)
+
+#define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu)
+#define ALL_CHROMA_422_CU(prim, fname, cpu) ALL_CHROMA_422_CU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_422_CU_TYPED_B(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu)
+#define ALL_CHROMA_422_CU_B(prim, fname, cpu) ALL_CHROMA_422_CU_TYPED_B(prim, , fname, cpu)
+
+#define ALL_CHROMA_420_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim   = fncdef PFX(fname ## _4x2_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim   = fncdef PFX(fname ## _6x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu)
+#define ALL_CHROMA_420_PU(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_422_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim  = fncdef PFX(fname ## _8x12_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim  = fncdef PFX(fname ## _6x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim  = fncdef PFX(fname ## _4x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim  = fncdef PFX(fname ## _8x64_ ## cpu)
+#define ALL_CHROMA_422_PU(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
View file
x265-4.2.tar/source/common/riscv64/asm.S
Added
@@ -0,0 +1,340 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef ASM_S_ // #include guards +#define ASM_S_ + +.option arch, +v + +#define PFX3(prefix, name) prefix ## _ ## name +#define PFX2(prefix, name) PFX3(prefix, name) +#define PFX(name) PFX2(X265_NS, name) + +#if defined __clang__ +#define EXTERN_ASM +#define HAVE_AS_FUNC 0 +#define PREFIX 1 +#else +#define EXTERN_ASM +#define HAVE_AS_FUNC 1 +#endif + +#ifdef __ELF__ +#define ELF +#else +#ifdef PREFIX +#define ELF # +#else +#define ELF @ +#endif +#endif + +#if HAVE_AS_FUNC +#define FUNC +#else +#ifdef PREFIX +#define FUNC # +#else +#define FUNC @ +#endif +#endif + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) + +#define PFX_C(name) JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name) + +#define FENC_STRIDE 64 +#define FDNC_STRIDE 128 + +// Alignment of stack arguments of size less than 8 bytes. +#define STACK_ARG_ALIGNMENT 8 + +// Get offset from SP of stack argument at index `idx`. +#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT) + +.macro function name, export=1 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm + .align 2 +.if \export == 1 + .global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name +EXTERN_ASM\name: +.else +ELF .hidden \name +ELF .type \name, %function +FUNC .func \name +\name: +.endif +.endm + +.macro const name, align=2 + .macro endconst +ELF .size \name, . 
- \name + .purgem endconst + .endm +#ifdef __MACH__ + .const_data +#else + .section .rodata +#endif + .align \align +\name: +.endm + +.macro SUMSUB_AB sum, diff, a, b + vadd.vv \sum, \a, \b + vsub.vv \diff, \a, \b +.endm + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d + SUMSUB_AB \s1, \d1, \a, \b + SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro HADAMARD4 r1, r2, r3, r4, t1, t2, t3, t4 + SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 + SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 +.endm + +.macro SIGNOF x, t1, t2 + srai \t1, \x, 31 + neg \x, \x + li \t2, 63 + srl \x, \x, \t2 + or \x, \x, \t1 +.endm + +.macro SIGNOF_RVV vd, va, vb + vmv.v.i \vd, 0 + vmsltu.vv v0, \va, \vb + vmerge.vim \vd, \vd, -1, v0 + vmsgtu.vv v0, \va, \vb + vmerge.vim \vd, \vd, 1, v0 +.endm + +.macro VABS vd, va, vb +#if HAVE_RVV_ZF + vabs.v \vd, \va +#else + vrsub.vi \vb, \va, 0 + vmax.vv \vd, \va, \vb +#endif +.endm + +.macro VABD d0, s0, s1, t0 +#if HAVE_RVV_ZF + vabd.vv \d0, \s0, \s1 +#else + vmax.vv \d0, \s0, \s1 + vmin.vv \t0, \s0, \s1 + vsub.vv \d0, \d0, \t0 +#endif +.endm + +.macro VABDU d0, s0, s1, t0 +#if HAVE_RVV_ZF + vabdu.vv \d0, \s0, \s1 +#else + vmaxu.vv \d0, \s0, \s1 + vminu.vv \t0, \s0, \s1 + vsub.vv \d0, \d0, \t0 +#endif +.endm + +.macro UWABDU d0, s0, s1, t0, t1 + vmaxu.vv \t1, \s0, \s1 + vminu.vv \t0, \s0, \s1 + vwsubu.vv \d0, \t1, \t0 +.endm + +.macro UWADBDACCU d0, s0, s1, t0, t1 +#if HAVE_RVV_ZF + vwabdaccu.vv \d0, \s0, \s1 +#else + vmaxu.vv \t1, \s0, \s1 + vminu.vv \t0, \s0, \s1 + vsub.vv \t1, \t1, \t0 + vwaddu.wv \d0, \d0, \t1 +#endif +.endm + +// v0 must be 0x5 as mask +.macro TRANSPOSE_4x4 sew, d0, d1, d2, d3, r0, r1, r2, r3 + + # Stage 1: Interleave within pairs (\es, VL=4) + vslideup.vi \d0, \r1, 1 + vslidedown.vi \d1, \r0, 1 + vslideup.vi \d2, \r3, 1 + vslidedown.vi \d3, \r2, 1 + + # Reinterpret v0 as \es mask + vmerge.vvm \d0, \d0, \r0, v0 + vmerge.vvm \d1, \r1, \d1, v0
View file
x265-4.2.tar/source/common/riscv64/blockcopy8.S
Added
@@ -0,0 +1,100 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.align 4 + +.text + +// uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride) +.macro COPY_CNT_N_V tr, lmul +function PFX(copy_cnt_\tr\()_v) + slli a2, a2, 1 + li a3, \tr + vsetvli zero, a3, e16, m\lmul, ta, ma + vmv.v.i v20, 0 + vmv.v.i v24, 1 + vmv.v.i v28, 0 + +.rept \tr / 2 + vle16.v v4, (a1) + add a1, a1, a2 + vle16.v v8, (a1) + add a1, a1, a2 + vse16.v v4, (a0) + addi a0, a0, \tr * 2 + vse16.v v8, (a0) + addi a0, a0, \tr * 2 + + vmsne.vi v0, v4, 0 + vsetvli zero, a3, e16, m\lmul, ta, mu + vadd.vv v20, v20, v24, v0.t + vmsne.vi v0, v8, 0 + vadd.vv v20, v20, v24, v0.t + vsetvli zero, a3, e16, m\lmul, ta, ma +.endr + + vredsum.vs v20, v20, v28 + vmv.x.s a0, v20 + ret +endfunc +.endm + +COPY_CNT_N_V 4, 1 +COPY_CNT_N_V 8, 1 +COPY_CNT_N_V 16, 2 +COPY_CNT_N_V 32, 4 + +// int count_nonzero_c(const int16_t* quantCoeff) +.macro COUNT_NONZERO_N_V tr, lmul +function PFX(count_nonzero_\tr\()_v) + li a3, \tr + vsetvli zero, a3, e16, m\lmul, ta, ma + vmv.v.i v12, 0 + vmv.v.i v16, 1 + vmv.v.i v20, 0 + +.rept \tr / 2 + vle16.v v4, (a0) + addi a0, a0, \tr * 2 + vle16.v v8, (a0) + addi a0, a0, \tr * 2 + vmsne.vi v0, v4, 0 + vsetvli zero, a3, e16, m\lmul, ta, mu + vadd.vv v12, v12, v16, v0.t + vmsne.vi v0, v8, 0 + vadd.vv v12, v12, v16, v0.t + vsetvli zero, a3, e16, m\lmul, ta, ma +.endr + + vredsum.vs v12, v12, v20 + vmv.x.s a0, v12 + ret +endfunc +.endm + +COUNT_NONZERO_N_V 4, 1 +COUNT_NONZERO_N_V 8, 1 +COUNT_NONZERO_N_V 16, 2 +COUNT_NONZERO_N_V 32, 4
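A scalar reference for the copy_cnt kernels above, matching the signature in the kernel comment (a sketch; trSize is made an explicit parameter here, and the C version indexes in elements while the assembly doubles the stride to bytes, hence the slli a2, a2, 1):

    #include <cstdint>

    // Copy a trSize x trSize residual block into coeff and count nonzeros.
    static uint32_t copy_count_ref(int16_t* coeff, const int16_t* residual,
                                   intptr_t resiStride, int trSize)
    {
        uint32_t numSig = 0;
        for (int y = 0; y < trSize; y++)
            for (int x = 0; x < trSize; x++)
            {
                coeff[y * trSize + x] = residual[y * resiStride + x];
                numSig += (residual[y * resiStride + x] != 0);
            }
        return numSig;
    }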
View file
x265-4.2.tar/source/common/riscv64/cpu.h
Added
@@ -0,0 +1,94 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_COMMON_RISCV64_CPU_H
+#define X265_COMMON_RISCV64_CPU_H
+
+#include "x265.h"
+
+#if RISCV64_RUNTIME_CPU_DETECT
+
+#if defined(__linux__)
+
+#include <sys/auxv.h>
+
+#define HWCAP_RISCV64_RVV (1 << ('V' - 'A'))
+
+static int parse_proc_cpuinfo(const char *flag) {
+    FILE *file = fopen("/proc/cpuinfo", "r");
+    if (file == NULL)
+        return 0;
+
+    char line[1024];
+    int found = 0;
+
+    while (fgets(line, sizeof(line), file)) {
+        if (strstr(line, flag) != NULL) {
+            found = 1;
+            break;
+        }
+    }
+
+    fclose(file);
+    return found;
+}
+
+static inline uint32_t riscv64_cpu_detect()
+{
+    uint32_t flags = 0;
+
+    unsigned long hwcap = getauxval(AT_HWCAP);
+
+    if (hwcap & HWCAP_RISCV64_RVV) {
+        flags |= X265_CPU_RVV;
+
+        if (parse_proc_cpuinfo("zbb"))
+            flags |= X265_CPU_ZBB;
+    }
+
+    return flags;
+}
+
+#else // defined(__linux__)
+#error \
+    "Run-time CPU feature detection selected, but no detection method" \
+    "available for your platform. Rerun cmake configure with" \
+    "-DRISCV64_RUNTIME_CPU_DETECT=OFF."
+#endif // defined(__linux__)
+
+#else // if RISCV64_RUNTIME_CPU_DETECT
+
+static inline uint32_t riscv64_cpu_detect()
+{
+    uint32_t flags = 0;
+
+#if HAVE_RVV
+    flags |= X265_CPU_RVV;
+    flags |= X265_CPU_ZBB;
+#endif
+    return flags;
+}
+
+#endif // if RISCV64_RUNTIME_CPU_DETECT
+
+#endif // ifndef X265_COMMON_RISCV64_CPU_H
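For reference, the hwcap define above follows the single-letter RISC-V extension convention, bit index 'V' - 'A' (an illustrative check, not part of the patch):

    static_assert(('V' - 'A') == 21, "RVV hwcap bit is 1 << 21");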
View file
x265-4.2.tar/source/common/riscv64/dct.S
Added
@@ -0,0 +1,1538 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 +.text + +.macro IDCT_MUL_1V4IMM m1, m2, m3, m4, src, d1, d2, d3, d4 + li t1, \m1 + li t2, \m2 + li t3, \m3 + li t4, \m4 + vmul.vx \d1, \src, t1 + vmul.vx \d2, \src, t2 + vmul.vx \d3, \src, t3 + vmul.vx \d4, \src, t4 +.endm + +.macro IDCT_MULADD_1V4IMM m1, m2, m3, m4, src, d1, d2, d3, d4 + li t1, \m1 + li t2, \m2 + li t3, \m3 + li t4, \m4 + vmacc.vx \d1, t1, \src + vmacc.vx \d2, t2, \src + vmacc.vx \d3, t3, \src + vmacc.vx \d4, t4, \src +.endm + +.macro IDCT_MULADD_2x2V2IMM m1, m2, m3, m4, src1, src2, dst1, dst2 + li t1, \m1 + li t2, \m2 + li t3, \m3 + li t4, \m4 + vmul.vx \dst1, \src1, t1 + vmacc.vx \dst1, t2, \src2 + vmul.vx \dst2, \src1, t3 + vmacc.vx \dst2, t4, \src2 +.endm + +.macro IDCT_SUMSUB_SHIFT dst1, dst2, src1, src2, shift + vadd.vv \dst1, \src1, \src2 + vsub.vv \dst2, \src1, \src2 + vsll.vi \dst1, \dst1, \shift + vsll.vi \dst2, \dst2, \shift +.endm + +.macro TEMP_STORE len, r1, r2, r3, r4 + addi sp, sp, -4 * \len + addi t1, sp, \len + addi t2, sp, \len * 2 + addi t3, sp, \len * 3 + addi a4, a4, 4 * \len + vse32.v \r1, (sp) + vse32.v \r2, (t1) + vse32.v \r3, (t2) + vse32.v \r4, (t3) +.endm + +.macro TEMP_LOAD len, num, r1, r2, r3, r4 + addi t1, sp, \num * \len + addi t2, sp, (\num + 1) * \len + addi t3, sp, (\num + 2) * \len + addi t4, sp, (\num + 3) * \len + vle32.v \r1, (t1) + vle32.v \r2, (t2) + vle32.v \r3, (t3) + vle32.v \r4, (t4) +.endm + +.macro PBFI32_STORE8REG dst, off1, off2, shift, strided + vnclip.wi v17, v16, \shift + vnclip.wi v19, v18, \shift + vnclip.wi v21, v20, \shift + vnclip.wi v23, v22, \shift + + addi t0, \dst, \off1 + slli t2, \strided, 1 + add t3, t2, \strided + add t1, t0, \strided + add t2, t2, t0 + add t3, t3, t0 + + vmv.v.i v0, 5 + TRANSPOSE_4x4 16, v16, v18, v20, v22, v17, v19, v21, v23 + vsetivli zero, 4, e16, mf2, ta, ma + vse16.v v16, (t0) + vse16.v v18, (t1) + vse16.v v20, (t2) + vse16.v v22, (t3) + + vnclip.wi v17, v24, \shift + vnclip.wi v19, v26, \shift + vnclip.wi v21, v28, \shift + vnclip.wi v23, v30, \shift + + addi t0, \dst, \off2 + slli t2, \strided, 1 + add t3, t2, \strided + add t1, t0, \strided + add t2, t2, t0 + add t3, t3, t0 + + TRANSPOSE_4x4 16, v16, v18, v20, v22, v17, v19, v21, v23 + vsetivli zero, 4, e16, mf2, ta, ma + vse16.v v16, (t0) + vse16.v v18, (t1) + vse16.v v20, (t2) + vse16.v 
v22, (t3) +.endm + +.macro PBFIM_STORE8REG4 dst, off1, shift, strided + addi t0, \dst, \off1 + li t1, 0x55 + vmv.v.x v0, t1 + vnclip.wi v1, v16, \shift + vnclip.wi v2, v18, \shift + vnclip.wi v3, v20, \shift + vnclip.wi v4, v22, \shift + vnclip.wi v5, v24, \shift + vnclip.wi v6, v26, \shift + vnclip.wi v7, v28, \shift + vnclip.wi v8, v30, \shift + + TRANSPOSE_4x4 16, v16, v17, v18, v19, v1, v2, v3, v4 + vsetivli zero, 4, e16, mf2, ta, ma + TRANSPOSE_4x4 16, v20, v21, v22, v23, v5, v6, v7, v8 + + vsetivli zero, 8, e16, m1, ta, ma + vslideup.vi v16, v20, 4 + vslideup.vi v17, v21, 4 + vslideup.vi v18, v22, 4 + vslideup.vi v19, v23, 4 + + slli t2, \strided, 1 + add t1, t0, \strided + add t2, t2, t0 + add t3, t2, \strided + vse16.v v16, (t0) + vse16.v v17, (t1) + vse16.v v18, (t2) + vse16.v v19, (t3) +.endm + +.macro PBFIM_STORE8REG8 dst, off1, shift, strided + addi t0, \dst, \off1 + li t1, 0x55 + vmv.v.x v0, t1 + vnclip.wi v1, v16, \shift + vnclip.wi v2, v18, \shift + vnclip.wi v3, v20, \shift + vnclip.wi v4, v22, \shift + vnclip.wi v5, v24, \shift + vnclip.wi v6, v26, \shift + vnclip.wi v7, v28, \shift + vnclip.wi v8, v30, \shift + + TRANSPOSE_8x8 16, v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4, v5, v6, v7, v8 + vsetivli zero, 8, e16, m1, ta, ma + + slli t4, \strided, 2 + slli t2, \strided, 1 + add t1, t0, \strided + add t2, t2, t0 + add t3, t2, \strided + vse16.v v16, (t0) + vse16.v v17, (t1) + vse16.v v18, (t2) + vse16.v v19, (t3) + + add t0, t0, t4 + add t1, t1, t4
View file
x265-4.2.tar/source/common/riscv64/filter-prim.cpp
Added
@@ -0,0 +1,1280 @@
+/*****************************************************************************
+ * Copyright (C) 2021-2025 MulticoreWare, Inc
+ *
+ * Authors: Yujiao He <he.yujiao@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "filter-prim.h"
+#include <stdint.h>
+
+namespace X265_NS
+{
+#if !HIGH_BIT_DEPTH
+
+template<int coeffIdx, int width, int height>
+void interp8_horiz_pp_rvv(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride)
+{
+    const int N_TAPS = 8;
+    src -= (N_TAPS / 2 - 1);
+    size_t vl = 32;
+
+    if (width > 8)
+    {
+        const vuint8m1_t shift = __riscv_vmv_v_x_u8m1(IF_FILTER_PREC, vl);
+        const vint16m2_t zero = __riscv_vmv_v_x_i16m2(0, vl);
+        vint16m2_t t0, t1, t2, t3, t4, t5, t6, t7;
+        vint16m2_t r0, r1, r2, r3, r4, r5, r6, r7;
+        vint16m2_t *s0[8] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7};
+        vint16m2_t *s1[8] = {&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7};
+
+        for (int row = 0; row < height; row += 2)
+        {
+            for (int col = 0; col < width; col += vl)
+            {
+                vl = __riscv_vsetvl_e16m2(width - col);
+
+                load_u8x16xn<8>(src + col + 0 * srcStride, s0, 1, vl);
+                load_u8x16xn<8>(src + col + 1 * srcStride, s1, 1, vl);
+
+                vint16m2_t d0 = filter8_s16x16<coeffIdx>(s0, zero, vl);
+                vint16m2_t d1 = filter8_s16x16<coeffIdx>(s1, zero, vl);
+
+                vuint16m2_t d0_u16 = __riscv_vreinterpret_v_i16m2_u16m2(__riscv_vmax_vv_i16m2(d0, zero, vl));
+                vuint16m2_t d1_u16 = __riscv_vreinterpret_v_i16m2_u16m2(__riscv_vmax_vv_i16m2(d1, zero, vl));
+
+                vuint8m1_t d0_u8 = __riscv_vnclipu_wv_u8m1(d0_u16, shift, 0, vl);
+                vuint8m1_t d1_u8 = __riscv_vnclipu_wv_u8m1(d1_u16, shift, 0, vl);
+
+                __riscv_vse8_v_u8m1(dst + col + 0 * dstStride, d0_u8, vl);
+                __riscv_vse8_v_u8m1(dst + col + 1 * dstStride, d1_u8, vl);
+            }
+
+            src += 2 * srcStride;
+            dst += 2 * dstStride;
+        }
+    }
+    else
+    {
+        vuint8mf2_t shift = __riscv_vmv_v_x_u8mf2(IF_FILTER_PREC, vl);
+        vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, vl);
+        vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7;
+        vint16m1_t r0, r1, r2, r3, r4, r5, r6, r7;
+        vint16m1_t *s0[8] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7};
+        vint16m1_t *s1[8] = {&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7};
+
+        for (int row = 0; row < height; row += 2)
+        {
+            int col = 0;
+            for (; col < width; col += vl)
+            {
+                vl = __riscv_vsetvl_e16m1(width - col);
+                load_u8x8xn<8>(src + col + 0 * srcStride, s0, 1, vl);
+                load_u8x8xn<8>(src + col + 1 * srcStride, s1, 1, vl);
+
+                vint16m1_t d0 = filter8_s16x8<coeffIdx>(s0, zero, vl);
+                vint16m1_t d1 = filter8_s16x8<coeffIdx>(s1, zero, vl);
+
+                vuint16m1_t d0_u16 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(d0, zero, vl));
+                vuint16m1_t d1_u16 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(d1, zero, vl));
+
+                vuint8mf2_t d0_u8 = __riscv_vnclipu_wv_u8mf2(d0_u16, shift, 0, vl);
+                vuint8mf2_t d1_u8 = __riscv_vnclipu_wv_u8mf2(d1_u16, shift, 0, vl);
+
+                __riscv_vse8_v_u8mf2(dst + col + 0 * dstStride, d0_u8, vl);
+                __riscv_vse8_v_u8mf2(dst + col + 1 * dstStride, d1_u8, vl);
+            }
+
+            src += 2 * srcStride;
+            dst += 2 * dstStride;
+        }
+    }
+}
+
+template<int coeffIdx, int width, int height>
+void interp8_vert_pp_rvv(const pixel *src, intptr_t srcStride, pixel *dst,
+                         intptr_t dstStride)
+{
+    const int N_TAPS = 8;
+    src -= (N_TAPS / 2 - 1) * srcStride;
+
+    size_t vl = 16;
+
+    vuint8mf2_t shift = __riscv_vmv_v_x_u8mf2(IF_FILTER_PREC, vl);
+    vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, vl);
+    vint16m1_t t0, t1, t2, t3, t4, t5, t6;
+    vint16m1_t r0, r1, r2, r3;
+    vint16m1_t *s0[11] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &r0, &r1, &r2, &r3};
+
+    for (int col = 0; col < width; col += vl)
+    {
+        vl = __riscv_vsetvl_e16m1(width - col);
+        const uint8_t *s = src;
+        uint8_t *d = dst;
+
+        load_u8x8xn<7>(s, s0, srcStride, vl);
+        s += 7 * srcStride;
+
+        for (int row = 0; row < height; row += 4)
+        {
+            load_u8x8xn<4>(s, s0 + 7, srcStride, vl);
+            vint16m1_t d0 = filter8_s16x8<coeffIdx>(s0, zero, vl);
+            vint16m1_t d1 = filter8_s16x8<coeffIdx>(s0 + 1, zero, vl);
+            vint16m1_t d2 = filter8_s16x8<coeffIdx>(s0 + 2, zero, vl);
+            vint16m1_t d3 = filter8_s16x8<coeffIdx>(s0 + 3, zero, vl);
+
+            vuint16m1_t d0_u16 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(d0, zero, vl));
+            vuint16m1_t d1_u16 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(d1, zero, vl));
+            vuint16m1_t d2_u16 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(d2, zero, vl));
+            vuint16m1_t d3_u16 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(d3, zero, vl));
+
+            vuint8mf2_t d0_u8 = __riscv_vnclipu_wv_u8mf2(d0_u16, shift, 0, vl);
+            vuint8mf2_t d1_u8 = __riscv_vnclipu_wv_u8mf2(d1_u16, shift, 0, vl);
+            vuint8mf2_t d2_u8 = __riscv_vnclipu_wv_u8mf2(d2_u16, shift, 0, vl);
+            vuint8mf2_t d3_u8 = __riscv_vnclipu_wv_u8mf2(d3_u16, shift, 0, vl);
+
+            __riscv_vse8_v_u8mf2(d + 0 * dstStride, d0_u8, vl);
+            __riscv_vse8_v_u8mf2(d + 1 * dstStride, d1_u8, vl);
+            __riscv_vse8_v_u8mf2(d + 2 * dstStride, d2_u8, vl);
+            __riscv_vse8_v_u8mf2(d + 3 * dstStride, d3_u8, vl);
+
+            *s0[0] = *s0[4];
+            *s0[1] = *s0[5];
+            *s0[2] = *s0[6];
+            *s0[3] = *s0[7];
+            *s0[4] = *s0[8];
+            *s0[5] = *s0[9];
+            *s0[6] = *s0[10];
+
+            s += 4 * srcStride;
+            d += 4 * dstStride;
+        }
+
+        src += vl;
+        dst += vl;
+    }
+}
+
+// Element-wise of g_chromaFilter
+const int16_t g_chromaFilter8[8][NTAPS_CHROMA] =
+{
+    { 0, 64, 0, 0 },
+    { -2, 58, 10, -2 },
+    { -4, 54, 16, -2 },
+    { -6, 46, 28, -4 },
+    { -4, 36, 36, -4 },
+    { -4, 28, 46, -6 },
+    { -2, 16, 54, -4 },
+    { -2, 10, 58, -2 }
+};
+
+template<bool coeff4, int width, int height>
+void interp4_horiz_pp_rvv(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride, int coeffIdx)
+{
+    const int N_TAPS = 4;
+    src -= (N_TAPS / 2 - 1) * 1;
+
+    const int16_t* filter = g_chromaFilter8[coeffIdx];
+
+    size_t vl = 32;
+
+    if (width > 8)
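A quick property worth checking when reviewing these kernels: every tap set sums to 1 << IF_FILTER_PREC = 64, which the final narrowing shift (vnclipu by IF_FILTER_PREC) normalizes away. Illustrative verification, not part of the patch:

    static_assert((-1 + 4 - 10 + 58 + 17 - 5 + 1 + 0) == 64, "luma coeffIdx 1");
    static_assert((-1 + 4 - 11 + 40 + 40 - 11 + 4 - 1) == 64, "luma coeffIdx 2");
    static_assert((0 + 64 + 0 + 0) == 64 && (-2 + 58 + 10 - 2) == 64,
                  "chroma tap rows sum to 64");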
View file
x265-4.2.tar/source/common/riscv64/filter-prim.h
Added
@@ -0,0 +1,256 @@
+/*****************************************************************************
+ * Copyright (C) 2021-2025 MulticoreWare, Inc
+ *
+ * Authors: Yujiao He <he.yujiao@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <riscv_vector.h>
+#include "primitives.h"
+
+namespace X265_NS
+{
+template<int N>
+static void inline load_u8x8xn(const uint8_t *s, vint16m1_t **d, const intptr_t stride, const size_t vl)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        *d[i] = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(__riscv_vle8_v_u8mf2(s + i * stride, vl), vl));
+    }
+}
+
+template<int N>
+static void inline load_u8x16xn(const uint8_t *s, vint16m2_t **d, const intptr_t stride, const size_t vl)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        *d[i] = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(__riscv_vle8_v_u8m1(s + i * stride, vl), vl));
+    }
+}
+
+template<int N>
+static void inline load_s16x8xn(const int16_t *s, vint32m2_t **d, const intptr_t stride, const size_t vl)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        *d[i] = __riscv_vsext_vf2_i32m2(__riscv_vle16_v_i16m1(s + i * stride, vl), vl);
+    }
+}
+
+#if !HIGH_BIT_DEPTH
+
+/* N_TAPS = 8 */
+template<int coeffIdx>
+vint16m1_t inline filter8_s16x8(vint16m1_t **s, const vint16m1_t c, const size_t vl)
+{
+    vint16m1_t d0;
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        d0 = __riscv_vsub_vv_i16m1(*s[6], *s[0], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 4, *s[1], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -10, *s[2], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 58, *s[3], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 17, *s[4], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -5, *s[5], vl);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        d0 = __riscv_vmv_v_x_i16m1(0, vl);
+        vint16m1_t t0 = __riscv_vadd_vv_i16m1(*s[3], *s[4], vl);
+        vint16m1_t t1 = __riscv_vadd_vv_i16m1(*s[2], *s[5], vl);
+        vint16m1_t t2 = __riscv_vadd_vv_i16m1(*s[1], *s[6], vl);
+        vint16m1_t t3 = __riscv_vadd_vv_i16m1(*s[0], *s[7], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 40, t0, vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -11, t1, vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 4, t2, vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -1, t3, vl);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        d0 = __riscv_vsub_vv_i16m1(*s[1], *s[7], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -5, *s[2], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 17, *s[3], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 58, *s[4], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -10, *s[5], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, 4, *s[6], vl);
+    }
+    return __riscv_vadd_vv_i16m1(d0, c, vl);
+}
+
+template<int coeffIdx>
+vint16m2_t inline filter8_s16x16(vint16m2_t **s, const vint16m2_t c, size_t vl)
+{
+    vint16m2_t d0;
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        d0 = __riscv_vsub_vv_i16m2(*s[6], *s[0], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 4, *s[1], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, -10, *s[2], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 58, *s[3], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 17, *s[4], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, -5, *s[5], vl);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        d0 = __riscv_vmv_v_x_i16m2(0, vl);
+        vint16m2_t t0 = __riscv_vadd_vv_i16m2(*s[3], *s[4], vl);
+        vint16m2_t t1 = __riscv_vadd_vv_i16m2(*s[2], *s[5], vl);
+        vint16m2_t t2 = __riscv_vadd_vv_i16m2(*s[1], *s[6], vl);
+        vint16m2_t t3 = __riscv_vadd_vv_i16m2(*s[0], *s[7], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 40, t0, vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, -11, t1, vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 4, t2, vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, -1, t3, vl);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        d0 = __riscv_vsub_vv_i16m2(*s[1], *s[7], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, -5, *s[2], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 17, *s[3], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 58, *s[4], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, -10, *s[5], vl);
+        d0 = __riscv_vmacc_vx_i16m2(d0, 4, *s[6], vl);
+    }
+    return __riscv_vadd_vv_i16m2(d0, c, vl);
+}
+
+template<int coeffIdx>
+vint32m2_t inline filter8_s32x8(vint32m2_t **s, const vint32m2_t c, size_t vl)
+{
+    vint32m2_t d0;
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        d0 = __riscv_vsub_vv_i32m2(*s[6], *s[0], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 4, *s[1], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, -10, *s[2], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 58, *s[3], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 17, *s[4], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, -5, *s[5], vl);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        d0 = __riscv_vmv_v_x_i32m2(0, vl);
+        vint32m2_t t0 = __riscv_vadd_vv_i32m2(*s[3], *s[4], vl);
+        vint32m2_t t1 = __riscv_vadd_vv_i32m2(*s[2], *s[5], vl);
+        vint32m2_t t2 = __riscv_vadd_vv_i32m2(*s[1], *s[6], vl);
+        vint32m2_t t3 = __riscv_vadd_vv_i32m2(*s[0], *s[7], vl);
+
+        d0 = __riscv_vmacc_vx_i32m2(d0, 40, t0, vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, -11, t1, vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 4, t2, vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, -1, t3, vl);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        d0 = __riscv_vsub_vv_i32m2(*s[1], *s[7], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, -5, *s[2], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 17, *s[3], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 58, *s[4], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, -10, *s[5], vl);
+        d0 = __riscv_vmacc_vx_i32m2(d0, 4, *s[6], vl);
+    }
+    return __riscv_vadd_vv_i32m2(d0, c, vl);
+}
+
+/* N_TAPS = 4 */
+template<bool coeff4>
+vint16m1_t inline filter4_s16x8(vint16m1_t **s, const vint16m1_t c, const int16_t* f, size_t vl)
+{
+    vint16m1_t d0 = __riscv_vmv_v_x_i16m1(0, vl);
+
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        vint16m1_t t1 = __riscv_vadd_vv_i16m1(*s[1], *s[2], vl);
+        vint16m1_t t2 = __riscv_vadd_vv_i16m1(*s[0], *s[3], vl);
+
+        d0 = __riscv_vmacc_vx_i16m1(d0, 36, t1, vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, -4, t2, vl);
+    }
+    else
+    {
+        d0 = __riscv_vmacc_vx_i16m1(d0, f[0], *s[0], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, f[1], *s[1], vl);
+        d0 = __riscv_vmacc_vx_i16m1(d0, f[2], *s[2], vl);
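The coeff4 fast path above exploits the symmetry of the middle chroma row to halve the multiplies. Restated as scalar C++ (a sketch, not code from this header):

#include <stdint.h>

// Illustrative scalar form of the two filter4_s16x8 paths: the generic
// 4-tap dot product, and the symmetric { -4, 36, 36, -4 } row factored
// exactly the way the vmacc sequence above factors it.
static int filter4_scalar(const int16_t s[4], const int16_t f[4])
{
    return f[0] * s[0] + f[1] * s[1] + f[2] * s[2] + f[3] * s[3];
}

static int filter4_coeff4_scalar(const int16_t s[4])
{
    return 36 * (s[1] + s[2]) - 4 * (s[0] + s[3]);
}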
View file
x265-4.2.tar/source/common/riscv64/fun-decls-prim.h
Added
@@ -0,0 +1,38 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_COMMON_RISCV64_INTRINSIC_SETUP_H +#define X265_COMMON_RISCV64_INTRINSIC_SETUP_H + +#include "primitives.h" + +namespace X265_NS { + +void setupPixelPrimitives_rvv(EncoderPrimitives &p); +void setupSaoPrimitives_rvv(EncoderPrimitives &p); +void setupFilterPrimitives_rvv(EncoderPrimitives &p); +void setupIntraPrimitives_rvv(EncoderPrimitives &p); + +} + +#endif // X265_COMMON_RISCV64_INTRINSIC_SETUP_H
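These four setup hooks are what the encoder calls to swap scalar primitives for their RVV intrinsic versions. A hypothetical call site, modelled on how other x265 backends are wired up; the wrapper name below is an assumption, not part of this header:

#include "primitives.h"

namespace X265_NS {
// Hypothetical aggregator (name assumed): install every RVV intrinsic
// primitive into the encoder's function-pointer table in one pass.
void setupAllIntrinsicPrimitives_rvv(EncoderPrimitives &p)
{
    setupPixelPrimitives_rvv(p);
    setupSaoPrimitives_rvv(p);
    setupFilterPrimitives_rvv(p);
    setupIntraPrimitives_rvv(p);
}
}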
View file
x265-4.2.tar/source/common/riscv64/fun-decls.h
Added
@@ -0,0 +1,203 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef _RISCV_FUNC_DECLS_ +#define _RISCV_FUNC_DECLS_ + +#define FUNCDEF_TU(ret, name, cpu, ...) \ + ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__)) + +#define FUNCDEF_TU_S(ret, name, cpu, ...) \ + ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _64_ ## cpu(__VA_ARGS__)) + +#define FUNCDEF_TU_S2(ret, name, cpu, ...) \ + ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## 64_ ## cpu(__VA_ARGS__)) + +#define FUNCDEF_TU_S3(ret, name, cpu, ...) \ + ret PFX(name ## _2_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _3_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \ + ret PFX(name ## _5_ ## cpu(__VA_ARGS__)); \ + +#define FUNCDEF_PU(ret, name, cpu, ...) \ + ret PFX(name##_4x4_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_64x64_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x4_##cpu)(__VA_ARGS__); \ + ret PFX(name##_4x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_64x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x64_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x12_##cpu)(__VA_ARGS__); \ + ret PFX(name##_12x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x4_##cpu)(__VA_ARGS__); \ + ret PFX(name##_4x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x24_##cpu)(__VA_ARGS__); \ + ret PFX(name##_24x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_64x48_##cpu)(__VA_ARGS__); \ + ret PFX(name##_48x64_##cpu)(__VA_ARGS__); \ + ret PFX(name##_64x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x64_##cpu)(__VA_ARGS__) + +#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) 
\ + FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \ + ret PFX(name##_4x2_##cpu)(__VA_ARGS__); \ + ret PFX(name##_2x4_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x2_##cpu)(__VA_ARGS__); \ + ret PFX(name##_2x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x6_##cpu)(__VA_ARGS__); \ + ret PFX(name##_6x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x12_##cpu)(__VA_ARGS__); \ + ret PFX(name##_12x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_6x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x6_##cpu)(__VA_ARGS__); \ + ret PFX(name##_2x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x2_##cpu)(__VA_ARGS__); \ + ret PFX(name##_4x12_##cpu)(__VA_ARGS__); \ + ret PFX(name##_12x4_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x12_##cpu)(__VA_ARGS__); \ + ret PFX(name##_12x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x4_##cpu)(__VA_ARGS__); \ + ret PFX(name##_4x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_32x48_##cpu)(__VA_ARGS__); \ + ret PFX(name##_48x32_##cpu)(__VA_ARGS__); \ + ret PFX(name##_16x24_##cpu)(__VA_ARGS__); \ + ret PFX(name##_24x16_##cpu)(__VA_ARGS__); \ + ret PFX(name##_8x64_##cpu)(__VA_ARGS__); \ + ret PFX(name##_64x8_##cpu)(__VA_ARGS__); \ + ret PFX(name##_64x24_##cpu)(__VA_ARGS__); \ + ret PFX(name##_24x64_##cpu)(__VA_ARGS__); + +FUNCDEF_TU(int, sa8d_8x8, rvv, const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); +FUNCDEF_TU(int, psyCost_pp, rvv, const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride); +FUNCDEF_TU(int, transpose, rvv, pixel *dst, const pixel *src, intptr_t dstride, intptr_t sstride); + +FUNCDEF_TU_S(uint32_t, copy_cnt, v, int16_t* dst, const int16_t* src, intptr_t srcStride); +FUNCDEF_TU_S(int, count_nonzero, v, const int16_t* quantCoeff); +FUNCDEF_TU_S(sse_t, pixel_ssd_s, rvv, const int16_t*, intptr_t); +FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, rvv, const int16_t*, intptr_t); +FUNCDEF_TU_S(void, blockfill_s, v, int16_t* dst, intptr_t dstride, int16_t val); +FUNCDEF_TU_S(void, cpy2Dto1D_shl, v, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, cpy2Dto1D_shr, v, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, cpy1Dto2D_shl, v, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, cpy1Dto2D_shr, v, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, ssimDist, v, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); +FUNCDEF_TU_S(void, idct, v, const int16_t* src, int16_t* dst, intptr_t dstStride); +FUNCDEF_TU_S(void, dct, v, const int16_t* src, int16_t* dst, intptr_t srcStride); +FUNCDEF_TU_S(void, getResidual, v, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); + +FUNCDEF_TU_S2(void, intra_pred_planar, rvv, pixel* dst, intptr_t dstride, const pixel* srcPix, int, int); + +FUNCDEF_TU_S3(void, nonPsyRdoQuant, v, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos); +FUNCDEF_TU_S3(void, PsyRdoQuant, v, int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos); + +FUNCDEF_PU(void, pixel_avg_pp, rvv, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +FUNCDEF_PU(void, pixel_avg_pp_aligned, rvv, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +FUNCDEF_PU(void, 
pixel_sub_ps, v, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+FUNCDEF_PU(void, pixel_add_ps, v, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+FUNCDEF_PU(void, sad_x3, rvv, const pixel *, const pixel *, const pixel *, const pixel *, intptr_t, int32_t *);
+FUNCDEF_PU(void, sad_x4, rvv, const pixel *, const pixel *, const pixel *, const pixel *, const pixel *, intptr_t, int32_t *);
+FUNCDEF_PU(sse_t, pixel_sse_pp, rvv, const pixel*, intptr_t, const pixel*, intptr_t);
+FUNCDEF_PU(void, blockcopy_ss, v, int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+FUNCDEF_PU(void, blockcopy_sp, v, pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+FUNCDEF_PU(void, blockcopy_ps, v, int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+FUNCDEF_PU(uint64_t, pixel_var, v, const pixel* pix, intptr_t stride);
+FUNCDEF_PU(int, sa8d, rvv, const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+
+FUNCDEF_CHROMA_PU(int, satd4, rvv, const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2);
+FUNCDEF_CHROMA_PU(int, satd8, rvv, const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2);
+FUNCDEF_CHROMA_PU(int, pixel_sad, rvv, const pixel *, intptr_t, const pixel *, intptr_t);
+FUNCDEF_CHROMA_PU(int, pixel_satd, rvv, const pixel*, intptr_t, const pixel*, intptr_t);
+FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, rvv, const int16_t*, intptr_t);
+FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, rvv, const int16_t*, intptr_t);
+FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, rvv, const int16_t*, intptr_t, const int16_t*, intptr_t);
+FUNCDEF_CHROMA_PU(void, filterPixelToShort, rvv, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, rvv, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+FUNCDEF_CHROMA_PU(void, addAvg, v, const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, v, pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void PFX(idst4_v)(const int16_t *src, int16_t *dst, intptr_t dstStride);
+void PFX(dst4_v)(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void PFX(denoiseDct_v)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
+
+void PFX(scale1D_128to64_v(pixel *dst, const pixel *src));
+void PFX(scale2D_64to32_v(pixel* dst, const pixel* src, intptr_t stride));
+
+void PFX(dequant_scaling_v(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift));
+void PFX(dequant_normal_v(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift));
+
+void PFX(ssim_4x4x2_core_v(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
+
+uint32_t PFX(quant_v)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t PFX(nquant_v)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+
+void PFX(normFact_v)(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k);
+
+int PFX(scanPosLast_v)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
+uint32_t
PFX(costCoeffNxN_rvv)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase); + +void PFX(saoCuStatsE0_rvv)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); +void PFX(saoCuStatsE1_rvv)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); +void PFX(saoCuStatsE2_rvv)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); +void PFX(saoCuStatsE3_rvv)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); + +void PFX(processSaoCUE0_rvv)(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride); +void PFX(processSaoCUE1_rvv)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); +void PFX(processSaoCUE1_2Rows_rvv)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); +void PFX(processSaoCUE2_rvv)(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride); +void PFX(processSaoCUE3_rvv)(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX); +void PFX(processSaoCUB0_rvv)(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride); +void PFX(calSign_rvv)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX); +void PFX(pelFilterLumaStrong_v_rvv)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ); +void PFX(pelFilterLumaStrong_h_rvv)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ); +void PFX(pelFilterChroma_V_rvv)(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ); +void PFX(pelFilterChroma_H_rvv)(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ); + +void PFX(weight_pp_v)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +void PFX(weight_sp_v)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); + +void PFX(planecopy_cp_v)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); +void PFX(planecopy_sp_v)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
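Each FUNCDEF_* macro stamps out one prototype per block size. Expanded by hand for illustration (assuming the default x265_ symbol prefix produced by PFX; multilib builds insert a bit-depth tag), the first FUNCDEF_TU line above becomes:

// Hand-written expansion sketch of
//   FUNCDEF_TU(int, sa8d_8x8, rvv, const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_sa8d_8x8_4x4_rvv(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_sa8d_8x8_8x8_rvv(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_sa8d_8x8_16x16_rvv(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_sa8d_8x8_32x32_rvv(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_sa8d_8x8_64x64_rvv(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);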
View file
x265-4.2.tar/source/common/riscv64/intrapred-prim.cpp
Added
@@ -0,0 +1,987 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Jia Yuan <yuan.jia@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+
+#include "riscv64_utils.h"
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace X265_NS
+{
+template<int tuSize>
+void intraFilter_rvv(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
+{
+    const int tuSize2 = tuSize << 1;
+    const int len = tuSize2 + tuSize2;
+
+    pixel topLeft = samples[0];
+    pixel topLast = samples[tuSize2];
+    pixel leftLast = samples[len];
+
+#if !HIGH_BIT_DEPTH
+    {
+        size_t vl = __riscv_vsetvl_e16m2(len);
+        vuint16m2_t two_vec = __riscv_vmv_v_x_u16m2(2, vl);
+        for (int i = 0; i < len; i += vl) {
+            vl = __riscv_vsetvl_e8m1(len - i);
+            vuint8m1_t sample1_u8 = __riscv_vle8_v_u8m1(&samples[i], vl);
+            vuint8m1_t sample2_u8 = __riscv_vle8_v_u8m1(&samples[i - 1], vl);
+            vuint8m1_t sample3_u8 = __riscv_vle8_v_u8m1(&samples[i + 1], vl);
+
+            vuint16m2_t sample1 = __riscv_vzext_vf2_u16m2(sample1_u8, vl);
+            vuint16m2_t sample2 = __riscv_vzext_vf2_u16m2(sample2_u8, vl);
+            vuint16m2_t sample3 = __riscv_vzext_vf2_u16m2(sample3_u8, vl);
+
+            vuint16m2_t result1 = __riscv_vsll_vx_u16m2(sample1, 1, vl);
+            result1 = __riscv_vadd_vv_u16m2(result1, sample2, vl);
+            vuint16m2_t result2 = __riscv_vadd_vv_u16m2(sample3, two_vec, vl);
+            vuint16m2_t result3 = __riscv_vadd_vv_u16m2(result1, result2, vl);
+            result3 = __riscv_vsrl_vx_u16m2(result3, 2, vl);
+
+            vuint8m1_t result_u8 = __riscv_vnsrl_wx_u8m1(result3, 0, vl);
+            __riscv_vse8_v_u8m1(&filtered[i], result_u8, vl);
+        }
+    }
+#else
+    {
+        size_t vl = __riscv_vsetvl_e16m1(len);
+        vuint16m1_t two_vec = __riscv_vmv_v_x_u16m1(2, vl);
+        for (int i = 0; i < len; i += vl) {
+            vl = __riscv_vsetvl_e16m1(len - i);
+            vuint16m1_t sample1 = __riscv_vle16_v_u16m1(&samples[i], vl);
+            vuint16m1_t sample2 = __riscv_vle16_v_u16m1(&samples[i - 1], vl);
+            vuint16m1_t sample3 = __riscv_vle16_v_u16m1(&samples[i + 1], vl);
+
+            vuint16m1_t result1 = __riscv_vsll_vx_u16m1(sample1, 1, vl);
+            result1 = __riscv_vadd_vv_u16m1(result1, sample2, vl);
+            vuint16m1_t result2 = __riscv_vadd_vv_u16m1(sample3, two_vec, vl);
+            vuint16m1_t result3 = __riscv_vadd_vv_u16m1(result1, result2, vl);
+
+            result3 = __riscv_vsrl_vx_u16m1(result3, 2, vl);
+            __riscv_vse16_v_u16m1(&filtered[i], result3, vl);
+        }
+    }
+#endif
+    filtered[tuSize2] = topLast;
+    filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
+    filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
+    filtered[tuSize2 + tuSize2] = leftLast;
+}
+
+template<int width>
+void intra_pred_ang_rvv(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
+{
+    int width2 = width << 1;
+    int horMode = dirMode < 18;
+    pixel neighbourBuf[129];
+    const pixel *srcPix = srcPix0;
+
+    if (horMode) {
+        neighbourBuf[0] = srcPix[0];
+        if (width <= 8 && sizeof(pixel) == 1) {
+            const size_t vl = (size_t)(width << 1);
+            vuint8m1_t v1 = __riscv_vle8_v_u8m1((const uint8_t *)&srcPix[width2 + 1], vl);
+            vuint8m1_t v2 = __riscv_vle8_v_u8m1((const uint8_t *)&srcPix[1], vl);
+            __riscv_vse8_v_u8m1((uint8_t *)&neighbourBuf[1], v1, vl);
+            __riscv_vse8_v_u8m1((uint8_t *)&neighbourBuf[width2 + 1], v2, vl);
+            srcPix = neighbourBuf;
+        } else {
+            memcpy(&neighbourBuf[1], &srcPix[width2 + 1], sizeof(pixel) * (width << 1));
+            memcpy(&neighbourBuf[width2 + 1], &srcPix[1], sizeof(pixel) * (width << 1));
+            srcPix = neighbourBuf;
+        }
+
+    }
+
+    const int8_t angleTable[17] = {-32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32};
+    const int16_t invAngleTable[8] = {4096, 1638, 910, 630, 482, 390, 315, 256};
+
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
+    int angle = angleTable[8 + angleOffset];
+
+    if (!angle) {
+        for (int y = 0; y < width; y++) {
+            memcpy(&dst[y * dstStride], srcPix + 1, sizeof(pixel) * width);
+        }
+        if (bFilter) {
+            int topLeft = srcPix[0], top = srcPix[1];
+            for (int y = 0; y < width; y++) {
+                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
+            }
+        }
+    } else {
+        pixel refBuf[64];
+        const pixel *ref;
+
+        if (angle < 0) {
+            int nbProjected = -((width * angle) >> 5) - 1;
+            pixel *ref_pix = refBuf + nbProjected + 1;
+
+            int invAngle = invAngleTable[-angleOffset - 1];
+            int invAngleSum = 128;
+
+            for (int i = 0; i < nbProjected; i++) {
+                invAngleSum += invAngle;
+                ref_pix[-2 - i] = srcPix[width2 + (invAngleSum >> 8)];
+            }
+            memcpy(&ref_pix[-1], srcPix, (width + 1) * sizeof(pixel));
+            ref = ref_pix;
+        } else {
+            ref = srcPix + 1;
+        }
+        int angleSum = 0;
+        for (int y = 0; y < width; y++) {
+            angleSum += angle;
+            int offset = angleSum >> 5;
+            int fraction = angleSum & 31;
+            if (fraction) {
+                size_t vl = width;
+
+                #if !HIGH_BIT_DEPTH
+                const uint8_t *ref_u8 = (const uint8_t *)ref + offset;
+                uint8_t *dst_u8 = (uint8_t *)dst;
+                #else
+                const uint16_t *ref_u16 = (const uint16_t *)ref + offset;
+                uint16_t *dst_u16 = (uint16_t *)dst;
+                #endif
+
+                switch (width) {
+                    case 32: {
+                    #if !HIGH_BIT_DEPTH
+                        vuint8m2_t in0 = __riscv_vle8_v_u8m2(ref_u8, vl);
+                        vuint8m2_t in1 = __riscv_vle8_v_u8m2(ref_u8 + 1, vl);
+
+                        vuint8m2_t f0 = __riscv_vmv_v_x_u8m2(32 - fraction, vl);
+                        vuint8m2_t f1 = __riscv_vmv_v_x_u8m2(fraction, vl);
+
+                        vuint16m4_t temp0 = __riscv_vwmulu_vv_u16m4(in0, f0, vl);
+                        vuint16m4_t temp1 = __riscv_vwmulu_vv_u16m4(in1, f1, vl);
+
+                        vuint16m4_t sum = __riscv_vadd_vv_u16m4(temp0, temp1, vl);
+                        vuint16m4_t sixteen = __riscv_vmv_v_x_u16m4(16, vl);
+                        sum = __riscv_vadd_vv_u16m4(sum, sixteen, vl);
+
+                        vuint8m2_t res = __riscv_vnsrl_wx_u8m2(sum, 5, vl);
+                        __riscv_vse8_v_u8m2(dst_u8 + y * dstStride, res, vl);
+                    #else
+                        vuint16m2_t f0 = __riscv_vmv_v_x_u16m2(32 - fraction, vl);
+                        vuint16m2_t f1 = __riscv_vmv_v_x_u16m2(fraction, vl);
+                        vuint32m4_t sixteen = __riscv_vmv_v_x_u32m4(16, vl);
+
+                        for (int x = 0; x < width; x += vl) {
+                            vl = __riscv_vsetvl_e16m2(width - x);
+                            vuint16m2_t in0 = __riscv_vle16_v_u16m2(ref_u16 + x, vl);
+                            vuint16m2_t in1 = __riscv_vle16_v_u16m2(ref_u16 + x + 1, vl);
+
+                            vuint32m4_t temp0 = __riscv_vwmulu_vv_u32m4(in0, f0, vl);
+                            vuint32m4_t temp1 = __riscv_vwmulu_vv_u32m4(in1, f1, vl);
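Under the vector plumbing, the fractional-angle path is a two-tap blend along the projected reference row. A scalar sketch of one predicted row, matching the multiply, add-16 and shift-by-5 sequence above (names illustrative, not from the patch):

#include <stdint.h>
typedef uint8_t pixel;  // 8-bit build, matching the !HIGH_BIT_DEPTH path

// One row of angular prediction at a fractional offset:
// blend ref[x] and ref[x + 1] with weights (32 - fraction) : fraction.
static void predAngRow_scalar(pixel *dst, const pixel *ref, int width, int fraction)
{
    for (int x = 0; x < width; x++)
        dst[x] = (pixel)(((32 - fraction) * ref[x] + fraction * ref[x + 1] + 16) >> 5);
}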
View file
x265-4.2.tar/source/common/riscv64/intrapred.S
Added
@@ -0,0 +1,270 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Jia Yuan <yuan.jia@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+ #include "asm.S"
+
+.section .rodata
+.align 4
+
+tbl_const_1to5_3to0:
+    .byte 1, 2, 3, 4
+    .byte 3, 2, 1, 0
+
+.align 4
+tbl_const_1to8_7to0:
+    .byte 1, 2, 3, 4, 5, 6, 7, 8    // coefficient of (x+1)*TR
+    .byte 7, 6, 5, 4, 3, 2, 1, 0    // coefficient of (blksize-1-x)*L[y]
+
+.align 4
+tbl_const_1to16_15to0:
+    .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .text
+.align 4
+
+
+function PFX(intra_pred_planar4_rvv)
+    li t0, 4
+    vsetvli t0, t0, e8, mf2, ta, ma
+
+    addi t1, a2, 1
+    vle8.v v0, (t1)                # v0 = above[0..3]
+
+    lbu t2, 5(a2)
+    vmv.v.x v2, t2                 # v2 = topRight_b_broadcast
+
+    lbu t3, 13(a2)
+    vsetvli zero, zero, e16, m1, ta, ma
+    vmv.v.x v3, t3                 # v3 = bottomLeft_h_broadcast
+
+    la t2, tbl_const_1to5_3to0
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vle8.v v4, (t2)                # v4 = 1,2,3,4 (x+1)
+    addi t2, t2, 4
+    vle8.v v5, (t2)                # v5 = 3,2,1,0 (3-x)
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vzext.vf2 v6, v0               # zero extension to 16 bits
+    vsll.vi v6, v6, 2              # v6 = 4 * above[x]
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vwmulu.vv v10, v4, v2          # v10 = (x+1) * topRight
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vadd.vv v6, v6, v10            # v6 = 4 * above[x] + (x+1) * topRight
+
+    vzext.vf2 v12, v0              # v12 = above (zero extension to 16 bits)
+    vsub.vv v8, v3, v12            # v8 = bottomLeft - above[x], as a correction
+
+    csrwi vxrm, 0
+    li t4, 0                       # y = 0
+    mv t5, a0                      # t5 = current dst
+    mv t6, a1                      # t6 = dstStride
+
+.rept 4
+    add t1, a2, t4
+    addi t1, t1, 9                 # left = srcPix + 9 + y
+    lbu t2, 0(t1)                  # t2 = left[y]
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vmv.v.x v9, t2                 # v9 = left[y]_broadcast
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vadd.vv v6, v6, v8             # v6 = (blkSize - 1 - y) * above[x] + (x+1) * topRight + (y+1) * bottomLeft
+
+    vmv.v.v v16, v6                # v16 = v6
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vwmulu.vv v14, v5, v9          # v14 = (3-x) * left[y]
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vadd.vv v16, v16, v14          # v16 += (3-x) * left[y]
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vnclipu.wi v18, v16, 3
+
+    vse8.v v18, (t5)
+
+    add t5, t5, t6                 # dst += dstStride
+    addi t4, t4, 1                 # y++
+.endr
+    ret
+endfunc
+
+
+function PFX(intra_pred_planar8_rvv)
+// Register map
+// a0 = dst
+// a1 = dstStride
+// a2 = *srcPix
+// a3 = left[x]
+// a4 = tmp
+// v0 = above[7:0]
+// v1 = left[7:0]
+// v2 = topRight = rep(above[blkSize])
+// v3 = bottomLeft = rep(left[blkSize])
+// v4 = const[1 2 3 4 5 6 7 8]
+// v5 = const[7 6 5 4 3 2 1 0]
+
+    li t0, 8
+    vsetvli t0, t0, e8, mf2, ta, ma
+
+    addi t1, a2, 1
+    vle8.v v0, (t1)                # v0 = above[0..7]
+
+    lbu t2, 9(a2)
+    vmv.v.x v2, t2                 # v2 = topRight_b_broadcast
+
+    lbu t3, 25(a2)
+    vsetvli zero, zero, e16, m1, ta, ma
+    vmv.v.x v3, t3                 # v3 = bottomLeft_h_broadcast
+
+    la t2, tbl_const_1to8_7to0
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vle8.v v4, (t2)                # v4 = 1,2,3,4,5,6,7,8 (x+1)
+    addi t2, t2, 8
+    vle8.v v5, (t2)                # v5 = 7,6,5,4,3,2,1,0 (7-x)
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vzext.vf2 v6, v0               # zero extension to 16 bits
+    vsll.vi v6, v6, 3              # v6 = 8 * above[x]
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vwmulu.vv v10, v4, v2          # v10 = (x+1) * topRight
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vadd.vv v6, v6, v10            # v6 = 8 * above[x] + (x+1) * topRight
+
+    vzext.vf2 v12, v0              # v12 = above (zero extension to 16 bits)
+    vsub.vv v8, v3, v12            # v8 = bottomLeft - above[x], as a correction
+
+    csrwi vxrm, 0
+    li t3, 8
+    li t4, 0                       # y = 0
+    mv t5, a0                      # t5 = current dst
+    mv t6, a1                      # t6 = dstStride
+
+loop_y_8x8:
+    add t1, a2, t4
+    addi t1, t1, 17                # left = srcPix + 17 + y
+    lbu t2, 0(t1)                  # t2 = left[y]
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vmv.v.x v9, t2                 # v9 = left[y]_broadcast
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vadd.vv v6, v6, v8             # v6 = (blkSize - 1 - y) * above[x] + (x+1) * topRight + (y+1) * bottomLeft
+
+    vmv.v.v v16, v6                # v16 = v6
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vwmulu.vv v14, v5, v9          # v14 = (7-x) * left[y]
+
+    vsetvli zero, zero, e16, m1, ta, ma
+    vadd.vv v16, v16, v14          # v16 += (7-x) * left[y]
+
+    vsetvli zero, zero, e8, mf2, ta, ma
+    vnclipu.wi v18, v16, 4
+
+    vse8.v v18, (t5)
+
+    add t5, t5, t6                 # dst += dstStride
+    addi t4, t4, 1                 # y++
+    addi t3, t3, -1
+    bnez t3, loop_y_8x8
+
+    ret
+endfunc
+
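The inline comments above describe the standard HEVC planar blend. Restated as scalar C++ for the 8x8 case (a sketch that matches the comments, not code from the patch):

#include <stdint.h>
typedef uint8_t pixel;

// Scalar planar prediction for blkSize = 8: a horizontal ramp toward
// topRight and a vertical ramp toward bottomLeft, summed and rounded,
// which is what the vector loop accumulates row by row.
static void planar8_scalar(pixel *dst, intptr_t stride, const pixel *above,
                           const pixel *left, pixel topRight, pixel bottomLeft)
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            dst[y * stride + x] = (pixel)(((7 - x) * left[y] + (x + 1) * topRight +
                                           (7 - y) * above[x] + (y + 1) * bottomLeft + 8) >> 4);
}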
View file
x265-4.2.tar/source/common/riscv64/loopfilter.S
Added
@@ -0,0 +1,889 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 +pelFilterLumaStrong_coeff_16: + .hword 0, 2, 1, 1, 2, 3, 3, 7 + .hword 100, 4, 2, 5, 6, 4, 4, 100 + .hword 100, 1, 3, 2, 3, 5, 5, 100 + .hword 100, 0, 4, 3, 4, 6, 6, 100 + .hword 100, 3, 100, 4, 5, 100, 7, 100 + + .hword 0, 3, 1, 2, 2, 1, 1, 0 + .hword 0, 2, 1, 2, 2, 1, 3, 0 + .hword 0, 1, 1, 2, 2, 1, 2, 0 + + .hword 0, 4, 2, 4, 4, 2, 4, 0 + .hword 0, 3, 2, 3, 3, 2, 3, 0 + +.align 4 +.text + +//void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride) +function PFX(processSaoCUE0_rvv) +#if HIGH_BIT_DEPTH + slli a4, a4, 1 +#endif + li a5, (1 << BIT_DEPTH) - 1 + lb t0, (a3) + lb t1, 1(a3) + neg t0, t0 + neg t1, t1 + add t3, a0, a4 + vsetvli zero, a2, e16, m2, ta, ma + vmv.v.i v30, 0 + vmv.v.x v28, a5 + vsetvli zero, a2, e8, m1, ta, ma + vmv.s.x v8, t0 + vmv.s.x v9, t1 +loop_processSaoCUE0: +#if HIGH_BIT_DEPTH + vsetvli t2, a2, e16, m2, ta, ma + addi t4, a0, 2 + addi t5, t3, 2 + vle16.v v2, (a0) + vle16.v v20, (t4) + vle16.v v4, (t3) + vle16.v v22, (t5) + SIGNOF_RVV v24, v2, v20 + SIGNOF_RVV v26, v4, v22 + vsetvli zero, t2, e8, m1, ta, ma + vnsrl.wi v6, v24, 0 + vnsrl.wi v7, v26, 0 +#else + vsetvli t2, a2, e8, m1, ta, ma + addi t4, a0, 1 + addi t5, t3, 1 + vle8.v v2, (a0) + vle8.v v20, (t4) + vle8.v v4, (t3) + vle8.v v22, (t5) + SIGNOF_RVV v6, v2, v20 + SIGNOF_RVV v7, v4, v22 +#endif + addi t6, t2, -1 + vslideup.vi v8, v6, 1 + vslideup.vi v9, v7, 1 + vsub.vv v10, v6, v8 + vsub.vv v11, v7, v9 + vadd.vi v10, v10, 2 + vadd.vi v11, v11, 2 + vslidedown.vx v8, v6, t6 + vslidedown.vx v9, v7, t6 + +#if HIGH_BIT_DEPTH + vluxei8.v v20, (a1), v10 + vluxei8.v v22, (a1), v11 + vwadd.wv v2, v2, v20 + vwadd.wv v4, v4, v22 + vsetvli zero, t2, e16, m2, ta, ma + vmax.vv v2, v2, v30 + vmax.vv v4, v4, v30 + vmin.vv v2, v2, v28 + vmin.vv v4, v4, v28 + vse16.v v2, (a0) + vse16.v v4, (t3) +#else + vluxei8.v v20, (a1), v10 + vluxei8.v v22, (a1), v11 + vsetvli zero, t2, e16, m2, ta, ma + vzext.vf2 v12, v2 + vzext.vf2 v14, v4 + vsetvli zero, t2, e8, m1, ta, ma + vwadd.wv v12, v12, v20 + vwadd.wv v14, v14, v22 + vsetvli zero, t2, e16, m2, ta, ma + vmax.vv v12, v12, v30 + vmax.vv v14, v14, v30 + vmin.vv v12, v12, v28 + vmin.vv v14, v14, v28 + vsetvli zero, t2, e8, m1, ta, ma + vnsrl.wi v16, v12, 0 + vnsrl.wi v17, v14, 0 + vse8.v v16, (a0) 
+ vse8.v v17, (t3) +#endif + +#if HIGH_BIT_DEPTH + slli t1, t2, 1 + add a0, a0, t1 + add t3, t3, t1 +#else + add a0, a0, t2 + add t3, t3, t2 +#endif + sub a2, a2, t2 + bgtz a2, loop_processSaoCUE0 + ret +endfunc + +//void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width) +function PFX(processSaoCUE1_rvv) +#if HIGH_BIT_DEPTH + slli a3, a3, 1 +#endif + li a5, (1 << BIT_DEPTH) - 1 + add t0, a0, a3 + vsetvli zero, a4, e16, m2, ta, ma + vmv.v.i v30, 0 + vmv.v.x v28, a5 +loop_processSaoCUE1: +#if HIGH_BIT_DEPTH + vsetvli t2, a4, e16, m2, ta, ma + vle16.v v14, (a0) + vle16.v v4, (t0) + vle8.v v6, (a1) + SIGNOF_RVV v20, v14, v4 + vsetvli zero, t2, e8, m1, ta, ma + vnsrl.wi v8, v20, 0 +#else + vsetvli t2, a4, e8, m1, ta, ma + vle8.v v2, (a0) + vle8.v v4, (t0) + vle8.v v6, (a1) + SIGNOF_RVV v8, v2, v4 +#endif + vadd.vv v10, v8, v6 + vadd.vi v10, v10, 2 + vrsub.vi v8, v8, 0 + vse8.v v8, (a1) + vluxei8.v v12, (a2), v10 + +#if !HIGH_BIT_DEPTH + vsetvli zero, t2, e16, m2, ta, ma + vzext.vf2 v14, v2 + vsetvli zero, t2, e8, m1, ta, ma +#endif + vwadd.wv v14, v14, v12 + vsetvli zero, t2, e16, m2, ta, ma + vmax.vv v14, v14, v30 + vmin.vv v14, v14, v28 +#if HIGH_BIT_DEPTH + vse16.v v14, (a0) +#else + vsetvli zero, t2, e8, m1, ta, ma + vnsrl.wi v16, v14, 0 + vse8.v v16, (a0) +#endif + +#if HIGH_BIT_DEPTH + slli t3, t2, 1 + add a0, a0, t3
View file
x265-4.2.tar/source/common/riscv64/mc-a.S
Added
@@ -0,0 +1,112 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Jia Yuan <yuan.jia@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.align 4 + +.text + +#if BIT_DEPTH == 8 +// void pixelavg_pp(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t sstride0, +// const pixel *src1, intptr_t sstride1, int); +.macro AVG_PP_LOOP_FUNC w, h, lmul +function PFX(pixel_avg_pp_\w\()x\h\()_rvv) + csrwi vxrm, 0 + li t0, \h + li t1, \w +1: + addi t0, t0, -4 + vsetvli zero, t1, e8, m\()\lmul, ta, ma + + vle8.v v0, (a2) + vle8.v v16, (a4) + add a2, a2, a3 + add a4, a4, a5 + vaaddu.vv v0, v0, v16 + + vle8.v v4, (a2) + vle8.v v20, (a4) + add a2, a2, a3 + add a4, a4, a5 + vaaddu.vv v4, v4, v20 + + vle8.v v8, (a2) + vle8.v v24, (a4) + add a2, a2, a3 + add a4, a4, a5 + vaaddu.vv v8, v8, v24 + + vle8.v v12, (a2) + vle8.v v28, (a4) + add a2, a2, a3 + add a4, a4, a5 + vaaddu.vv v12, v12, v28 + + vse8.v v0, (a0) + add a0, a0, a1 + vse8.v v4, (a0) + add a0, a0, a1 + vse8.v v8, (a0) + add a0, a0, a1 + vse8.v v12, (a0) + add a0, a0, a1 + bnez t0, 1b + ret +endfunc +.endm + +AVG_PP_LOOP_FUNC 4, 4, 1 +AVG_PP_LOOP_FUNC 4, 8, 1 +AVG_PP_LOOP_FUNC 4, 16, 1 + +AVG_PP_LOOP_FUNC 8, 4, 1 +AVG_PP_LOOP_FUNC 8, 8, 1 +AVG_PP_LOOP_FUNC 8, 16, 1 +AVG_PP_LOOP_FUNC 8, 32, 1 + +AVG_PP_LOOP_FUNC 12, 16, 1 + +AVG_PP_LOOP_FUNC 16, 4, 1 +AVG_PP_LOOP_FUNC 16, 8, 1 +AVG_PP_LOOP_FUNC 16, 12, 1 +AVG_PP_LOOP_FUNC 16, 16, 1 +AVG_PP_LOOP_FUNC 16, 32, 1 +AVG_PP_LOOP_FUNC 16, 64, 1 + +AVG_PP_LOOP_FUNC 24, 32, 2 + +AVG_PP_LOOP_FUNC 32, 8, 2 +AVG_PP_LOOP_FUNC 32, 16, 2 +AVG_PP_LOOP_FUNC 32, 24, 2 +AVG_PP_LOOP_FUNC 32, 32, 2 +AVG_PP_LOOP_FUNC 32, 64, 2 + +AVG_PP_LOOP_FUNC 48, 64, 4 + +AVG_PP_LOOP_FUNC 64, 16, 4 +AVG_PP_LOOP_FUNC 64, 32, 4 +AVG_PP_LOOP_FUNC 64, 48, 4 +AVG_PP_LOOP_FUNC 64, 64, 4 + +#endif \ No newline at end of file
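The vaaddu.vv averaging add, with vxrm set to round-to-nearest-up by the csrwi at function entry, is the entire kernel. Its scalar equivalent (illustrative, not from the patch):

#include <stdint.h>
typedef uint8_t pixel;

// Scalar model of pixel_avg_pp: a per-pixel rounding average of two
// prediction blocks, the operation vaaddu.vv performs under vxrm = rnu.
static void pixelAvg_scalar(pixel *dst, intptr_t dstride,
                            const pixel *s0, intptr_t sstride0,
                            const pixel *s1, intptr_t sstride1,
                            int width, int height)
{
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++)
            dst[y * dstride + x] =
                (pixel)((s0[y * sstride0 + x] + s1[y * sstride1 + x] + 1) >> 1);
}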
View file
x265-4.2.tar/source/common/riscv64/p2s.S
Added
@@ -0,0 +1,138 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Yujiao He <he.yujiao@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + + #include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text + +#if HIGH_BIT_DEPTH +# if BIT_DEPTH == 10 +# define P2S_SHIFT 4 +# elif BIT_DEPTH == 12 +# define P2S_SHIFT 2 +# endif +#else // if !HIGH_BIT_DEPTH +# define P2S_SHIFT 6 +#endif // HIGH_BIT_DEPTH + +.macro p2s_x1 lmul1, lmul2 +#if HIGH_BIT_DEPTH + vle16.v v8, (a0) + vsll.vi v16, v8, P2S_SHIFT +#else + vsetvli zero, t0, e8, \lmul2, ta, ma + vle8.v v8, (a0) + vsetvli zero, t0, e16, \lmul1, ta, ma + vzext.vf2 v16, v8 + vsll.vi v16, v16, P2S_SHIFT +#endif + vadd.vv v16, v16, v0 + vse16.v v16, (a2) +.endm + +.macro P2S_FUNC w, h, lmul1, lmul2 +function PFX(filterPixelToShort_\w\()x\h\()_rvv) + li t0, \w + li t2, 0xe000 + vsetvli zero, t0, e16, \lmul1, ta, ma + vmv.v.x v0, t2 +#if HIGH_BIT_DEPTH + slli a1, a1, 1 +#endif + slli a3, a3, 1 +.if \h <= 16 /* height <= 16, fully unroll*/ + p2s_x1 \lmul1, \lmul2 +.rept \h -1 + add a0, a0, a1 + add a2, a2, a3 + p2s_x1 \lmul1, \lmul2 +.endr +.else /* height > 16, 4 rows unroll*/ + li t1, \h +1: +.rept 4 + p2s_x1 \lmul1, \lmul2 + add a0, a0, a1 + add a2, a2, a3 +.endr + addi t1, t1, -4 + bnez t1, 1b +.endif + ret +endfunc +.endm + +P2S_FUNC 4, 2, m1, m1 +P2S_FUNC 4, 4, m1, m1 +P2S_FUNC 4, 8, m1, m1 +P2S_FUNC 4, 16, m1, m1 +P2S_FUNC 4, 32, m1, m1 + +P2S_FUNC 6, 8, m1, m1 +P2S_FUNC 6, 16, m1, m1 + +P2S_FUNC 8, 2, m1, m1 +P2S_FUNC 8, 4, m1, m1 +P2S_FUNC 8, 8, m1, m1 +P2S_FUNC 8, 6, m1, m1 +P2S_FUNC 8, 12, m1, m1 +P2S_FUNC 8, 16, m1, m1 +P2S_FUNC 8, 32, m1, m1 +P2S_FUNC 8, 64, m1, m1 + +P2S_FUNC 12, 16, m2, m1 +P2S_FUNC 12, 32, m2, m1 + +P2S_FUNC 16, 4, m2, m1 +P2S_FUNC 16, 8, m2, m1 +P2S_FUNC 16, 12, m2, m1 +P2S_FUNC 16, 16, m2, m1 +P2S_FUNC 16, 24, m2, m1 +P2S_FUNC 16, 32, m2, m1 +P2S_FUNC 16, 64, m2, m1 + +P2S_FUNC 24, 32, m4, m2 +P2S_FUNC 24, 64, m4, m2 + +P2S_FUNC 32, 8, m4, m2 +P2S_FUNC 32, 12, m4, m2 +P2S_FUNC 32, 16, m4, m2 +P2S_FUNC 32, 24, m4, m2 +P2S_FUNC 32, 32, m4, m2 +P2S_FUNC 32, 48, m4, m2 +P2S_FUNC 32, 64, m4, m2 + +P2S_FUNC 48, 64, m8, m4 + +P2S_FUNC 64, 16, m8, m4 +P2S_FUNC 64, 32, m8, m4 +P2S_FUNC 64, 48, m8, m4 +P2S_FUNC 64, 64, m8, m4
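The 0xe000 constant is -8192 as an int16, i.e. x265's IF_INTERNAL_OFFS, so the conversion is a shift into 14-bit internal precision followed by re-centring around zero. A scalar sketch, assuming the 8-bit build where P2S_SHIFT is 6 (names illustrative):

#include <stdint.h>

// Scalar model of filterPixelToShort for the 8-bit build:
// widen each pixel to 14-bit internal precision and subtract the
// internal offset, matching vsll by P2S_SHIFT plus vadd 0xe000 above.
static void p2s_scalar(const uint8_t *src, intptr_t srcStride,
                       int16_t *dst, intptr_t dstStride, int width, int height)
{
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (int16_t)((src[y * srcStride + x] << 6) - 8192);
}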
View file
x265-4.2.tar/source/common/riscv64/pixel-prim.cpp
Added
@@ -0,0 +1,1497 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Jia Yuan <yuan.jia@sanechips.com.cn>
+ *          foolgry <wang.zhiyong11@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
+#include "primitives.h"
+#include "x265.h"
+#include "riscv64_utils.h"
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+#define SUMSUB_AB(sum, diff, a, b, vl)                    \
+    do {                                                  \
+        vuint16m1_t _a = (a);                             \
+        vuint16m1_t _b = (b);                             \
+        (sum) = __riscv_vadd_vv_u16m1(_a, _b, (vl));      \
+        (diff) = __riscv_vsub_vv_u16m1(_a, _b, (vl));     \
+    } while (0)
+
+namespace {
+
+using namespace X265_NS;
+
+#if HIGH_BIT_DEPTH
+// todo
+
+#else // !HIGH_BIT_DEPTH
+
+typedef struct {
+    vuint8mf2_t *m0;
+    vuint8mf2_t *m1;
+    vuint8mf2_t *m2;
+    vuint8mf2_t *m3;
+} vectors_u8_mf2_t;
+
+typedef struct {
+    vuint8mf2_t *m0;
+    vuint8mf2_t *m1;
+    vuint8mf2_t *m2;
+    vuint8mf2_t *m3;
+    vuint8mf2_t *m4;
+    vuint8mf2_t *m5;
+    vuint8mf2_t *m6;
+    vuint8mf2_t *m7;
+} vectors_u8_mf2_8_t;
+
+typedef struct {
+    vuint8m1_t *m0;
+    vuint8m1_t *m1;
+    vuint8m1_t *m2;
+    vuint8m1_t *m3;
+} vectors_u8_m1_t;
+
+typedef struct {
+    vuint16m1_t *m0;
+    vuint16m1_t *m1;
+    vuint16m1_t *m2;
+    vuint16m1_t *m3;
+} vectors_u16_m1_t;
+
+typedef struct {
+    vuint16m1_t *m0;
+    vuint16m1_t *m1;
+    vuint16m1_t *m2;
+    vuint16m1_t *m3;
+    vuint16m1_t *m4;
+    vuint16m1_t *m5;
+    vuint16m1_t *m6;
+    vuint16m1_t *m7;
+} vectors_u16_m1_8_t;
+
+// Four separate vle8 loads perform better than a single vlsseg4e8.
+// The structs above exist because:
+// 1. array elements cannot have an RVV type such as 'vuint8mf2_t';
+// 2. struct members cannot have an RVV type either, so the structs hold pointers to the vectors instead.
+static void inline vload_u8x8x4_mf2(const uint8_t **pix, const intptr_t stride_pix, + vectors_u8_mf2_t *d, size_t vl) +{ + *(d->m0) = __riscv_vle8_v_u8mf2(*pix, vl); + *(d->m1) = __riscv_vle8_v_u8mf2(*pix + stride_pix, vl); + *(d->m2) = __riscv_vle8_v_u8mf2(*pix + 2 * stride_pix, vl); + *(d->m3) = __riscv_vle8_v_u8mf2(*pix + 3 * stride_pix, vl); +} + +static inline void load_diff_u8x8x8(const uint8_t *pix1, intptr_t stride_pix1, + const uint8_t *pix2, intptr_t stride_pix2, vectors_u16_m1_8_t *diff) +{ + const uint8_t *pix1_ptr = pix1; + const uint8_t *pix2_ptr = pix2; + + size_t vl = __riscv_vsetvl_e8mf2(8); + vuint8mf2_t r0, r1, r2, r3, t0, t1, t2, t3; + vectors_u8_mf2_t r = {&r0, &r1, &r2, &r3}; + vectors_u8_mf2_t t = {&t0, &t1, &t2, &t3}; + + //row 0~3 + vload_u8x8x4_mf2(&pix1_ptr, stride_pix1, &r, vl); + vload_u8x8x4_mf2(&pix2_ptr, stride_pix2, &t, vl); + *(diff->m0) = __riscv_vwsubu_vv_u16m1(*(r.m0), *(t.m0), vl); + *(diff->m1) = __riscv_vwsubu_vv_u16m1(*(r.m1), *(t.m1), vl); + *(diff->m2) = __riscv_vwsubu_vv_u16m1(*(r.m2), *(t.m2), vl); + *(diff->m3) = __riscv_vwsubu_vv_u16m1(*(r.m3), *(t.m3), vl); + //row4~7 + + pix1_ptr += 4 * stride_pix1; + pix2_ptr += 4 * stride_pix2; + vload_u8x8x4_mf2(&pix1_ptr, stride_pix1, &r, vl); + vload_u8x8x4_mf2(&pix2_ptr, stride_pix2, &t, vl); + *(diff->m4) = __riscv_vwsubu_vv_u16m1(*(r.m0), *(t.m0), vl); + *(diff->m5) = __riscv_vwsubu_vv_u16m1(*(r.m1), *(t.m1), vl); + *(diff->m6) = __riscv_vwsubu_vv_u16m1(*(r.m2), *(t.m2), vl); + *(diff->m7) = __riscv_vwsubu_vv_u16m1(*(r.m3), *(t.m3), vl); +} + +static inline void vload_u8mf2x4(vectors_u8_mf2_t *d, const uint8_t **pix, intptr_t stride_pix, size_t vl) { + *(d->m0) = __riscv_vle8_v_u8mf2(*pix, vl); + *(d->m1) = __riscv_vle8_v_u8mf2(*pix + stride_pix, vl); + *(d->m2) = __riscv_vle8_v_u8mf2(*pix + 2 * stride_pix, vl); + *(d->m3) = __riscv_vle8_v_u8mf2(*pix + 3 * stride_pix, vl); + *pix += 4 * stride_pix; +} + +static inline void vload_u8m1x4(vectors_u8_m1_t *d, const uint8_t **pix, intptr_t stride_pix, size_t vl) { + *(d->m0) = __riscv_vle8_v_u8m1(*pix, vl); + *(d->m1) = __riscv_vle8_v_u8m1(*pix + stride_pix, vl); + *(d->m2) = __riscv_vle8_v_u8m1(*pix + 2 * stride_pix, vl); + *(d->m3) = __riscv_vle8_v_u8m1(*pix + 3 * stride_pix, vl); + *pix += 4 * stride_pix; +} + +static inline void vslide_combine_u8(vuint8mf2_t *d0, vuint8mf2_t *d1, vectors_u8_mf2_t s) { + *d0 = __riscv_vslideup_vx_u8mf2(*(s.m0), *(s.m2), 4, 8); + *d1 = __riscv_vslideup_vx_u8mf2(*(s.m1), *(s.m3), 4, 8); +} + +static inline void vslidedown_u8x4(vectors_u8_m1_t *d0, vectors_u8_m1_t *d1, + const vectors_u8_m1_t s0, const vectors_u8_m1_t s1, size_t vl) { + const size_t offset = 8; + *(d0->m0) = __riscv_vslidedown_vx_u8m1(*(s0.m0), offset, vl); + *(d0->m1) = __riscv_vslidedown_vx_u8m1(*(s0.m1), offset, vl); + *(d0->m2) = __riscv_vslidedown_vx_u8m1(*(s0.m2), offset, vl); + *(d0->m3) = __riscv_vslidedown_vx_u8m1(*(s0.m3), offset, vl); + + *(d1->m0) = __riscv_vslidedown_vx_u8m1(*(s1.m0), offset, vl); + *(d1->m1) = __riscv_vslidedown_vx_u8m1(*(s1.m1), offset, vl); + *(d1->m2) = __riscv_vslidedown_vx_u8m1(*(s1.m2), offset, vl); + *(d1->m3) = __riscv_vslidedown_vx_u8m1(*(s1.m3), offset, vl); +} + +static inline void vwsubu_u8x4(vectors_u16_m1_t *diff, const vectors_u8_mf2_t s0, const vectors_u8_mf2_t s1, size_t vl) { + *(diff->m0) = __riscv_vwsubu_vv_u16m1(*(s0.m0), *(s1.m0), vl); + *(diff->m1) = __riscv_vwsubu_vv_u16m1(*(s0.m1), *(s1.m1), vl); + *(diff->m2) = __riscv_vwsubu_vv_u16m1(*(s0.m2), *(s1.m2), vl); + *(diff->m3) = 
__riscv_vwsubu_vv_u16m1(*(s0.m3), *(s1.m3), vl); +} + +static inline void vget_first8_u8m1(vectors_u8_mf2_t *d, const vectors_u8_m1_t s) { + *(d->m0) = __riscv_vlmul_trunc_v_u8m1_u8mf2(*(s.m0)); + *(d->m1) = __riscv_vlmul_trunc_v_u8m1_u8mf2(*(s.m1)); + *(d->m2) = __riscv_vlmul_trunc_v_u8m1_u8mf2(*(s.m2)); + *(d->m3) = __riscv_vlmul_trunc_v_u8m1_u8mf2(*(s.m3)); +} + +static inline vint16m1_t vabs_u16(vuint16m1_t s0, size_t vl) { + vint16m1_t tmp = __riscv_vreinterpret_v_u16m1_i16m1(s0); + vint16m1_t t0 = __riscv_vrsub_vx_i16m1(tmp, 0, vl); // t0 = -s0 + return __riscv_vmax_vv_i16m1(tmp, t0, vl); // d0 = max(s0, t0) +} + +static inline vuint16m1_t vmax_abs_u16(vuint16m1_t s0, vuint16m1_t s1, size_t vl) { + vuint16m1_t b0 = __riscv_vreinterpret_v_i16m1_u16m1(vabs_u16(s0, vl)); + vuint16m1_t b1 = __riscv_vreinterpret_v_i16m1_u16m1(vabs_u16(s1, vl)); + return __riscv_vmaxu_vv_u16m1(b0, b1, vl); +} + +static inline int vredsum_u16(const vuint16m1_t src, size_t vl) { + vuint16m1_t v_sum = __riscv_vmv_v_x_u16m1(0, vl); + v_sum = __riscv_vredsum_vs_u16m1_u16m1(src, v_sum, vl);
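SUMSUB_AB, defined at the top of this file, is the butterfly at the heart of the SATD/SA8D Hadamard transforms. A scalar sketch of one 4-point stage built by chaining it twice (illustrative only; the real code keeps everything in vectors):

#include <stdint.h>

// One 4-point Hadamard stage from sum/difference butterflies,
// the scalar analogue of applying SUMSUB_AB in two rounds.
static void hadamard4_scalar(int16_t d[4], const int16_t s[4])
{
    int16_t sum0  = (int16_t)(s[0] + s[1]), diff0 = (int16_t)(s[0] - s[1]);
    int16_t sum1  = (int16_t)(s[2] + s[3]), diff1 = (int16_t)(s[2] - s[3]);
    d[0] = (int16_t)(sum0 + sum1);
    d[1] = (int16_t)(sum0 - sum1);
    d[2] = (int16_t)(diff0 + diff1);
    d[3] = (int16_t)(diff0 - diff1);
}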
View file
x265-4.2.tar/source/common/riscv64/pixel-util.S
Added
@@ -0,0 +1,3375 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * Yujiao He <he.yujiao@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.option arch, +zbb + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 +.text + +.macro LOAD_4_LINE src, es, stride, d1, d2, d3, d4 + slli t2, \stride, 1 + add t3, t2, \stride + add t1, \src, \stride + add t2, \src, t2 + add t3, \src, t3 + vl\es\().v \d1, (\src) + vl\es\().v \d2, (t1) + vl\es\().v \d3, (t2) + vl\es\().v \d4, (t3) +.endm + +#if !HIGH_BIT_DEPTH +.macro LOAD_DIFF_8x8 + vsetivli zero, 8, e8, mf2, ta, ma + LOAD_4_LINE a0, e8, a1, v24, v1, v2, v3 + add t0, t3, a1 + LOAD_4_LINE t0, e8, a1, v4, v5, v6, v7 + LOAD_4_LINE a2, e8, a3, v8, v9, v10, v11 + add t0, t3, a3 + LOAD_4_LINE t0, e8, a3, v12, v13, v14, v15 + + // diff v16~v23 + vwsubu.vv v16, v24, v8 + vwsubu.vv v17, v1, v9 + vwsubu.vv v18, v2, v10 + vwsubu.vv v19, v3, v11 + vwsubu.vv v20, v4, v12 + vwsubu.vv v21, v5, v13 + vwsubu.vv v22, v6, v14 + vwsubu.vv v23, v7, v15 +.endm + +.macro MAX_ABS s0, s1, s2, s3, t0, t1, t2, t3 + VABS \s0, \s0, \t0 + VABS \s1, \s1, \t1 + VABS \s2, \s2, \t2 + VABS \s3, \s3, \t3 + + vmax.vv \t0, \s0, \s1 + vmax.vv \t1, \s2, \s3 +.endm + +.macro SA8D_8x8 dst, s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7 + // v1~3 v8~v15 + vsetivli zero, 8, e16, m1, ta, ma + HADAMARD4 \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3 + HADAMARD4 \s4, \s5, \s6, \s7, \t0, \t1, \t2, \t3 + SUMSUB_ABCD \t0, \t4, \t1, \t5, \s0, \s4, \s1, \s5 + SUMSUB_ABCD \t2, \t6, \t3, \t7, \s2, \s6, \s3, \s7 + + // transpose v8~v15 + TRANSPOSE_8x8 16, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7, \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7 + vsetivli zero, 8, e16, m1, ta, ma + + // H1~2 + HADAMARD4 \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3 + HADAMARD4 \s4, \s5, \s6, \s7, \t0, \t1, \t2, \t3 + + VABS \s0, \s0, \t0 + VABS \s1, \s1, \t1 + VABS \s2, \s2, \t2 + VABS \s3, \s3, \t3 + VABS \s4, \s4, \t4 + VABS \s5, \s5, \t5 + VABS \s6, \s6, \t6 + VABS \s7, \s7, \t7 + + vmv.v.i \t0, 0 + vmax.vv \s0, \s0, \s4 + vmax.vv \s1, \s1, \s5 + vmax.vv \s2, \s2, \s6 + vmax.vv \s3, \s3, \s7 + vadd.vv \s0, \s0, \s1 + vadd.vv \s2, \s2, \s3 + vadd.vv \s0, \s0, \s2 + + vwredsumu.vs \s1, \s0, \t0 + vsetivli zero, 1, e32, m1, ta, ma + vmv.x.s \dst, \s1 +.endm + +//int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) +function PFX(sa8d_8x8_rvv) + li t0, 0x55 + vsetivli zero, 8, e16, m1, ta, ma + vmv.v.x 
v0, t0 + + LOAD_DIFF_8x8 + SA8D_8x8 a0, v16, v17, v18, v19, v20, v21, v22, v23, v8, v9, v10, v11, v12, v13, v14, v15 + addi a0, a0, 1 + srli a0, a0, 1 + ret +endfunc + +//int sa8d_8x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) +function PFX(sa8d_8x16_rvv) + li t0, 0x55 + vsetivli zero, 8, e16, m1, ta, ma + vmv.v.i v30, 0 + vmv.v.x v0, t0 + + LOAD_DIFF_8x8 + SA8D_8x8 a7, v16, v17, v18, v19, v20, v21, v22, v23, v8, v9, v10, v11, v12, v13, v14, v15 + addi a7, a7, 1 + srli a7, a7, 1 + + slli t0, a1, 3 + slli t1, a3, 3 + add a0, a0, t0 + add a2, a2, t1 + LOAD_DIFF_8x8 + SA8D_8x8 a0, v16, v17, v18, v19, v20, v21, v22, v23, v8, v9, v10, v11, v12, v13, v14, v15 + addi a0, a0, 1 + srli a0, a0, 1 + add a0, a0, a7 + + ret +endfunc + +.macro LOAD_DIFF_8x16 + LOAD_4_LINE a0, e8, a1, v28, v29, v30, v31 + add t0, t3, a1 + LOAD_4_LINE t0, e8, a1, v4, v5, v6, v7 + LOAD_4_LINE a2, e8, a3, v8, v9, v10, v11 + add t0, t3, a3 + LOAD_4_LINE t0, e8, a3, v12, v13, v14, v15 + + // diff v16~v23 + vwsubu.vv v16, v28, v8 + vwsubu.vv v18, v29, v9 + vwsubu.vv v20, v30, v10 + vwsubu.vv v22, v31, v11 + vwsubu.vv v24, v4, v12 + vwsubu.vv v26, v5, v13 + vwsubu.vv v28, v6, v14 + vwsubu.vv v30, v7, v15 + + li t1, 8 + vsetivli t0, 16, e16, m1, ta, ma + beq t0, t1, 1f + + vsetivli zero, 8, e16, m1, ta, ma + vslidedown.vi v17, v16, 8 + vslidedown.vi v19, v18, 8 + vslidedown.vi v21, v20, 8 + vslidedown.vi v23, v22, 8 + vslidedown.vi v25, v24, 8 + vslidedown.vi v27, v26, 8 + vslidedown.vi v29, v28, 8 + vslidedown.vi v31, v30, 8 +1: +.endm + +.macro SA8D_16x16 dst, stride1, stride2 + li t0, 0x55 + vsetivli zero, 16, e8, m1, ta, ma + vmv.v.x v0, t0 + + LOAD_DIFF_8x16 + SA8D_8x8 a6, v16, v18, v20, v22, v24, v26, v28, v30, v8, v9, v10, v11, v12, v13, v14, v15 + SA8D_8x8 t4, v17, v19, v21, v23, v25, v27, v29, v31, v8, v9, v10, v11, v12, v13, v14, v15
View file
x265-4.2.tar/source/common/riscv64/riscv64_utils.S
Added
@@ -0,0 +1,212 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Jia Yuan <yuan.jia@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.align 4 + +.text + +.macro CALC_STRIDED_ADDRS src, stride, reg1, reg2, reg3, reg4 + slli \reg4, \stride, 1 # reg4 = stride * 2 + add \reg1, \src, \stride # reg1 = src + stride + add \reg2, \src, \reg4 # reg2 = src + 2*stride + add \reg3, \reg2, \stride # reg3 = src + 3*stride +.endm + +.macro LOAD_4_LINE src, es, sstride, d1, d2, d3, d4, reg1, reg2, reg3, reg4 + CALC_STRIDED_ADDRS \src, \sstride, \reg1, \reg2, \reg3, \reg4 + vl\es\().v \d1, (\src) + vl\es\().v \d2, (\reg1) + vl\es\().v \d3, (\reg2) + vl\es\().v \d4, (\reg3) +.endm + +.macro STORE_4_LINE dst, es, dstride, d1, d2, d3, d4, reg1, reg2, reg3, reg4 + CALC_STRIDED_ADDRS \dst, \dstride, \reg1, \reg2, \reg3, \reg4 + vs\es\().v \d1, (\dst) + vs\es\().v \d2, (\reg1) + vs\es\().v \d3, (\reg2) + vs\es\().v \d4, (\reg3) +.endm + +.macro TRANSPOSE_HELPER a, b, c, d, e, f, g, h, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 + vslideup.vi \tmp1, \b, 1 + vslidedown.vi \tmp2, \a, 1 + vslideup.vi \tmp3, \d, 1 + vslidedown.vi \tmp4, \c, 1 + vslideup.vi \tmp5, \f, 1 + vslidedown.vi \tmp6, \e, 1 + vslideup.vi \tmp7, \h, 1 + vslidedown.vi \tmp8, \g, 1 + + vmerge.vvm \a, \tmp1, \a, v0 + vmerge.vvm \b, \b, \tmp2, v0 + + vmerge.vvm \c, \tmp3, \c, v0 + vmerge.vvm \d, \d, \tmp4, v0 + + vmerge.vvm \e, \tmp5, \e, v0 + vmerge.vvm \f, \f, \tmp6, v0 + + vmerge.vvm \g, \tmp7, \g, v0 + vmerge.vvm \h, \h, \tmp8, v0 +.endm + +#if !HIGH_BIT_DEPTH +# void transpose4x4_rvv(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride) +function transpose4x4_rvv_u8 + li t4, 0x5 + vsetivli zero, 4, e8, mf2, ta, ma + vmv.v.x v0, t4 + + LOAD_4_LINE a1, e8, a3, v4, v5, v6, v7, t0, t1, t2, t3 + TRANSPOSE_4x4 8, v16, v17, v18, v19, v4, v5, v6, v7 + + vsetivli zero, 4, e8, mf2, ta, ma + STORE_4_LINE a0, e8, a2, v16, v17, v18, v19, t0, t1, t2, t3 + + ret +endfunc + +function transpose8x8_rvv_u8 + li t4, 0x55 + vsetivli zero, 8, e8, m1, ta, ma + vmv.v.x v0, t4 + + LOAD_4_LINE a1, e8, a3, v4, v5, v6, v7, t0, t1, t2, t3 + add t4, t2, a3 + LOAD_4_LINE t4, e8, a3, v8, v9, v10 v11, t0, t1, t2, t3 + + TRANSPOSE_8x8 8, v16, v17, v18, v19, v20, v21, v22, v23, v4, v5, v6, v7, v8, v9, v10, v11 + + vsetivli zero, 8, e8, m1, ta, ma + STORE_4_LINE a0, e8, a2, v16, v17, v18, v19, t0, t1, t2, t3 + add t4, t2, a2 + STORE_4_LINE t4, e8, a2, v20, v21, v22, v23, t0, t1, t2, t3 + + ret +endfunc + +function transpose16x16_rvv_u8 + # v0 must 
be 0x5555 as mask + li t4, 0x5555 + vsetivli zero, 16, e8, m1, ta, ma + vmv.v.x v0, t4 + + # load: v1-v16 + LOAD_4_LINE a1, e8, a3, v1, v2, v3, v4, t0, t1, t2, t3 + add t4, t2, a3 + LOAD_4_LINE t4, e8, a3, v5, v6, v7, v8, t0, t1, t2, t3 + add t4, t2, a3 + LOAD_4_LINE t4, e8, a3, v9, v10, v11, v12, t0, t1, t2, t3 + add t4, t2, a3 + LOAD_4_LINE t4, e8, a3, v13, v14, v15, v16, t0, t1, t2, t3 + + # ------------------------------------------------------------ + # Stage 1: e8, VL=16 + # ------------------------------------------------------------ + TRANSPOSE_HELPER v1, v2, v3, v4, v5, v6, v7, v8, v17, v18, v19, v20, v21, v22, v23, v24 + TRANSPOSE_HELPER v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24 + + # ------------------------------------------------------------ + # Stage 2: e16, VL=8 + # ------------------------------------------------------------ + vsetivli zero, 8, e16, m1, ta, ma + + TRANSPOSE_HELPER v1, v3, v2, v4, v5, v7, v6, v8, v17, v18, v19, v20, v21, v22, v23, v24 + TRANSPOSE_HELPER v9, v11, v10, v12, v13, v15, v14, v16, v17, v18, v19, v20, v21, v22, v23, v24 + + + # ------------------------------------------------------------ + # Stage 3: e32, VL=4 + # ------------------------------------------------------------ + vsetivli zero, 4, e32, m1, ta, ma + + TRANSPOSE_HELPER v1, v5, v2, v6, v3, v7, v4, v8, v17, v18, v19, v20, v21, v22, v23, v24 + TRANSPOSE_HELPER v9, v13, v10, v14, v11, v15, v12, v16, v17, v18, v19, v20, v21, v22, v23, v24 + + + # ------------------------------------------------------------ + # Stage 4: e64, VL=2 + # ------------------------------------------------------------ + vsetivli zero, 2, e64, m1, ta, ma + + TRANSPOSE_HELPER v1, v9, v2, v10, v3, v11, v4, v12, v17, v18, v19, v20, v21, v22, v23, v24 + TRANSPOSE_HELPER v5, v13, v6, v14, v7, v15, v8, v16, v17, v18, v19, v20, v21, v22, v23, v24 + + + # store: v1-v16 + vsetivli zero, 16, e8, m1, ta, ma + STORE_4_LINE a0, e8, a2, v1, v2, v3, v4, t0, t1, t2, t3 + add t4, t2, a2 + STORE_4_LINE t4, e8, a2, v5, v6, v7, v8, t0, t1, t2, t3 + add t4, t2, a2 + STORE_4_LINE t4, e8, a2, v9, v10, v11, v12, t0, t1, t2, t3 + add t4, t2, a2 + STORE_4_LINE t4, e8, a2, v13, v14, v15, v16, t0, t1, t2, t3 + + ret +endfunc + +#else +# void transpose4x4_rvv(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride) +function transpose4x4_rvv_u16 + li t4, 0x5 + vsetivli zero, 4, e16, mf2, ta, ma + vmv.v.x v0, t4 + + slli t5, a3, 1 + slli t6, a2, 1 + + LOAD_4_LINE a1, e16, t5, v4, v5, v6, v7, t0, t1, t2, t3 + TRANSPOSE_4x4 16, v16, v17, v18, v19, v4, v5, v6, v7 + + vsetivli zero, 4, e16, mf2, ta, ma + STORE_4_LINE a0, e16, t6, v16, v17, v18, v19, t0, t1, t2, t3 + + ret +endfunc + +function transpose8x8_rvv_u16 + li t4, 0x55 + vsetivli zero, 8, e16, m1, ta, ma + vmv.v.x v0, t4 + + slli t5, a3, 1 + slli t6, a2, 1 + + LOAD_4_LINE a1, e16, t5, v4, v5, v6, v7, t0, t1, t2, t3
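Editor's note: transpose16x16 above works in four stages, re-running vsetvli at e8, e16, e32 and e64 so that each vslideup/vslidedown plus vmerge pass swaps progressively wider blocks across the diagonal. A scalar model of the same recursive-doubling idea, on an 8x8 byte matrix with stage widths 1, 2 and 4 (an assumed illustration, not the shipped code):

#include <cstdint>

// Each stage swaps the off-diagonal s x s sub-blocks of every 2s x 2s
// aligned block; composing the stages yields the full transpose.
void transpose8x8_stages(uint8_t m[8][8])
{
    for (int s = 1; s < 8; s <<= 1)
        for (int i = 0; i < 8; i += s << 1)
            for (int j = 0; j < 8; j += s << 1)
                for (int a = 0; a < s; a++)
                    for (int b = 0; b < s; b++)
                    {
                        uint8_t t = m[i + a][j + s + b];
                        m[i + a][j + s + b] = m[i + s + a][j + b];
                        m[i + s + a][j + b] = t;
                    }
}

The stages commute, so the assembly is free to order them by element width; the 0x55/0x5555 constants loaded into v0 are the even/odd lane masks each merge stage needs.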
View file
x265-4.2.tar/source/common/riscv64/riscv64_utils.cpp
Added
@@ -0,0 +1,111 @@
+
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Jia Yuan <yuan.jia@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "x265.h"
+#include "riscv64_utils.h"
+#include <riscv_vector.h>
+
+namespace X265_NS
+{
+
+#if !HIGH_BIT_DEPTH
+void transpose32x32_rvv(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    transpose16x16_rvv(dst, src, dstride, sstride);
+    transpose16x16_rvv(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
+    if (dst == src)
+    {
+        size_t vl = 16;
+        uint8_t tmp[16 * 16] __attribute__((aligned(64)));
+        transpose16x16_rvv(tmp, src + 16, 16, sstride);
+        transpose16x16_rvv(dst + 16, src + 16 * sstride, dstride, sstride);
+        for (int i = 0; i < 16; i++)
+        {
+            __riscv_vse8_v_u8m1(dst + (16 + i) * dstride, __riscv_vle8_v_u8m1(tmp + 16 * i, vl), vl);
+        }
+    }
+    else
+    {
+        transpose16x16_rvv(dst + 16 * dstride, src + 16, dstride, sstride);
+        transpose16x16_rvv(dst + 16, src + 16 * sstride, dstride, sstride);
+    }
+
+}
+
+#else
+void transpose16x16_rvv(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    transpose8x8_rvv(dst, src, dstride, sstride);
+    transpose8x8_rvv(dst + 8 * dstride + 8, src + 8 * sstride + 8, dstride, sstride);
+
+    if (dst == src)
+    {
+        uint16_t tmp[8 * 8];
+        size_t vl = 8;
+        transpose8x8_rvv(tmp, src + 8, 8, sstride);
+        transpose8x8_rvv(dst + 8, src + 8 * sstride, dstride, sstride);
+        for (int i = 0; i < 8; i++)
+        {
+            __riscv_vse16_v_u16m1(dst + (8 + i) * dstride, __riscv_vle16_v_u16m1(tmp + 8 * i, vl), vl);
+        }
+    }
+    else
+    {
+        transpose8x8_rvv(dst + 8 * dstride, src + 8, dstride, sstride);
+        transpose8x8_rvv(dst + 8, src + 8 * sstride, dstride, sstride);
+    }
+
+}
+
+void transpose32x32_rvv(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    //assumption: there is no partial overlap
+    for (int i = 0; i < 4; i++)
+    {
+        transpose8x8_rvv(dst + i * 8 * (1 + dstride), src + i * 8 * (1 + sstride), dstride, sstride);
+        for (int j = i + 1; j < 4; j++)
+        {
+            if (dst == src)
+            {
+                uint16_t tmp[8 * 8] __attribute__((aligned(64)));
+                size_t vl = 8;
+                transpose8x8_rvv(tmp, src + 8 * i + 8 * j * sstride, 8, sstride);
+                transpose8x8_rvv(dst + 8 * i + 8 * j * dstride, src + 8 * j + 8 * i * sstride, dstride, sstride);
+                for (int k = 0; k < 8; k++)
+                {
+                    __riscv_vse16_v_u16m1(dst + 8 * j + (8 * i + k) * dstride, __riscv_vle16_v_u16m1(tmp + 8 * k, vl), vl);
+                }
+            }
+            else
+            {
+                transpose8x8_rvv(dst + 8 * (j + i * dstride), src + 8 * (i + j * sstride), dstride, sstride);
+                transpose8x8_rvv(dst + 8 * (i + j * dstride), src + 8 * (j + i * sstride), dstride, sstride);
+            }
+
+        }
+    }
+}
+#endif // !HIGH_BIT_DEPTH
+}
\ No newline at end of file
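Editor's note: the 32x32 (and high-bit-depth 16x16) transposes compose smaller tiles: diagonal tiles transpose in place, off-diagonal tiles swap, and a stack buffer breaks the dependency when dst == src. A plain scalar reference one might use to validate that decomposition (hypothetical test harness, not part of x265):

#include <cstdint>
#include <cstdio>

static void transpose32x32_ref(uint8_t* dst, const uint8_t* src,
                               intptr_t dstride, intptr_t sstride)
{
    for (int y = 0; y < 32; y++)
        for (int x = 0; x < 32; x++)
            dst[y * dstride + x] = src[x * sstride + y];
}

int main()
{
    uint8_t src[32 * 32], dst[32 * 32];
    for (int i = 0; i < 32 * 32; i++)
        src[i] = (uint8_t)(i * 31);
    transpose32x32_ref(dst, src, 32, 32);
    // On RISC-V hardware, compare dst against transpose32x32_rvv output.
    printf("spot check: %d\n", dst[5 * 32 + 7] == src[7 * 32 + 5]);
    return 0;
}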
View file
x265-4.2.tar/source/common/riscv64/riscv64_utils.h
Added
@@ -0,0 +1,69 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Jia Yuan <yuan.jia@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef __RISCV64_UTILS_H__ +#define __RISCV64_UTILS_H__ + +#include <stdint.h> + +extern "C" { + void transpose4x4_rvv_u8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride); + void transpose8x8_rvv_u8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride); + + void transpose4x4_rvv_u16(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride); + void transpose8x8_rvv_u16(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride); + + void transpose16x16_rvv_u8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride); +} + +namespace X265_NS +{ + +// overload +inline void transpose4x4_rvv(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride) { + transpose4x4_rvv_u8(dst, src, dstride, sstride); +} + +inline void transpose4x4_rvv(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride) { + transpose4x4_rvv_u16(dst, src, dstride, sstride); +} + +inline void transpose8x8_rvv(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride) { + transpose8x8_rvv_u8(dst, src, dstride, sstride); +} + +inline void transpose8x8_rvv(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride) { + transpose8x8_rvv_u16(dst, src, dstride, sstride); +} + +inline void transpose16x16_rvv(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride) { + transpose16x16_rvv_u8(dst, src, dstride, sstride); +} + +void transpose32x32_rvv(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride); + +void transpose16x16_rvv(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride); +void transpose32x32_rvv(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride); +} + +#endif \ No newline at end of file
View file
x265-4.2.tar/source/common/riscv64/sad-a.S
Added
@@ -0,0 +1,940 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Jia Yuan <yuan.jia@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.align 4 + +.text + +.macro push + addi sp, sp, -48 + sd ra, 16(sp) // save return address + sd s0, 32(sp) // save the frame pointer + addi s0, sp, 48 // set the new frame pointer + + sd x9, 0(sp) +.endm + +.macro pop + ld x9, 0(sp) + + // restore the frame pointer and return address + ld s0, 32(sp) + ld ra, 16(sp) + addi sp, sp, 48 +.endm + +#if !HIGH_BIT_DEPTH +.macro SAD_FUNC_SMALL w, h, lmul1, lmul2 +function PFX(pixel_sad_\w\()x\h\()_rvv) + li t0, \h + li t1, \w + // prevent overflow when there is a bigger \h + vsetvli zero, t1, e32, m\()\lmul2, ta, ma + vmv.v.i v28, 0 + +1: + addi t0, t0, -4 + vsetvli zero, t1, e8, m\()\lmul1, ta, ma + + vle8.v v0, (a0) + vle8.v v1, (a2) + add a0, a0, a1 + add a2, a2, a3 + VABDU v8, v0, v1, v24 + + vle8.v v2, (a0) + vle8.v v3, (a2) + add a0, a0, a1 + add a2, a2, a3 + VABDU v9, v2, v3, v25 + + vle8.v v4, (a0) + vle8.v v5, (a2) + add a0, a0, a1 + add a2, a2, a3 + VABDU v10, v4, v5, v26 + + vle8.v v6, (a0) + vle8.v v7, (a2) + add a0, a0, a1 + add a2, a2, a3 + VABDU v11, v6, v7, v27 + + vsetvli zero, t1, e32, m\()\lmul2, ta, ma + + vzext.vf4 v0, v8 + vzext.vf4 v4, v9 + vadd.vv v28, v28, v0 + vadd.vv v28, v28, v4 + + vzext.vf4 v12, v10 + vzext.vf4 v16, v11 + vadd.vv v28, v28, v12 + vadd.vv v28, v28, v16 + + bnez t0, 1b + + vmv.v.i v20, 0 + vredsum.vs v20, v28, v20 + vmv.x.s a0, v20 + ret +endfunc +.endm + +SAD_FUNC_SMALL 4, 4, 1, 1 +SAD_FUNC_SMALL 4, 8, 1, 1 +SAD_FUNC_SMALL 4, 16, 1, 1 + +SAD_FUNC_SMALL 8, 4, 1, 2 +SAD_FUNC_SMALL 8, 8, 1, 2 +SAD_FUNC_SMALL 8, 16, 1, 2 +SAD_FUNC_SMALL 8, 32, 1, 2 + +SAD_FUNC_SMALL 12, 16, 1, 4 + +SAD_FUNC_SMALL 16, 4, 1, 4 +SAD_FUNC_SMALL 16, 8, 1, 4 +SAD_FUNC_SMALL 16, 12, 1, 4 +SAD_FUNC_SMALL 16, 16, 1, 4 +SAD_FUNC_SMALL 16, 32, 1, 4 +SAD_FUNC_SMALL 16, 64, 1, 4 + +.macro SAD_FUNC_MEDIUM w, h, lmul1, lmul2 +function PFX(pixel_sad_\w\()x\h\()_rvv) + li t0, \h + li t1, \w + + vsetvli zero, t1, e32, m\()\lmul2, ta, ma + vmv.v.i v24, 0 + +1: + addi t0, t0, -2 + vsetvli zero, t1, e8, m\()\lmul1, ta, ma + + vle8.v v0, (a0) + vle8.v v2, (a2) + add a0, a0, a1 + add a2, a2, a3 + VABDU v8, v0, v2, v12 + + vle8.v v4, (a0) + vle8.v v6, (a2) + add a0, a0, a1 + add a2, a2, a3 + VABDU v10, v4, v6, v14 + + vsetvli zero, t1, e32, m\()\lmul2, ta, ma + + vzext.vf4 v0, v8 + vadd.vv v24, v24, v0 + + vzext.vf4 v16, v10 + vadd.vv v24, v24, v16 + + bnez t0, 1b + + vmv.v.i v0, 0 + vredsum.vs v0, v24, 
v0 + vmv.x.s a0, v0 + ret +endfunc +.endm + +SAD_FUNC_MEDIUM 24, 32, 2, 8 + +SAD_FUNC_MEDIUM 32, 8, 2, 8 +SAD_FUNC_MEDIUM 32, 16, 2, 8 +SAD_FUNC_MEDIUM 32, 24, 2, 8 +SAD_FUNC_MEDIUM 32, 32, 2, 8 +SAD_FUNC_MEDIUM 32, 64, 2, 8 + +.macro SAD_FUNC_COMMON w, h, lmul1, lmul2 +function PFX(pixel_sad_\w\()x\h\()_rvv) + li t0, \h + li t1, \w + + vsetvli zero, t1, e32, m\()\lmul2, ta, ma + vmv.v.i v24, 0 + +1: // row loop + mv t3, a0 + mv t4, a2 + addi t0, t0, -1 + +2: // inline segment loop + vsetvli t5, t1, e8, m\()\lmul1, ta, ma + + vle8.v v0, (t3) + vle8.v v4, (t4) + add t3, t3, t5 + add t4, t4, t5 + VABDU v8, v0, v4, v12 + + vsetvli zero, t1, e32,m\()\lmul2, tu, ma + vzext.vf4 v16, v8 + vadd.vv v24, v24, v16 + + sub t1, t1, t5
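Editor's note: the SAD dataflow above is load a row from each plane, take unsigned absolute differences (the VABDU helper), zero-extend with vzext.vf4, and accumulate in 32-bit lanes. A hedged intrinsics sketch of the same flow follows; unlike the shipped kernels, which keep running sums in vector accumulators and reduce only once at the end, this simplified version reduces per chunk:

#include <riscv_vector.h>
#include <cstdint>

int sad_sketch(const uint8_t* p1, intptr_t s1, const uint8_t* p2, intptr_t s2, int w, int h)
{
    uint32_t total = 0;
    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w;)
        {
            size_t vl = __riscv_vsetvl_e8m1((size_t)(w - x));
            vuint8m1_t a = __riscv_vle8_v_u8m1(p1 + x, vl);
            vuint8m1_t b = __riscv_vle8_v_u8m1(p2 + x, vl);
            // |a - b| as max(a,b) - min(a,b), the trick behind VABDU
            vuint8m1_t d = __riscv_vsub_vv_u8m1(__riscv_vmaxu_vv_u8m1(a, b, vl),
                                                __riscv_vminu_vv_u8m1(a, b, vl), vl);
            // widen to 32-bit (vzext.vf4, as in the asm) and reduce this chunk
            vuint32m4_t wd = __riscv_vzext_vf4_u32m4(d, vl);
            vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, 1);
            vuint32m1_t red = __riscv_vredsum_vs_u32m4_u32m1(wd, zero, vl);
            total += __riscv_vmv_x_s_u32m1_u32(red);
            x += (int)vl;
        }
        p1 += s1;
        p2 += s2;
    }
    return (int)total;
}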
View file
x265-4.2.tar/source/common/riscv64/sao-prim.cpp
Added
@@ -0,0 +1,229 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <riscv_vector.h>
+#include <stdint.h>
+#include "sao.h"
+#include "primitives.h"
+
+static inline void compute_eo_stats(const vint8m1_t edge_type, const int16_t *diff,
+                                    int32_t *stats, int32_t *count, int vl)
+{
+    (void)__riscv_vsetvl_e16m2(vl);
+    vint16m2_t tmp_0 = __riscv_vmv_v_x_i16m2(0, vl);
+    (void)__riscv_vsetvl_e32m1(vl);
+    vint32m1_t tmp_01 = __riscv_vmv_v_x_i32m1(0, vl);
+
+    // Create a mask for each edge type.
+    (void)__riscv_vsetvl_e8m1(vl);
+    vbool8_t mask0 = __riscv_vmseq_vx_i8m1_b8(edge_type, -2, vl);
+    vbool8_t mask1 = __riscv_vmseq_vx_i8m1_b8(edge_type, -1, vl);
+    vbool8_t mask2 = __riscv_vmseq_vx_i8m1_b8(edge_type, 0, vl);
+    vbool8_t mask3 = __riscv_vmseq_vx_i8m1_b8(edge_type, 1, vl);
+    vbool8_t mask4 = __riscv_vmseq_vx_i8m1_b8(edge_type, 2, vl);
+
+    count[1] += __riscv_vcpop_m_b8(mask0, vl);
+    count[2] += __riscv_vcpop_m_b8(mask1, vl);
+    count[0] += __riscv_vcpop_m_b8(mask2, vl);
+    count[3] += __riscv_vcpop_m_b8(mask3, vl);
+    count[4] += __riscv_vcpop_m_b8(mask4, vl);
+
+    // Widen the masks to 16-bit.
+    (void)__riscv_vsetvl_e16m2(vl);
+    vint16m2_t load_diff = __riscv_vle16_v_i16m2(diff, vl);
+    vint16m2_t temp_add0 = __riscv_vmerge_vvm_i16m2(tmp_0, load_diff, mask0, vl);
+    vint16m2_t temp_add1 = __riscv_vmerge_vvm_i16m2(tmp_0, load_diff, mask1, vl);
+    vint16m2_t temp_add2 = __riscv_vmerge_vvm_i16m2(tmp_0, load_diff, mask2, vl);
+    vint16m2_t temp_add3 = __riscv_vmerge_vvm_i16m2(tmp_0, load_diff, mask3, vl);
+    vint16m2_t temp_add4 = __riscv_vmerge_vvm_i16m2(tmp_0, load_diff, mask4, vl);
+
+    vint32m1_t temp_stats0 = __riscv_vwredsum_vs_i16m2_i32m1(temp_add0, tmp_01, vl);
+    vint32m1_t temp_stats1 = __riscv_vwredsum_vs_i16m2_i32m1(temp_add1, tmp_01, vl);
+    vint32m1_t temp_stats2 = __riscv_vwredsum_vs_i16m2_i32m1(temp_add2, tmp_01, vl);
+    vint32m1_t temp_stats3 = __riscv_vwredsum_vs_i16m2_i32m1(temp_add3, tmp_01, vl);
+    vint32m1_t temp_stats4 = __riscv_vwredsum_vs_i16m2_i32m1(temp_add4, tmp_01, vl);
+
+    stats[1] += __riscv_vmv_x_s_i32m1_i32(temp_stats0);
+    stats[2] += __riscv_vmv_x_s_i32m1_i32(temp_stats1);
+    stats[0] += __riscv_vmv_x_s_i32m1_i32(temp_stats2);
+    stats[3] += __riscv_vmv_x_s_i32m1_i32(temp_stats3);
+    stats[4] += __riscv_vmv_x_s_i32m1_i32(temp_stats4);
+}
+
+static inline vint8m1_t signOf_rvv(const pixel *a, const pixel *b, int vl)
+{
+#if HIGH_BIT_DEPTH
+    vl = __riscv_vsetvl_e16m2(vl);
+    vuint16m2_t s0 = __riscv_vle16_v_u16m2(a, vl);
+    vuint16m2_t s1 = __riscv_vle16_v_u16m2(b, vl);
+
+    vbool8_t bgt = __riscv_vmsgtu_vv_u16m2_b8(s0, s1, vl);
+    vbool8_t blt = __riscv_vmsltu_vv_u16m2_b8(s0, s1, vl);
+
+    // a > b : 1    a == b : 0    a < b : -1
+    vl = __riscv_vsetvl_e8m1(vl);
+    vint8m1_t ret = __riscv_vmv_v_x_i8m1(0, vl);
+    ret = __riscv_vmerge_vxm_i8m1(ret, 1, bgt, vl);
+    ret = __riscv_vmerge_vxm_i8m1(ret, -1, blt, vl);
+#else // HIGH_BIT_DEPTH
+    vl = __riscv_vsetvl_e8m1(vl);
+    vint8m1_t ret = __riscv_vmv_v_x_i8m1(0, vl);
+
+    vuint8m1_t s0 = __riscv_vle8_v_u8m1(a, vl);
+    vuint8m1_t s1 = __riscv_vle8_v_u8m1(b, vl);
+
+    // a > b : 1    a == b : 0    a < b : -1
+    vbool8_t bgt = __riscv_vmsgtu_vv_u8m1_b8(s0, s1, vl);
+    ret = __riscv_vmerge_vxm_i8m1(ret, 1, bgt, vl);
+    vbool8_t blt = __riscv_vmsltu_vv_u8m1_b8(s0, s1, vl);
+    ret = __riscv_vmerge_vxm_i8m1(ret, -1, blt, vl);
+#endif // HIGH_BIT_DEPTH
+
+    return ret;
+}
+
+namespace X265_NS {
+void saoCuStatsBO_rvv(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+{
+#if HIGH_BIT_DEPTH
+    const int n_elem = 4;
+    const int elem_width = 16;
+#else
+    const int n_elem = 8;
+    const int elem_width = 8;
+#endif
+
+    // Additional temporary buffer for accumulation.
+    int32_t stats_tmp[32] = { 0 };
+    int32_t count_tmp[32] = { 0 };
+
+    // Byte-addressable pointers to buffers, to optimise address calculation.
+    uint8_t *stats_b[2] = {
+        reinterpret_cast<uint8_t *>(stats),
+        reinterpret_cast<uint8_t *>(stats_tmp),
+    };
+    uint8_t *count_b[2] = {
+        reinterpret_cast<uint8_t *>(count),
+        reinterpret_cast<uint8_t *>(count_tmp),
+    };
+
+    // Combine shift for index calculation with shift for address calculation.
+    const int right_shift = X265_DEPTH - X265_NS::SAO::SAO_BO_BITS;
+    const int left_shift = 2;
+    const int shift = right_shift - left_shift;
+    // Mask out bits 7, 1 & 0 to account for combination of shifts.
+    const int mask = 0x7c;
+
+    // Compute statistics into temporary buffers.
+    for (int y = 0; y < endY; y++)
+    {
+        int x = 0;
+        for (; x + n_elem < endX; x += n_elem)
+        {
+            uint64_t class_idx_64 =
+                *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
+
+            for (int i = 0; i < n_elem; ++i)
+            {
+                const int idx = i & 1;
+                const int off = (class_idx_64 >> (i * elem_width)) & mask;
+                *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
+                *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
+            }
+        }
+
+        if (x < endX)
+        {
+            uint64_t class_idx_64 =
+                *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
+
+            for (int i = 0; (i + x) < endX; ++i)
+            {
+                const int idx = i & 1;
+                const int off = (class_idx_64 >> (i * elem_width)) & mask;
+                *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
+                *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
+            }
+        }
+
+        diff += MAX_CU_SIZE;
+        rec += stride;
+    }
+
+    // Reduce temporary buffers to destination using RVV.
+    for (int i = 0; i < 32;)
+    {
+        int vl = __riscv_vsetvl_e32m2(32 - i);
+        vint32m2_t s0 = __riscv_vle32_v_i32m2(stats_tmp + i, vl);
+        vint32m2_t s1 = __riscv_vle32_v_i32m2(stats + i, vl);
+        vint32m2_t ss = __riscv_vadd_vv_i32m2(s0, s1, vl);
+        __riscv_vse32_v_i32m2(stats + i, ss, vl);
+
+        vint32m2_t c0 = __riscv_vle32_v_i32m2(count_tmp + i, vl);
+        vint32m2_t c1 = __riscv_vle32_v_i32m2(count + i, vl);
+        vint32m2_t cs = __riscv_vadd_vv_i32m2(c0, c1, vl);
+        __riscv_vse32_v_i32m2(count + i, cs, vl);
+
+        i += vl;
+    }
+}
+
+void saoCuStatsE0_rvv(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+{
+    for (int y = 0; y < endY; y++)
+    {
+        // Calculate negated sign_left(x) directly, to save negation when
+        // reusing sign_right(x) as sign_left(x + 1).
+        int vl = __riscv_vsetvl_e8m1(endX);
+        vint8m1_t neg_sign_left = __riscv_vmv_v_x_i8m1(x265_signOf(rec[-1] - rec[0]), vl);
+        for (int x = 0; x < endX; x += vl)
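Editor's note: the E0 path classifies each pixel by the signs of its differences with the left and right neighbours, then accumulates diff and a counter into one of five bins. The scalar logic being vectorised looks roughly like this (a sketch following x265's C reference in sao.cpp; the 8-bit pixel type and MAX_CU_SIZE = 64 match the low-bit-depth build):

#include <cstdint>

static inline int signOf(int a) { return (a > 0) - (a < 0); }

void saoCuStatsE0_sketch(const int16_t* diff, const uint8_t* rec, intptr_t stride,
                         int endX, int endY, int32_t stats[5], int32_t count[5])
{
    static const int s_eoTable[5] = { 1, 2, 0, 3, 4 };
    for (int y = 0; y < endY; y++)
    {
        int signLeft = signOf(rec[-1] - rec[0]);   // caller guarantees rec[-1] is valid
        for (int x = 0; x < endX; x++)
        {
            int signRight = signOf(rec[x] - rec[x + 1]);
            int edgeType = signRight + signLeft + 2;   // 0..4
            signLeft = -signRight;                     // reused as sign_left(x + 1)
            stats[s_eoTable[edgeType]] += diff[x];
            count[s_eoTable[edgeType]]++;
        }
        diff += 64;    // MAX_CU_SIZE in x265
        rec += stride;
    }
}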
View file
x265-4.2.tar/source/common/riscv64/sao.S
Added
@@ -0,0 +1,884 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Changsheng Wu <wu.changsheng@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 +.text + +//void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) +function PFX(saoCuStatsE0_rvv) + addi sp, sp, -64 + sd s0, (sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + li s0, 0 + li s1, 0 + li s2, 0 + li s3, 0 + li s4, 0 + li s6, 1 << 7 +#if HIGH_BIT_DEPTH + slli a2, a2, 1 + vsetvli t1, zero, e32, m2, ta, ma +#else + vsetvli t1, zero, e32, m4, ta, ma +#endif + vmv.v.i v4, 0 + vmv.v.i v8, 0 + vmv.v.i v12, 0 + vmv.v.i v16, 0 + vmv.v.i v20, 0 +#if HIGH_BIT_DEPTH + vsetvli t1, zero, e16, m1, ta, ma +#else + vsetvli t1, zero, e16, m2, ta, ma +#endif + vmv.v.i v30, 0 + +loop_saoCuStatsE0_y: + mv t0, a0 + mv t1, a1 + mv t2, a3 +#if HIGH_BIT_DEPTH + lhu t3, -2(a1) + lhu t4, (a1) +#else + lbu t3, -1(a1) + lbu t4, (a1) +#endif + sub t3, t3, t4 + SIGNOF t3, t4, t5 +#if HIGH_BIT_DEPTH + vsetvli zero, a3, e16, m1, ta, ma +#else + vsetvli zero, a3, e8, m1, ta, ma +#endif + vmv.s.x v1, t3 +loop_saoCuStatsE0_x: +#if HIGH_BIT_DEPTH + addi t5, t1, 2 + vsetvli t3, t2, e16, m1, ta, ma + vle16.v v2, (t1) + vle16.v v3, (t5) +#else + addi t5, t1, 1 + vsetvli t3, t2, e8, m1, ta, ma + vle8.v v2, (t1) + vle8.v v3, (t5) +#endif + addi t4, t3, -1 + SIGNOF_RVV v24, v2, v3 + vslideup.vi v1, v24, 1 + vsub.vv v25, v24, v1 + vslidedown.vx v1, v24, t4 + + vmseq.vi v0, v25, -2 + vmseq.vi v2, v25, -1 + vmseq.vi v3, v25, 0 + vmseq.vi v24, v25, 1 + vmseq.vi v25, v25, 2 + + vcpop.m t4, v0 + vcpop.m t5, v2 + vcpop.m t6, v3 + vcpop.m a7, v24 + vcpop.m s5, v25 + add s0, s0, t4 + add s1, s1, t5 + add s2, s2, t6 + add s3, s3, a7 + add s4, s4, s5 + +#if HIGH_BIT_DEPTH + vle16.v v26, (t0) + vmerge.vvm v6, v30, v26, v0 + vmv1r.v v0, v2 + vmerge.vvm v10, v30, v26, v0 + vmv1r.v v0, v3 + vmerge.vvm v14, v30, v26, v0 + vmv1r.v v0, v24 + vmerge.vvm v18, v30, v26, v0 + vmv1r.v v0, v25 + vmerge.vvm v22, v30, v26, v0 + vsetvli zero, t3, e16, m1, tu, ma + vwadd.wv v4, v4, v6 + vwadd.wv v8, v8, v10 + vwadd.wv v12, v12, v14 + vwadd.wv v16, v16, v18 + vwadd.wv v20, v20, v22 +#else + vsetvli zero, t3, e16, m2, ta, ma + vle16.v v26, (t0) + vmerge.vvm v28, v30, v26, v0 + vsetvli zero, t3, e16, m2, tu, ma + vwadd.wv v4, v4, v28 + vsetvli zero, t3, e16, m2, ta, ma + vmv1r.v v0, 
v2 + vmerge.vvm v28, v30, v26, v0 + vsetvli zero, t3, e16, m2, tu, ma + vwadd.wv v8, v8, v28 + vsetvli zero, t3, e16, m2, ta, ma + vmv1r.v v0, v3 + vmerge.vvm v28, v30, v26, v0 + vsetvli zero, t3, e16, m2, tu, ma + vwadd.wv v12, v12, v28 + vsetvli zero, t3, e16, m2, ta, ma + vmv1r.v v0, v24 + vmerge.vvm v28, v30, v26, v0 + vmv1r.v v0, v25 + vmerge.vvm v2, v30, v26, v0 + vsetvli zero, t3, e16, m2, tu, ma + vwadd.wv v16, v16, v28 + vwadd.wv v20, v20, v2 +#endif + + slli t4, t3, 1 +#if HIGH_BIT_DEPTH + add t1, t1, t4 +#else + add t1, t1, t3 +#endif + sub t2, t2, t3 + add t0, t0, t4 + bgtz t2, loop_saoCuStatsE0_x + add a0, a0, s6 + add a1, a1, a2 + addi a4, a4, -1 + bgtz a4, loop_saoCuStatsE0_y + + lw t0, (a6) + lw t1, 4(a6) + lw t2, 8(a6) + lw t3, 12(a6) + lw t4, 16(a6) + add t1, t1, s0 + add t2, t2, s1 + add t0, t0, s2 + add t3, t3, s3 + add t4, t4, s4 + sw t0, (a6) + sw t1, 4(a6) + sw t2, 8(a6) + sw t3, 12(a6) + sw t4, 16(a6) + +#if HIGH_BIT_DEPTH + vsetvli zero, a3, e32, m2, ta, ma +#else + vsetvli zero, a3, e32, m4, ta, ma
View file
x265-4.2.tar/source/common/riscv64/ssd-a.S
Added
@@ -0,0 +1,413 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Yujiao He <he.yujiao@sanechips.com.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text + +/* sse_ss, 1 row */ +.macro SSE_SS_x1 s0,s1 + vle16.v v8, (\s0) + vle16.v v12, (\s1) + vsub.vv v16, v8, v12 + vwmacc.vv v0, v16, v16 +.endm + +/* sse_ss_func, width <= 16, fully unrolled */ +.macro SSE_SS_FUNC_SMALL w,h,lmul1,lmul2 +function PFX(pixel_sse_ss_\w\()x\h\()_rvv) + li t0, \w + slli a1, a1, 1 + slli a3, a3, 1 + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v0, 0 + + vsetvli zero, t0, e16, \lmul2, ta, ma + SSE_SS_x1 a0, a2 +.rept \h - 1 + add a0, a0, a1 + add a2, a2, a3 + SSE_SS_x1 a0, a2 +.endr + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v8, 0 +#if BIT_DEPTH == 12 + vwredsum.vs v8, v0, v8 + vsetvli zero, t0, e64, \lmul1, ta, ma +#else + vredsum.vs v8, v0, v8 +#endif + vmv.x.s a0, v8 + ret +endfunc +.endm + +SSE_SS_FUNC_SMALL 4, 4, m1, m1 +SSE_SS_FUNC_SMALL 8, 8, m2, m1 +SSE_SS_FUNC_SMALL 16, 16, m4, m2 + +/* ssd_s_func, width <= 16, fully unrolled */ +.macro SSD_S_FUNC_SMALL w,h,lmul1,lmul2 +function PFX(pixel_ssd_s_\w\()x\h\()_rvv) + li t0, \w + slli a1, a1, 1 + + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v0, 0 + + vsetvli zero, t0, e16, \lmul2, ta, ma + vle16.v v8, (a0) + vwmacc.vv v0, v8, v8 +.rept \h - 1 + add a0, a0, a1 + vle16.v v8, (a0) + vwmacc.vv v0, v8, v8 +.endr + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v8, 0 + vredsum.vs v8, v0, v8 + vmv.x.s a0, v8 + ret +endfunc +.endm + +SSD_S_FUNC_SMALL 4, 4, m1, m1 +SSD_S_FUNC_SMALL 8, 8, m2, m1 +SSD_S_FUNC_SMALL 16, 16, m4, m2 + +# if !HIGH_BIT_DEPTH + +/* sse_pp, 1 row */ +.macro SSE_PP_x1 s0,s1,lmul1,lmul2 + vsetvli zero, t0, e8, \lmul2, ta, ma + vle8.v v8, (\s0) + vle8.v v10, (\s1) + VABDU v16, v8, v10, v12 + + vsetvli zero, t0, e32, \lmul1, ta, ma + vzext.vf4 v24, v16 + vmacc.vv v0, v24, v24 +.endm + +/* sse_pp_func */ +.macro SSE_PP_FUNC w,h,lmul1,lmul2,len +function PFX(pixel_sse_pp_\w\()x\h\()_rvv) + li t0, \len + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v0, 0 + +.if \w <=16 /* width <=16, fully unrolled */ + SSE_PP_x1 a0, a2,\lmul1,\lmul2 +.rept \h - 1 + add a0, a0, a1 + add a2, a2, a3 + SSE_PP_x1 a0, a2, \lmul1, \lmul2 +.endr + +.else /* width >16, loop by row */ + li t1, \h +1: + SSE_PP_x1 a0, a2, \lmul1, \lmul2 + +.if \w ==64 + addi t2, a0, 32 + addi t3, a2, 32 + SSE_PP_x1 t2, t3, \lmul1, \lmul2 +.endif + addi t1, t1, -1 + add a0, a0, a1 + add 
a2, a2, a3 + bnez t1, 1b +.endif + vmv.v.i v8, 0 + vredsum.vs v8, v0, v8 + vmv.x.s a0, v8 + ret +endfunc +.endm + +SSE_PP_FUNC 4, 4, m1, m1, 4 +SSE_PP_FUNC 4, 8, m1, m1, 4 +SSE_PP_FUNC 8, 8, m2, m1, 8 +SSE_PP_FUNC 8, 16, m2, m1, 8 +SSE_PP_FUNC 16, 16 ,m4, m1, 16 +SSE_PP_FUNC 16, 32, m4, m1, 16 +SSE_PP_FUNC 32, 32, m8, m2, 32 +SSE_PP_FUNC 32, 64, m8, m2, 32 +SSE_PP_FUNC 64, 64, m8, m2, 32 + +/* sse_ss, width >16, loop by row */ +.macro SSE_SS_FUNC_BIG w,h,lmul1,lmul2 +function PFX(pixel_sse_ss_\w\()x\h\()_rvv) + li t0, 32 + slli a1, a1, 1 + slli a3, a3, 1 + li t1, \h + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v0, 0 + + vsetvli zero, t0, e16, \lmul2, ta, ma +1: + SSE_SS_x1 a0, a2 +.if \w ==64 + addi t2, a0, 64 + addi t3, a2, 64 + SSE_SS_x1 t2, t3 +.endif + addi t1, t1, -1 + add a0, a0, a1 + add a2, a2, a3 + bnez t1, 1b + + vsetvli zero, t0, e32, \lmul1, ta, ma + vmv.v.i v8, 0 + vredsum.vs v8, v0, v8 + vmv.x.s a0, v8 + ret +endfunc +.endm + +SSE_SS_FUNC_BIG 32, 32, m8, m4 +SSE_SS_FUNC_BIG 64, 64, m8, m4
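Editor's note: the sse_ss kernels subtract 16-bit residual rows and square-accumulate with vwmacc into 32-bit lanes; at BIT_DEPTH 12 the final reduction widens to 64 bits because the 32-bit lanes can overflow. An illustrative intrinsics sketch of the same dataflow (strides here are in elements, while the assembly scales byte strides with slli; residuals are assumed to fit int16 after subtraction, as in the encoder):

#include <riscv_vector.h>
#include <cstdint>

uint64_t sse_ss_sketch(const int16_t* p1, intptr_t s1, const int16_t* p2, intptr_t s2, int w, int h)
{
    uint64_t total = 0;
    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w;)
        {
            size_t vl = __riscv_vsetvl_e16m1((size_t)(w - x));
            vint16m1_t a = __riscv_vle16_v_i16m1(p1 + x, vl);
            vint16m1_t b = __riscv_vle16_v_i16m1(p2 + x, vl);
            vint16m1_t d = __riscv_vsub_vv_i16m1(a, b, vl);
            vint32m2_t sq = __riscv_vwmul_vv_i32m2(d, d, vl);   // d*d, widened
            vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, 1);
            vint64m1_t red = __riscv_vwredsum_vs_i32m2_i64m1(sq, zero, vl);
            total += (uint64_t)__riscv_vmv_x_s_i64m1_i64(red);
            x += (int)vl;
        }
        p1 += s1;
        p2 += s2;
    }
    return total;
}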
View file
x265-4.1.tar/source/common/scaler.cpp -> x265-4.2.tar/source/common/scaler.cpp
Changed
@@ -511,10 +511,7 @@ int dstHCrPos; int srcVCrPos; int dstVCrPos; - int dst_stride = SCALER_ALIGN(dstW * sizeof(int16_t) + 66, 16); m_bitDepth = dstVideoDesc->m_inputDepth; - if (m_bitDepth == 16) - dst_stride <<= 1; m_algorithmFlags = algorithmFlags; lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
View file
x265-4.1.tar/source/common/slice.h -> x265-4.2.tar/source/common/slice.h
Changed
@@ -26,6 +26,7 @@
 #define X265_SLICE_H
 
 #include "common.h"
+#include "mv.h"
 
 namespace X265_NS {
 // private namespace
@@ -35,6 +36,8 @@
 class PicYuv;
 class MotionReference;
 
+struct MEData;
+
 enum SliceType
 {
     B_SLICE,
@@ -104,6 +107,10 @@
     LEVEL6 = 180,
     LEVEL6_1 = 183,
     LEVEL6_2 = 186,
+    LEVEL6_3 = 189,
+    LEVEL7 = 210,
+    LEVEL7_1 = 213,
+    LEVEL7_2 = 216,
     LEVEL8_5 = 255,
 };
 }
@@ -113,7 +120,7 @@
     int profileIdc[MAX_LAYERS];
     int levelIdc;
     uint32_t minCrForLevel;
-    uint32_t maxLumaSrForLevel;
+    uint64_t maxLumaSrForLevel;
     uint32_t bitDepthConstraint;
     int chromaFormatConstraint;
     bool tierFlag;
@@ -374,6 +381,7 @@
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
     RPS m_rps;
+    MEData* m_ctuMV;
 
     NalUnitType m_nalUnitType;
     SliceType m_sliceType;
@@ -415,6 +423,7 @@
         m_lastIDR = 0;
         m_sLFaseFlag = true;
         m_numRefIdx[0] = m_numRefIdx[1] = 0;
+        m_ctuMV = NULL;
         memset(m_refFrameList, 0, sizeof(m_refFrameList));
         memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
         memset(m_refPOCList, 0, sizeof(m_refPOCList));
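Editor's note: widening maxLumaSrForLevel to uint64_t is forced by the new levels. Level 6.2's MaxLumaSr of 4,278,190,080 samples/s (ITU-T H.265 Table A.8) already sits just under UINT32_MAX, so every level above it needs 64 bits. A two-line compile-time check makes the point; figures for Levels 6.3 to 7.2 are left to the spec:

#include <cstdint>

static_assert(4278190080ULL <= UINT32_MAX, "Level 6.2 still fits in 32 bits");
static_assert(2 * 4278190080ULL > UINT32_MAX, "anything much larger must be 64-bit");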
View file
x265-4.1.tar/source/common/threading.cpp -> x265-4.2.tar/source/common/threading.cpp
Changed
@@ -82,6 +82,15 @@ pthread_mutex_unlock(&g_mutex); return ret; } + +int64_t no_atomic_add64(int64_t* ptr, int64_t val) +{ + pthread_mutex_lock(&g_mutex); + *ptr += val; + int64_t ret = *ptr; + pthread_mutex_unlock(&g_mutex); + return ret; +} #endif /* C shim for forced stack alignment */ @@ -146,11 +155,13 @@ void Thread::stop() { - if (thread) + if (thread){ pthread_join(thread, NULL); + thread = 0; + } } -Thread::~Thread() {} +Thread::~Thread() { stop(); } #endif // if _WIN32
View file
x265-4.1.tar/source/common/threading.h -> x265-4.2.tar/source/common/threading.h
Changed
@@ -56,15 +56,20 @@ int no_atomic_inc(int* ptr); int no_atomic_dec(int* ptr); int no_atomic_add(int* ptr, int val); +int64_t no_atomic_add64(int64_t* ptr, int64_t val); } -#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 -#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x) +#define BSR(id, x) (id) = ((unsigned long)__builtin_clz(x) ^ 31) +#define BSF(id, x) (id) = ((unsigned long)__builtin_ctz(x)) +#define BSR64(id, x) (id) = ((unsigned long)__builtin_clzll(x) ^ 63) +#define BSF64(id, x) (id) = ((unsigned long)__builtin_ctzll(x)) #define ATOMIC_OR(ptr, mask) no_atomic_or((int*)ptr, mask) #define ATOMIC_AND(ptr, mask) no_atomic_and((int*)ptr, mask) #define ATOMIC_INC(ptr) no_atomic_inc((int*)ptr) #define ATOMIC_DEC(ptr) no_atomic_dec((int*)ptr) -#define ATOMIC_ADD(ptr, val) no_atomic_add((int*)ptr, val) +#define ATOMIC_ADD(ptr, val) (sizeof(*(ptr)) == 8 ? \ + no_atomic_add64((int64_t*)ptr, (int64_t)(val)) : \ + no_atomic_add((int*)ptr, (int)(val))) #define GIVE_UP_TIME() usleep(0) #elif __GNUC__ /* GCCs builtin atomics */ @@ -72,24 +77,30 @@ #include <sys/time.h> #include <unistd.h> -#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 -#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x) +#define BSR(id, x) (id) = ((unsigned long)__builtin_clz(x) ^ 31) +#define BSF(id, x) (id) = ((unsigned long)__builtin_ctz(x)) +#define BSR64(id, x) (id) = ((unsigned long)__builtin_clzll(x) ^ 63) +#define BSF64(id, x) (id) = ((unsigned long)__builtin_ctzll(x)) #define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask) #define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask) #define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1) #define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1) -#define ATOMIC_ADD(ptr, val) __sync_fetch_and_add((volatile int32_t*)ptr, val) +#define ATOMIC_ADD(ptr, val) __sync_fetch_and_add((volatile __typeof__(*(ptr))*)ptr, (__typeof__(*(ptr) + 0))(val)) #define GIVE_UP_TIME() usleep(0) #elif defined(_MSC_VER) /* Windows atomic intrinsics */ #include <intrin.h> -#define CLZ(id, x) _BitScanReverse(&id, x) -#define CTZ(id, x) _BitScanForward(&id, x) +#define BSR(id, x) _BitScanReverse(&id, x) +#define BSF(id, x) _BitScanForward(&id, x) +#define BSR64(id, x) _BitScanReverse64(&id, x) +#define BSF64(id, x) _BitScanForward64(&id, x) #define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr) #define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr) -#define ATOMIC_ADD(ptr, val) InterlockedExchangeAdd((volatile LONG*)ptr, val) +#define ATOMIC_ADD(ptr, val) (sizeof(*(ptr)) == 8 ? \ + InterlockedExchangeAdd64((volatile LONGLONG*)ptr, (LONGLONG)(val)) : \ + InterlockedExchangeAdd((volatile LONG*)ptr, (LONG)(val))) #define ATOMIC_OR(ptr, mask) _InterlockedOr((volatile LONG*)ptr, (LONG)mask) #define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask) #define GIVE_UP_TIME() Sleep(0)
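Editor's note: the reworked ATOMIC_ADD dispatches on sizeof(*(ptr)) so that 64-bit counters reach a 64-bit primitive instead of being truncated to 32 bits. A standalone GCC/Clang demo of the builtin branch (the macro body is copied from the diff above; main is a hypothetical harness, not x265 code):

#include <cstdint>

#define ATOMIC_ADD(ptr, val) __sync_fetch_and_add((volatile __typeof__(*(ptr))*)ptr, (__typeof__(*(ptr) + 0))(val))

int main()
{
    volatile int32_t frames = 0;
    volatile int64_t bits = 0;
    ATOMIC_ADD(&frames, 1);            // compiles to a 32-bit atomic add
    ATOMIC_ADD(&bits, 123456789LL);    // 64-bit atomic add, no truncation
    return (int)(frames + bits != 123456790);
}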
View file
x265-4.1.tar/source/common/threadpool.cpp -> x265-4.2.tar/source/common/threadpool.cpp
Changed
@@ -27,22 +27,32 @@
 #include "threading.h"
 
 #include <new>
+#include <vector>
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
 #include <winnt.h>
 #endif
 
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#elif !defined(_WIN32)
+#include <fstream>
+#include <string>
+#include <cstdio>
+#include <cstdlib>
+#endif
+
 #if X86_64
 
 #ifdef __GNUC__
 
-#define SLEEPBITMAP_CTZ(id, x) id = (unsigned long)__builtin_ctzll(x)
+#define SLEEPBITMAP_BSF(id, x) (id) = ((unsigned long)__builtin_ctzll(x))
 #define SLEEPBITMAP_OR(ptr, mask) __sync_fetch_and_or(ptr, mask)
 #define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
 
 #elif defined(_MSC_VER)
 
-#define SLEEPBITMAP_CTZ(id, x) _BitScanForward64(&id, x)
+#define SLEEPBITMAP_BSF(id, x) _BitScanForward64(&id, x)
 #define SLEEPBITMAP_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)
 #define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)
 
@@ -51,7 +61,7 @@
 #else
 
 /* use 32-bit primitives defined in threading.h */
-#define SLEEPBITMAP_CTZ CTZ
+#define SLEEPBITMAP_BSF BSF
 #define SLEEPBITMAP_OR ATOMIC_OR
 #define SLEEPBITMAP_AND ATOMIC_AND
 
@@ -206,7 +216,7 @@
     sleepbitmap_t masked = m_sleepBitmap & firstTryBitmap;
     while (masked)
     {
-        SLEEPBITMAP_CTZ(id, masked);
+        SLEEPBITMAP_BSF(id, masked);
 
         sleepbitmap_t bit = (sleepbitmap_t)1 << id;
         if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
@@ -218,7 +228,7 @@
     masked = m_sleepBitmap & secondTryBitmap;
     while (masked)
     {
-        SLEEPBITMAP_CTZ(id, masked);
+        SLEEPBITMAP_BSF(id, masked);
 
         sleepbitmap_t bit = (sleepbitmap_t)1 << id;
         if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
@@ -247,6 +257,135 @@
     return bondCount;
 }
 
+
+/* Distributes totalNumThreads between ThreadedME and FrameEncoder pools.
+ * Modifies threadsPerPool, nodeMaskPerPool, numNumaNodes, and numPools in-place.
+ * The thread count reserved for frame encoding is written to threadsFrameEnc. */
+static void distributeThreadsForTme(
+    x265_param* p,
+    int totalNumThreads,
+    int& numNumaNodes,
+    bool bNumaSupport,
+    int* threadsPerPool,
+    uint64_t* nodeMaskPerPool,
+    int& numPools,
+    int& threadsFrameEnc)
+{
+    if (totalNumThreads < MIN_TME_THREADS)
+    {
+        x265_log(p, X265_LOG_WARNING, "Low thread count detected, disabling --threaded-me."
+            " Minimum recommended is 32 cores / threads\n");
+        p->bThreadedME = 0;
+        return;
+    }
+
+    int targetTME = ThreadPool::configureTmeThreadCount(p, totalNumThreads);
+    targetTME = (targetTME < 1) ? 1 : targetTME;
+
+    threadsFrameEnc = totalNumThreads - targetTME;
+    int defaultNumFT = ThreadPool::getFrameThreadsCount(p, totalNumThreads);
+    if (threadsFrameEnc < defaultNumFT)
+    {
+        threadsFrameEnc = defaultNumFT;
+        targetTME = totalNumThreads - threadsFrameEnc;
+    }
+
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
+    if (bNumaSupport && numNumaNodes > 1)
+    {
+        int tmeNumaNodes = 0;
+        int leftover = 0;
+
+        // First thread pool belongs to ThreadedME
+        std::vector<int> threads(1, 0);
+        std::vector<uint64_t> nodeMasks(1, 0);
+        int poolIndex = 0;
+
+        /* Greedily assign whole NUMA nodes to TME until reaching or exceeding the target */
+        for (int i = 0; i < numNumaNodes + 1; i++)
+        {
+            if (!threadsPerPool[i] && !nodeMaskPerPool[i])
+                continue;
+
+            int toTake = X265_MIN(threadsPerPool[i], targetTME - threads[0]);
+            if (toTake > 0)
+            {
+                threads[poolIndex] += toTake;
+                nodeMasks[poolIndex] |= nodeMaskPerPool[i];
+                tmeNumaNodes++;
+
+                if (threads[0] == targetTME)
+                    poolIndex++;
+
+                if (toTake < threadsPerPool[i])
+                    leftover = threadsPerPool[i] - toTake;
+            }
+            else
+            {
+                threads.push_back(threadsPerPool[i]);
+                nodeMasks.push_back(nodeMaskPerPool[i]);
+                poolIndex++;
+            }
+        }
+
+        // Distribute leftover threads among FrameEncoders
+        if (leftover)
+        {
+            // Case 1: There are 1 or more threadpools for FrameEncoder(s) by now
+            if (threads.size() > 1)
+            {
+                int split = static_cast<int>(static_cast<double>(leftover) / (numNumaNodes - 1));
+                for (int pool = 1; pool < numNumaNodes; pool++)
+                {
+                    int give = X265_MIN(split, leftover);
+                    threads[pool] += give;
+                    leftover -= give;
+                }
+            }
+
+            // Case 2: FrameEncoder(s) haven't received threads yet
+            if (threads.size() == 1)
+            {
+                threads.push_back(leftover);
+                // Give the same node mask as the last node of ThreadedME
+                uint64_t msb = 1;
+                uint64_t tmeNodeMask = nodeMasks[0];
+                while (tmeNodeMask > 1)
+                {
+                    tmeNodeMask >>= 1;
+                    msb <<= 1;
+                }
+                nodeMasks.push_back(msb);
+            }
+        }
+
+        // Apply calculated threadpool assignment
+        memset(threadsPerPool, 0, sizeof(int) * (numNumaNodes + 2));
+        memset(nodeMaskPerPool, 0, sizeof(uint64_t) * (numNumaNodes + 2));
+
+        numPools = numNumaNodes = static_cast<int>(threads.size());
+        for (int pool = 0; pool < numPools; pool++)
+        {
+            threadsPerPool[pool] = threads[pool];
+            nodeMaskPerPool[pool] = nodeMasks[pool];
+        }
+    }
+    else
+#endif
+    {
+        memset(threadsPerPool, 0, sizeof(int) * (numNumaNodes + 2));
+        memset(nodeMaskPerPool, 0, sizeof(uint64_t) * (numNumaNodes + 2));
+
+        threadsPerPool[0] = targetTME;
+        nodeMaskPerPool[0] = 1;
+
+        threadsPerPool[1] = threadsFrameEnc;
+        nodeMaskPerPool[1] = 1;
+
+        numPools = 2;
+    }
+}
+
 ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved)
 {
     enum { MAX_NODE_NUM = 127 };
@@ -373,6 +512,22 @@
     nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
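Editor's note: a toy walk-through of the non-NUMA split may help: the TME pool takes its configured target, and the frame-encoder pool is topped back up to its default count if the remainder falls short. The concrete numbers below are assumptions only (MIN_TME_THREADS, configureTmeThreadCount and getFrameThreadsCount are x265 internals whose values depend on the preset):

#include <cstdio>

int main()
{
    int total = 64;
    int targetTME = 40;                  // assumed TME request
    int frameEnc = total - targetTME;    // 24 left for FrameEncoders
    int defaultFT = 16;                  // assumed getFrameThreadsCount() result
    if (frameEnc < defaultFT)
    {
        frameEnc = defaultFT;            // frame threads win the tie
        targetTME = total - frameEnc;
    }
    printf("TME pool: %d, FrameEncoder pool: %d\n", targetTME, frameEnc);
    return 0;
}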
View file
x265-4.1.tar/source/common/threadpool.h -> x265-4.2.tar/source/common/threadpool.h
Changed
@@ -102,10 +102,12 @@ void setThreadNodeAffinity(void *numaMask); int tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap); int tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master); + static ThreadPool* allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved); static int getCpuCount(); static int getNumaNodeCount(); - static void getFrameThreadsCount(x265_param* p,int cpuCount); + static int getFrameThreadsCount(x265_param* p, int cpuCount); + static int configureTmeThreadCount(x265_param* p, int cpuCount); }; /* Any worker thread may enlist the help of idle worker threads from the same @@ -169,6 +171,20 @@ virtual void processTasks(int workerThreadId) = 0; }; +/** + * @brief Return the highest current CPU frequency in MHz across all cores, or 0.0 if unavailable. + * + * The value reflects the live frequency as reported by the cpufreq subsystem, + * which accounts for the active scaling governor and EPP hint. + * + * Platform support: + * Linux – iterates /sys/devices/system/cpu/cpuN/cpufreq/scaling_cur_freq (kHz) + * for all cores and returns the maximum; falls back to /proc/cpuinfo + * macOS – sysctl hw.cpufrequency (Hz) + * Windows – registry ~MHz under CentralProcessor\0 + */ +double getCPUFrequencyMHz(); + } // end namespace X265_NS #endif // ifndef X265_THREADPOOL_H
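Editor's note: a minimal sketch of the Linux branch the new doc comment describes, assuming the documented sysfs layout; error handling and the /proc/cpuinfo fallback are omitted, and the function name is invented here:

#include <cstdio>
#include <fstream>

// Scan scaling_cur_freq (in kHz) for every online CPU and return the
// maximum, converted to MHz; 0.0 if cpufreq is unavailable.
double getCPUFrequencyMHzSketch()
{
    double best = 0.0;
    for (int cpu = 0; ; cpu++)
    {
        char path[128];
        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);
        std::ifstream f(path);
        if (!f.is_open())
            break;                     // no more CPUs
        long khz = 0;
        f >> khz;
        if (khz / 1000.0 > best)
            best = khz / 1000.0;
    }
    return best;
}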
View file
x265-4.1.tar/source/common/wavefront.cpp -> x265-4.2.tar/source/common/wavefront.cpp
Changed
@@ -102,7 +102,7 @@
     uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
     while (oldval)
     {
-        CTZ(id, oldval);
+        BSF(id, oldval);
 
         uint32_t bit = 1 << id;
         if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
View file
x265-4.1.tar/source/common/x86/blockcopy8.asm -> x265-4.2.tar/source/common/x86/blockcopy8.asm
Changed
@@ -6925,3 +6925,4 @@ dec r3d jnz .loop RET +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/const-a.asm -> x265-4.2.tar/source/common/x86/const-a.asm
Changed
@@ -157,3 +157,4 @@ ;; 64-bit constants const pq_1, times 1 dq 1 +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/cpu-a.asm -> x265-4.2.tar/source/common/x86/cpu-a.asm
Changed
@@ -174,7 +174,14 @@ sub rsp, 32 ; shadow space %endif and rsp, ~31 +%if WIN64 + lea rax, intel_cpu_indicator_init + call rax +%elif FORMAT_ELF + call rel intel_cpu_indicator_init wrt ..plt +%else call intel_cpu_indicator_init +%endif leave %if ARCH_X86_64 pop r14 @@ -194,3 +201,4 @@ pop r1 pop r0 ret +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/dct8.asm -> x265-4.2.tar/source/common/x86/dct8.asm
Changed
@@ -7639,3 +7639,4 @@ RET %endif +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/h-ipfilter16.asm -> x265-4.2.tar/source/common/x86/h-ipfilter16.asm
Changed
@@ -2697,6 +2697,7 @@ add r2, r3 add r0, r1 dec r4d - jnz .loop0 - RET -%endif + jnz .loop0 + RET +%endif +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/h-ipfilter8.asm -> x265-4.2.tar/source/common/x86/h-ipfilter8.asm
Changed
@@ -6735,3 +6735,4 @@
     pextrw          [r2 + r3 * 2], xm1, 6
     pextrw          [r2 + r4], xm1, 7
     RET
+SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/h4-ipfilter16.asm -> x265-4.2.tar/source/common/x86/h4-ipfilter16.asm
Changed
@@ -2629,6 +2629,7 @@ RET %endif %endmacro - - IPFILTER_CHROMA_PS_6xN_AVX2 8 - IPFILTER_CHROMA_PS_6xN_AVX2 16 + + IPFILTER_CHROMA_PS_6xN_AVX2 8 + IPFILTER_CHROMA_PS_6xN_AVX2 16 +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/intrapred16.asm -> x265-4.2.tar/source/common/x86/intrapred16.asm
Changed
@@ -25115,3 +25115,4 @@
     mov             [r1 + 16], r2w    ; topLast
     mov             [r1 + 32], r3w    ; LeftLast
     RET
+SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/intrapred8.asm -> x265-4.2.tar/source/common/x86/intrapred8.asm
Changed
@@ -22546,3 +22546,4 @@
     mov             [r1 + 8], r2b     ; topLast
     mov             [r1 + 16], r3b    ; LeftLast
     RET
+SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/intrapred8_allangs.asm -> x265-4.2.tar/source/common/x86/intrapred8_allangs.asm
Changed
@@ -24121,3 +24121,4 @@
     movd            [r0 + 524], m7    ; byte[5, 6, 7, 8]
 
     RET
+SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/ipfilter16.asm -> x265-4.2.tar/source/common/x86/ipfilter16.asm
Changed
@@ -14071,3 +14071,4 @@ ;------------------------------------------------------------------------------------------------------------- ;ipfilter_luma_avx512 code end ;------------------------------------------------------------------------------------------------------------- +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/ipfilter8.asm -> x265-4.2.tar/source/common/x86/ipfilter8.asm
Changed
@@ -14947,3 +14947,4 @@ ;------------------------------------------------------------------------------------------------------------- ;ipfilter_luma_avx512 code end ;------------------------------------------------------------------------------------------------------------- +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/loopfilter.asm -> x265-4.2.tar/source/common/x86/loopfilter.asm
Changed
@@ -4213,3 +4213,4 @@ %endif RET %endif ; ARCH_X86_64 +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/mc-a.asm -> x265-4.2.tar/source/common/x86/mc-a.asm
Changed
@@ -7410,3 +7410,4 @@ prefetcht0 r0+r1*2 prefetcht0 r0+r2 RET +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/mc-a2.asm -> x265-4.2.tar/source/common/x86/mc-a2.asm
Changed
@@ -1591,3 +1591,4 @@ INIT_YMM avx2 CUTREE_FIX8 +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/pixel-32.asm -> x265-4.2.tar/source/common/x86/pixel-32.asm
Changed
@@ -418,4 +418,4 @@ jge .loop emms RET - +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/pixel-a.asm -> x265-4.2.tar/source/common/x86/pixel-a.asm
Changed
@@ -16581,3 +16581,4 @@ %endif movq r3, xm3 RET +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/pixel-util8.asm -> x265-4.2.tar/source/common/x86/pixel-util8.asm
Changed
@@ -8484,7 +8484,13 @@
 ; r4 - nonZero
 ; r5 - scanFlagMask
 ; r6 - sum
+
+%if FORMAT_ELF
+    mov             r0, private_prefix %+ _entropyStateBits wrt ..gotpc
+%else
     lea             r0, [private_prefix %+ _entropyStateBits]
+    ;mov            r0, private_prefix %+ _entropyStateBits
+%endif
     mov             r1, r6mp
     xor             r6d, r6d
     xor             r4d, r4d
@@ -8664,7 +8670,12 @@
 ; r6 - sum
 ; {r3,r4} - ctxSig[15-0]
 ; r8m - (numNonZero != 0) || (subPosBase == 0)
+%if FORMAT_ELF
+    mov             r0, private_prefix %+ _entropyStateBits wrt ..gotpc
+%else
     lea             r0, [private_prefix %+ _entropyStateBits]
+    ;mov            r0, private_prefix %+ _entropyStateBits
+%endif
     mov             r1, r6mp
     xor             r6d, r6d
     xor             r8d, r8d
@@ -8753,7 +8764,7 @@
 ;        {
 ;            {
 ;                unsigned long cidx;
-;                CLZ(cidx, codeNumber + 1);
+;                BSR(cidx, codeNumber + 1);
 ;                length = cidx;
 ;            }
 ;            codeNumber = (length + length);
@@ -8812,7 +8823,7 @@
     shr             eax, t3b            ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
-    lea             r2d, [rax - 3 + 1]  ; CLZ(cidx, codeNumber + 1);
+    lea             r2d, [rax - 3 + 1]  ; BSR(cidx, codeNumber + 1);
     bsr             r2d, r2d
     add             r2d, r2d            ; codeNumber = (length + length)
@@ -8903,7 +8914,12 @@
     or              r11d, 0x100         ; default value setting to 8
     bsf             r11d, r11d
 
-    lea             r5, [private_prefix %+ _entropyStateBits]
+%if FORMAT_ELF
+    mov             r5, private_prefix %+ _entropyStateBits wrt ..gotpc
+%else
+    lea             r5, [private_prefix %+ _entropyStateBits]
+    ;mov            r5, private_prefix %+ _entropyStateBits
+%endif
     xor             r6d, r6d
     mov             r4d, 0xFFFFFFF9
@@ -8975,3 +8991,4 @@
     or              eax, r4d
     RET
 %endif ; ARCH_X86_64
+SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/pixeladd8.asm -> x265-4.2.tar/source/common/x86/pixeladd8.asm
Changed
@@ -1646,3 +1646,4 @@ ;----------------------------------------------------------------------------- ; pixel_add_ps avx512 code end ;----------------------------------------------------------------------------- +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/sad-a.asm -> x265-4.2.tar/source/common/x86/sad-a.asm
Changed
@@ -6758,3 +6758,4 @@ SAD_MxN_AVX512 32, 32 SAD_MxN_AVX512 32, 64 %endif +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/sad16-a.asm -> x265-4.2.tar/source/common/x86/sad16-a.asm
Changed
@@ -4368,3 +4368,4 @@ PROCESS_SAD_X4_END_AVX512 RET %endif +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/seaintegral.asm -> x265-4.2.tar/source/common/x86/seaintegral.asm
Changed
@@ -1060,3 +1060,4 @@ .end: RET +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/ssd-a.asm -> x265-4.2.tar/source/common/x86/ssd-a.asm
Changed
@@ -3703,3 +3703,4 @@ %endif RET %endif +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/v4-ipfilter16.asm -> x265-4.2.tar/source/common/x86/v4-ipfilter16.asm
Changed
@@ -3525,7 +3525,8 @@ %endif %endmacro -FILTER_VER_CHROMA_AVX2_8x12 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_8x12 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_8x12 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_8x12 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_8x12 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_8x12 sp, 1, INTERP_SHIFT_SP FILTER_VER_CHROMA_AVX2_8x12 ss, 0, 6 +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/v4-ipfilter8.asm -> x265-4.2.tar/source/common/x86/v4-ipfilter8.asm
Changed
@@ -12797,3 +12797,4 @@ FILTER_VER_CHROMA_SS_W8_H2 8, 12 FILTER_VER_CHROMA_SS_W8_H2 8, 64 +SECTION_IBT_SHSTK
View file
x265-4.1.tar/source/common/x86/x86inc.asm -> x265-4.2.tar/source/common/x86/x86inc.asm
Changed
@@ -65,6 +65,9 @@ %define UNIX64 1 %endif %endif +%ifndef ENABLE_CET +%define ENABLE_CET 0 +%endif %define FORMAT_ELF 0 %ifidn __OUTPUT_FORMAT__,elf @@ -90,7 +93,30 @@ SECTION .rodata align=%1 %endif %endmacro - +%macro SECTION_IBT_SHSTK 0 + %if ENABLE_CET + %ifidn __OUTPUT_FORMAT__,win32 + %elif WIN64 + %else + SECTION .note.gnu.property note + align 8 + dd .x1 - .x0 ; data size for "GNU\0" + dd .x4 - .x1 ; Elf_Prop size + dd 5 ; ELF::NT_GNU_PROPERTY_TYPE_0 + .x0: + db "GNU", 0 + .x1: + align 8 + dd 0xc0000002 ; ELF::GNU_PROPERTY_X86_FEATURE_1_AND + dd .x3 - .x2 ; data size + .x2: + dd 0x3 ; ELF::GNU_PROPERTY_X86_FEATURE_1_SHSTK | ELF::GNU_PROPERTY_X86_FEATURE_1_IBT + .x3: + align 8 + .x4: + %endif + %endif +%endmacro %if WIN64 %define PIC %elif ARCH_X86_64 == 0 @@ -737,6 +763,13 @@ %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper + %if ENABLE_CET + %if ARCH_X86_64 + endbr64 + %else + endbr32 + %endif + %endif %ifnidn %3, "" PROLOGUE %3 %endif
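Editor's note: the note emitted by SECTION_IBT_SHSTK matters because GNU linkers AND together the GNU_PROPERTY_X86_FEATURE_1_AND payloads of all input objects: a single assembly object without the note would strip IBT/SHSTK from the whole library, which is why every .asm file above now ends with the macro. The 0x3 payload decodes as follows (a small illustrative program; the constant names follow the ELF gABI x86 extension):

#include <cstdint>
#include <cstdio>

#define GNU_PROPERTY_X86_FEATURE_1_IBT   (1u << 0)
#define GNU_PROPERTY_X86_FEATURE_1_SHSTK (1u << 1)

int main()
{
    uint32_t payload = 0x3;    // the word written between .x2 and .x3 above
    printf("IBT: %u, SHSTK: %u\n",
           !!(payload & GNU_PROPERTY_X86_FEATURE_1_IBT),
           !!(payload & GNU_PROPERTY_X86_FEATURE_1_SHSTK));
    return 0;
}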
View file
x265-4.1.tar/source/common/yuv.cpp -> x265-4.2.tar/source/common/yuv.cpp
Changed
@@ -28,6 +28,7 @@ #include "shortyuv.h" #include "picyuv.h" #include "primitives.h" +#define BUFFER_PADDING 8 using namespace X265_NS; @@ -54,7 +55,7 @@ if (csp == X265_CSP_I400) { - CHECKED_MALLOC(m_buf0, pixel, size * size + 8); + CHECKED_MALLOC(m_buf0, pixel, size * size + BUFFER_PADDING); m_buf1 = m_buf2 = 0; m_csize = 0; return true; @@ -67,11 +68,13 @@ size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift); X265_CHECK((sizeC & 15) == 0, "invalid size"); + size_t totalSize = sizeL + sizeC * 2 + 8 + BUFFER_PADDING; // memory allocation (padded for SIMD reads) - CHECKED_MALLOC(m_buf0, pixel, sizeL + sizeC * 2 + 8); + CHECKED_MALLOC(m_buf0, pixel, totalSize); m_buf1 = m_buf0 + sizeL; m_buf2 = m_buf0 + sizeL + sizeC; + X265_CHECK(m_buf2 + sizeC <= m_buf0 + totalSize, "Buffer overflow detected"); return true; }
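Editor's note: the named BUFFER_PADDING constant documents the tail slack that lets SIMD kernels issue a full vector load at the last valid pixel. The allocation pattern, reduced to a sketch (illustrative only; x265 itself goes through CHECKED_MALLOC as shown above):

#include <cstdint>
#include <cstdlib>

static const size_t BUFFER_PADDING = 8;   // pixels of tail slack, as in the diff

// Hypothetical helper: allocate a plane with room for one over-reading load.
uint8_t* allocPaddedPlane(size_t width, size_t height)
{
    return static_cast<uint8_t*>(std::malloc(width * height + BUFFER_PADDING));
}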
View file
x265-4.1.tar/source/dynamicHDR10/CMakeLists.txt -> x265-4.2.tar/source/dynamicHDR10/CMakeLists.txt
Changed
@@ -10,7 +10,7 @@ hdr10plus.h api.cpp ) -cmake_minimum_required (VERSION 2.8.11) +cmake_minimum_required (VERSION 2.8.11...3.26) project(dynamicHDR10) include(CheckIncludeFiles) include(CheckFunctionExists)
View file
x265-4.1.tar/source/encoder/CMakeLists.txt -> x265-4.2.tar/source/encoder/CMakeLists.txt
Changed
@@ -43,4 +43,5 @@ reference.cpp reference.h encoder.cpp encoder.h api.cpp - weightPrediction.cpp svt.h) + weightPrediction.cpp svt.h + threadedme.h threadedme.cpp)
View file
x265-4.1.tar/source/encoder/analysis.cpp -> x265-4.2.tar/source/encoder/analysis.cpp
Changed
@@ -72,14 +72,37 @@
 Analysis::Analysis()
 {
+    m_bTryLossless = false;
+    m_bChromaSa8d = false;
+    m_bHD = false;
+
+    memset(m_modeFlag, 0, sizeof(m_modeFlag));
+    memset(m_checkMergeAndSkipOnly, 0, sizeof(m_checkMergeAndSkipOnly));
+
+    for (int i = 0; i < NUM_CU_DEPTH; i++)
+    {
+        m_modeDepth[i].bestMode = NULL;
+        memset(m_modeDepth[i].pred, 0, sizeof(m_modeDepth[i].pred));
+    }
+
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
-    m_bHD = false;
-    m_modeFlag[0] = false;
-    m_modeFlag[1] = false;
-    m_checkMergeAndSkipOnly[0] = false;
-    m_checkMergeAndSkipOnly[1] = false;
+    m_reuseDepth = NULL;
+    m_reuseModes = NULL;
+    m_reusePartSize = NULL;
+    m_reuseMergeFlag = NULL;
+    m_reuseMv[0] = NULL;
+    m_reuseMv[1] = NULL;
+    m_reuseMvpIdx[0] = NULL;
+    m_reuseMvpIdx[1] = NULL;
+    cacheCost = NULL;
+    m_additionalCtuInfo = NULL;
+    m_prevCtuInfoChange = NULL;
+    m_evaluateInter = 0;
+    m_refineLevel = 0;
+
+    memset(m_splitRefIdx, 0, sizeof(m_splitRefIdx));
 }

@@ -135,6 +158,153 @@
     X265_FREE(cacheCost);
 }

+void Analysis::computeMVForPUs(CUData& ctu, const CUGeom& cuGeom, int qp, Frame& frame)
+{
+    int areaId = 0;
+    int finalIdx = 0;
+
+    uint32_t depth = cuGeom.depth;
+    uint32_t nextDepth = depth + 1;
+
+    uint32_t cuSize = 1 << cuGeom.log2CUSize;
+    bool mightSplit = (cuSize > m_param->minCUSize);
+
+    uint32_t cuX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
+    uint32_t cuY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
+
+    if (cuSize != m_param->maxCUSize)
+    {
+        uint32_t subCUSize = m_param->maxCUSize / 2;
+        areaId = (cuX >= subCUSize) + 2 * (cuY >= subCUSize) + 1;
+    }
+
+    if (mightSplit)
+    {
+        int nextQP = qp;
+        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
+                nextQP = setLambdaFromQP(ctu, calculateQpforCuSize(ctu, childGeom));
+
+            computeMVForPUs(ctu, childGeom, nextQP, frame);
+        }
+    }
+
+    ModeDepth& md = m_modeDepth[cuGeom.depth];
+    CUData& cu = md.pred[PRED_2Nx2N].cu;
+
+    for (int i = 0; i < MAX_NUM_PU_SIZES; i++)
+    {
+        const PUBlock& pu = g_puLookup[i];
+        int startIdx = g_puStartIdx[pu.width + pu.height][static_cast<int>(pu.partsize)];
+
+        if (pu.width > cuSize || pu.height > cuSize || (pu.width != cuSize && pu.height != cuSize))
+            continue;
+
+        if (!m_param->bEnableAMP && pu.isAmp)
+            continue;
+        if (!m_param->bEnableRectInter && pu.width != pu.height && !pu.isAmp)
+            continue;
+
+        int blockWidth = pu.isAmp ? X265_MAX(pu.width, pu.height) : pu.width;
+        int blockHeight = pu.isAmp ? blockWidth : pu.height;
+
+        int numColsCTU = m_param->maxCUSize / blockWidth;
+        int numRowsCTU = m_param->maxCUSize / blockHeight;
+
+        int puOffset = 0;
+        if (pu.isAmp)
+            puOffset = numRowsCTU * numColsCTU;
+        else if (pu.partsize == SIZE_2NxN)
+            puOffset = numColsCTU;
+        else if (pu.partsize == SIZE_Nx2N)
+            puOffset = 1;
+
+        int col = (cuX - ctu.m_cuPelX) / blockWidth;
+        int row = (cuY - ctu.m_cuPelY) / blockHeight;
+
+        finalIdx = startIdx + row * numColsCTU + col;
+
+        int subIdx = finalIdx - startIdx;
+
+        int puRow = subIdx / numColsCTU;
+        int puCol = subIdx % numColsCTU;
+
+        int leftIdx = (puCol > 0) ? startIdx + puRow * numColsCTU + (puCol - 1) : -1;
+        int aboveIdx = (puRow > 0) ? startIdx + (puRow - 1) * numColsCTU + puCol : -1;
+        int aboveLeftIdx = (puRow > 0 && puCol > 0) ? startIdx + (puRow - 1) * numColsCTU + (puCol - 1) : -1;
+        int aboveRightIdx = (puRow > 0 && puCol < numColsCTU - 1) ? startIdx + (puRow - 1) * numColsCTU + (puCol + 1) : -1;
+
+        int neighborIdx[MD_ABOVE_LEFT + 1] = { leftIdx, aboveIdx, aboveRightIdx, -1, aboveLeftIdx };
+
+        cu.initSubCU(ctu, cuGeom, qp);
+        cu.setPartSizeSubParts(pu.partsize);
+        setLambdaFromQP(cu, qp);
+        puMotionEstimation(m_slice, cuGeom, cu, m_frame->m_fencPic, puOffset, pu.partsize, areaId, finalIdx, false, neighborIdx);
+    }
+}
+
+void Analysis::deriveMVsForCTU(CUData& ctu, const CUGeom& cuGeom, Frame& frame)
+{
+    m_slice = ctu.m_slice;
+    m_frame = &frame;
+    m_param = m_frame->m_param;
+
+    ModeDepth& md = m_modeDepth[0];
+
+    int numPredDir = m_slice->isInterP() ? 1 : 2;
+
+    // Full CTU
+    int baseQP = setLambdaFromQP(ctu, ctu.m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : ctu.m_slice->m_sliceQp);
+
+    md.pred[PRED_2Nx2N].cu.initSubCU(ctu, cuGeom, baseQP);
+    md.pred[PRED_2Nx2N].cu.setPartSizeSubParts(SIZE_2Nx2N);
+
+    puMotionEstimation(m_slice, cuGeom, md.pred[PRED_2Nx2N].cu, frame.m_fencPic, 0, SIZE_2Nx2N, 0, 0, true);
+
+    // Sub-CUs
+    if (m_param->maxCUSize != m_param->minCUSize)
+    {
+        for (int sub = 0; sub < 4; sub++)
+        {
+            ModeDepth& md1 = m_modeDepth[1];
+
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + sub);
+            int qp = setLambdaFromQP(ctu, ctu.m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, childGeom) : ctu.m_slice->m_sliceQp);
+
+            md1.pred[PRED_2Nx2N].cu.initSubCU(ctu, childGeom, qp);
+            md1.pred[PRED_2Nx2N].cu.setPartSizeSubParts(SIZE_2Nx2N);
+
+            puMotionEstimation(m_slice, childGeom, md1.pred[PRED_2Nx2N].cu, frame.m_fencPic, 0, SIZE_2Nx2N, sub + 1, 0, true);
+        }
+    }
+
+    const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+    const CUData* colCU = colPic->m_encData->getPicCTU(ctu.m_cuAddr);
+
+    for (int list = 0; list < numPredDir; list++)
+    {
+        int numRef = ctu.m_slice->m_numRefIdx[list];
+
+        for (int ref = 0; ref < numRef; ref++)
+        {
+            MV medianMv;
+            bool valid = ctu.getMedianColMV(colCU, colPic, list, ref, medianMv);
+            if (!valid)
+                continue;
+
+            for (int areaIdx = 0; areaIdx < 5; areaIdx++)
+            {
+                m_areaBestMV[areaIdx][list][ref] = medianMv;
+            }
+        }
+    }
+
+    computeMVForPUs(ctu, cuGeom, baseQP, frame);
+}
+
 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
 {
     m_slice = ctu.m_slice;
@@ -2960,12 +3130,19 @@
     if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0)))
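The PU slot arithmetic in computeMVForPUs is easiest to see in isolation. A minimal standalone sketch (plain C++, not the x265 API; the grid dimensions are made up) of the row-major indexing and neighbor derivation for one PU shape:

    #include <cstdio>

    // PUs of one shape form a numRows x numCols grid whose slots start at startIdx.
    static int puSlot(int startIdx, int numCols, int row, int col)
    {
        return startIdx + row * numCols + col;
    }

    int main()
    {
        // Hypothetical shape: 16x16 PUs in a 64x64 CTU -> a 4x4 grid at startIdx 0
        int startIdx = 0, numCols = 4;
        int row = 2, col = 1;  // the PU being estimated

        int finalIdx   = puSlot(startIdx, numCols, row, col);                                    // 9
        int leftIdx    = (col > 0) ? puSlot(startIdx, numCols, row, col - 1) : -1;               // 8
        int aboveIdx   = (row > 0) ? puSlot(startIdx, numCols, row - 1, col) : -1;               // 5
        int aboveLeft  = (row > 0 && col > 0) ? puSlot(startIdx, numCols, row - 1, col - 1) : -1;            // 4
        int aboveRight = (row > 0 && col < numCols - 1) ? puSlot(startIdx, numCols, row - 1, col + 1) : -1;  // 6

        printf("PU %d: L=%d A=%d AL=%d AR=%d\n", finalIdx, leftIdx, aboveIdx, aboveLeft, aboveRight);
        return 0;
    }

Out-of-grid neighbors resolve to -1, which puMotionEstimation treats as "not available" when it seeds MVP candidates from the per-CTU MV slots.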
View file
x265-4.1.tar/source/encoder/analysis.h -> x265-4.2.tar/source/encoder/analysis.h
Changed
@@ -130,6 +130,24 @@
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
     int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);

+    /**
+     * @brief Build CTU-level and area-level MVP seeds used by threaded ME.
+     *
+     * Performs an initial 2Nx2N search on the full CTU (and first split depth
+     * when available), propagates temporal/colocated medians, and then drives
+     * per-PU motion estimation.
+     */
+    void deriveMVsForCTU(CUData& ctu, const CUGeom& cuGeom, Frame& frame);
+
+    /**
+     * @brief Recursively walk CU partitions and run ME for each enabled PU shape.
+     *
+     * Computes the PU index mapping (`finalIdx`) used by CTU MV storage and
+     * submits each PU to puMotionEstimation() with neighbor indices for MVP
+     * derivation.
+     */
+    void computeMVForPUs(CUData& ctu, const CUGeom& cuGeom, int qp, Frame& frame);
+
 protected:
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
     x265_analysis_inter_data* m_reuseInterDataCTU;
View file
x265-4.1.tar/source/encoder/api.cpp -> x265-4.2.tar/source/encoder/api.cpp
Changed
@@ -309,7 +309,7 @@
 {
     if (!enc || !param_in)
         return -1;
-    x265_param save;
+    x265_param save = {};
     Encoder* encoder = static_cast<Encoder*>(enc);
     if (strlen(encoder->m_param->csvfn) && param_in->csvfpt != NULL)
         encoder->m_param->csvfpt = param_in->csvfpt;
@@ -965,8 +965,8 @@
     if (maxReuseLevel > 4)
     {
-        X265_FREE((analysis->interData)->mergeFlag);
-        X265_FREE((analysis->interData)->partSize);
+        X265_FREE((analysis->interData)->mergeFlag);
+        X265_FREE((analysis->interData)->partSize);
     }
     if (maxReuseLevel >= 7)
     {
@@ -1403,6 +1403,8 @@
         /* detailed performance statistics */
         fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms),"
            "Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks");
+
+        fprintf(csvfp, ", Total ThreadedME Wait Time (ms), Total ThreadedME Time (ms)");
 #if ENABLE_LIBVMAF
         fprintf(csvfp, ", VMAF Frame Score");
 #endif
@@ -1539,6 +1541,9 @@
             frameStats->totalFrameTime);

         fprintf(param->csvfpt, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
+
+        fprintf(param->csvfpt, ", %.1lf, %.1lf", frameStats->tmeWaitTime / 1000.0, frameStats->tmeTime / 1000.0);
+
 #if ENABLE_LIBVMAF
         fprintf(param->csvfpt, ", %lf", frameStats->vmafFrameScore);
 #endif
View file
x265-4.1.tar/source/encoder/dpb.cpp -> x265-4.2.tar/source/encoder/dpb.cpp
Changed
@@ -94,6 +94,13 @@
             {
                 curFrame->m_reconRowFlag[row].set(0);
                 curFrame->m_reconColCount[row].set(0);
+
+                uint32_t m_numCols = (curFrame->m_fencPic->m_picWidth + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
+                for (uint32_t col = 0; col < m_numCols; col++)
+                {
+                    uint32_t ctuAddr = row * m_numCols + col;
+                    curFrame->m_ctuMEFlags[ctuAddr].set(0);
+                }
             }

             // iterator is invalidated by remove, restart scan
View file
x265-4.1.tar/source/encoder/encoder.cpp -> x265-4.2.tar/source/encoder/encoder.cpp
Changed
@@ -39,6 +39,7 @@
 #include "ratecontrol.h"
 #include "dpb.h"
 #include "nal.h"
+#include "threadedme.h"

 #include "x265.h"

@@ -70,7 +71,7 @@
 DolbyVisionProfileSpec dovi[] =
 {
-    { 1, 1, 1, 1, 1, 5, 1, 2, 2, 2, 50 },
+    { 1, 1, 1, 1, 1, 5, 1, 16, 9, 15, 50 },
     { 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
     { 1, 1, 1, 1, 1, 5, 0, 1, 1, 1, 82 },
     { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
 };
@@ -132,6 +133,7 @@
     m_numLumaWPBiFrames = 0;
     m_numChromaWPBiFrames = 0;
     m_lookahead = NULL;
+    m_threadedME = NULL;
     m_rateControl = NULL;
     m_dpb = NULL;
     m_numDelayedPic = 0;
@@ -254,10 +256,16 @@
         p->bEnableWavefront = 0;
     }

+    // For zero-latency tune, frameNumThreads must be set to 1
+    if (p->tune && (!strcmp(p->tune, "zerolatency") || !strcmp(p->tune, "zero-latency")))
+    {
+        p->frameNumThreads = 1;
+    }
+
     bool allowPools = !strlen(p->numaPools) || strcmp(p->numaPools, "none");

     // Trim the thread pool if --wpp, --pme, and --pmode are disabled
-    if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation && !p->lookaheadSlices)
+    if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation && !p->lookaheadSlices && !p->bThreadedME)
         allowPools = false;

     m_numPools = 0;
@@ -269,7 +277,7 @@
     {
         // auto-detect frame threads
         int cpuCount = ThreadPool::getCpuCount();
-        ThreadPool::getFrameThreadsCount(p, cpuCount);
+        p->frameNumThreads = ThreadPool::getFrameThreadsCount(p, cpuCount);
     }
 }
@@ -284,9 +292,12 @@
             x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pmode disabled\n");
         if (p->lookaheadSlices)
             x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --lookahead-slices disabled\n");
+        if (p->bThreadedME)
+            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --threaded-me disabled\n");

         // disable all pool features if the thread pool is disabled or unusable.
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = p->lookaheadSlices = 0;
+        p->bThreadedME = 0;
     }

     x265_log(p, X265_LOG_INFO, "Slices : %d\n", p->maxSlices);
@@ -299,6 +310,8 @@
         len += snprintf(buf + len, sizeof(buf) - len, "%spmode", len ? "+" : "");
     if (p->bDistributeMotionEstimation)
         len += snprintf(buf + len, sizeof(buf) - len, "%spme ", len ? "+" : "");
+    if (p->bThreadedME)
+        len += snprintf(buf + len, sizeof(buf) - len, "%sthreaded-me", len ? "+" : "");
     if (!len)
         strcpy(buf, "none");

@@ -310,17 +323,37 @@
         m_frameEncoder[i]->m_nalList.m_annexB = !!m_param->bAnnexB;
     }

+    if (p->bThreadedME)
+    {
+        m_threadedME = new ThreadedME(m_param, *this);
+    }
+
     if (m_numPools)
     {
+        // First threadpool belongs to ThreadedME, if the feature is enabled
+        if (p->bThreadedME)
+        {
+            m_threadedME->m_pool = &m_threadPool[0];
+            m_threadedME->m_jpId = 0;

+            m_threadPool[0].m_numProviders = 1;
+            m_threadPool[0].m_jpTable[m_threadedME->m_jpId] = m_threadedME;
+        }
+
+        int numFrameThreadPools = (!m_param->bThreadedME) ? m_numPools : m_numPools - 1;
+
         for (int i = 0; i < m_param->frameNumThreads; i++)
         {
-            int pool = i % m_numPools;
+            // Since first pool belongs to ThreadedME
+            int pool = static_cast<int>(p->bThreadedME) + i % numFrameThreadPools;
             m_frameEncoder[i]->m_pool = &m_threadPool[pool];
             m_frameEncoder[i]->m_jpId = m_threadPool[pool].m_numProviders++;
             m_threadPool[pool].m_jpTable[m_frameEncoder[i]->m_jpId] = m_frameEncoder[i];
         }
-        for (int i = 0; i < m_numPools; i++)
-            m_threadPool[i].start();
+
+        for (int j = 0; j < m_numPools; j++)
+            m_threadPool[j].start();
     }
     else
     {
@@ -348,7 +381,7 @@
             lookAheadThreadPool = ThreadPool::allocThreadPools(p, pools, 1);
         }
         else
-            lookAheadThreadPool = m_threadPool;
+            lookAheadThreadPool = (!m_param->bThreadedME) ? m_threadPool : &m_threadPool[1];
         m_lookahead = new Lookahead(m_param, lookAheadThreadPool);
         if (pools)
         {
@@ -361,6 +394,22 @@
     m_lookahead->m_numPools = pools;
     m_dpb = new DPB(m_param);

+    if (p->bThreadedME)
+    {
+        if (!m_threadedME->create())
+        {
+            m_param->bThreadedME = 0;
+            X265_FREE(m_threadedME);
+            m_threadedME = NULL;
+
+            x265_log(m_param, X265_LOG_ERROR, "Failed to create threadedME thread pool, --threaded-me disabled");
+        }
+        else
+        {
+            m_threadedME->start();
+        }
+    }
+
     m_rateControl = new RateControl(*m_param, this);
     if (!m_param->bResetZoneConfig)
     {
@@ -474,6 +523,7 @@
         m_aborted = true;

     initRefIdx();
+
     if (strlen(m_param->analysisSave) && m_param->bUseAnalysisFile)
     {
         char* temp = strcatFilename(m_param->analysisSave, ".temp");
@@ -584,7 +634,10 @@

     if (m_lookahead)
         m_lookahead->stopJobs();
-
+
+    if (m_threadedME)
+        m_threadedME->stopJobs();
+
     for (int i = 0; i < m_param->frameNumThreads; i++)
     {
         if (m_frameEncoder[i])
@@ -927,6 +980,12 @@
         delete m_lookahead;
     }

+    if (m_threadedME)
+    {
+        m_threadedME->destroy();
+        delete m_threadedME;
+    }
+
     delete m_dpb;
     if (!m_param->bResetZoneConfig && m_param->rc.zonefileCount)
     {
@@ -1082,6 +1141,16 @@
     }

     int numPayloads = pic_in->userSEI.numPayloads + toneMapPayload + userPayload;
+
+    // TODO: we may reuse buffer if become smaller than exist buffer
+    if (frame->m_userSEI.payloads && numPayloads != frame->m_userSEI.numPayloads)
+    {
+        for (int i = 0; i < frame->m_userSEI.numPayloads; i++)
+            delete[] frame->m_userSEI.payloads[i].payload;
+        delete[] frame->m_userSEI.payloads;
+        frame->m_userSEI.payloads = NULL;
+    }
+
     frame->m_userSEI.numPayloads = numPayloads;

     if (frame->m_userSEI.numPayloads)
@@ -1102,6 +1171,12 @@
             else
                 input = pic_in->userSEI.payloads[i];
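The pool bookkeeping above is the subtle part of --threaded-me: m_threadPool[0] is dedicated to ThreadedME, frame encoders round-robin over the remaining pools, and the lookahead falls back to m_threadPool[1]. A compact sketch of just the index math (a hypothetical helper, not an x265 function):

    // Returns the pool index for frame encoder i under the new scheme.
    int framePoolFor(int i, int numPools, bool threadedME)
    {
        int framePools = threadedME ? numPools - 1 : numPools;  // pools left for encoders
        return (threadedME ? 1 : 0) + i % framePools;           // skip pool 0 when reserved
    }
    // e.g. numPools = 3, threadedME = true, 4 frame threads:
    // encoders 0..3 land on pools 1, 2, 1, 2 while pool 0 stays reserved for ME.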
View file
x265-4.1.tar/source/encoder/encoder.h -> x265-4.2.tar/source/encoder/encoder.h
Changed
@@ -33,10 +33,14 @@
 #include "framedata.h"
 #include "svt.h"
 #include "temporalfilter.h"
+#include "threadedme.h"
+
 #ifdef ENABLE_HDR10_PLUS
 #include "dynamicHDR10/hdr10plus.h"
 #endif
+
 struct x265_encoder {};
+
 namespace X265_NS {
 // private namespace
 extern const char g_sliceTypeToChar[3];
@@ -212,6 +216,7 @@
     x265_param*        m_zoneParam;
     RateControl*       m_rateControl;
     Lookahead*         m_lookahead;
+    ThreadedME*        m_threadedME;
     AdaptiveFrameDuplication* m_dupBuffer[DUP_BUFFER];  // picture buffer of size 2

     /*Frame duplication: Two pictures used to compute PSNR */
     pixel* m_dupPicOne[3];
View file
x265-4.1.tar/source/encoder/entropy.cpp -> x265-4.2.tar/source/encoder/entropy.cpp
Changed
@@ -1782,7 +1782,7 @@
             else
 #endif
             WRITE_FLAG(!!wp[0].wtPresent, "luma_weight_lX_flag");
-            totalSignalledWeightFlags += wp[0].wtPresent;
+            totalSignalledWeightFlags = totalSignalledWeightFlags + wp[0].wtPresent;
         }

         if (bChroma)
@@ -1796,7 +1796,7 @@
                 else
 #endif
                 WRITE_FLAG(!!wp[1].wtPresent, "chroma_weight_lX_flag");
-                totalSignalledWeightFlags += 2 * wp[1].wtPresent;
+                totalSignalledWeightFlags = totalSignalledWeightFlags + 2 * wp[1].wtPresent;
             }
         }
@@ -1893,7 +1893,7 @@
         codeNumber = (codeNumber >> absGoRice) - COEF_REMAIN_BIN_REDUCTION;
         {
             unsigned long idx;
-            CLZ(idx, codeNumber + 1);
+            BSR(idx, codeNumber + 1);
             length = idx;
             X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
             codeNumber -= (1 << idx) - 1;
@@ -2206,7 +2206,7 @@
     {
         {
             unsigned long cidx;
-            CLZ(cidx, codeNumber + 1);
+            BSR(cidx, codeNumber + 1);
             length = cidx;
         }
         X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
@@ -2319,8 +2319,8 @@
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
     ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // extra 2 bytes(+1) space for AVX2 assembly, +1 because (numNonZero<=1) in costCoeffNxN path
     uint32_t numNonZero = 1;
-    unsigned long lastNZPosInCG;
-    unsigned long firstNZPosInCG;
+    unsigned long lastNZPosInCG = 0;
+    unsigned long firstNZPosInCG = 0;

 #if _DEBUG
     // Unnecessary, for Valgrind-3.10.0 only
@@ -2410,6 +2410,7 @@
     if (m_bitIf)
     {
         ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
+        memset(tmpCoeff, 0, sizeof(tmpCoeff));

         // TODO: accelerate by PABSW
         for (int i = 0; i < MLS_CG_SIZE; i++)
@@ -2488,10 +2489,10 @@
             numNonZero = coeffNum[subSet];
             if (numNonZero > 0)
             {
-                uint32_t idx;
+                uint32_t idx = 0;
                 X265_CHECK(subCoeffFlag > 0, "subCoeffFlag is zero\n");
-                CLZ(lastNZPosInCG, subCoeffFlag);
-                CTZ(firstNZPosInCG, subCoeffFlag);
+                BSR(lastNZPosInCG, subCoeffFlag);
+                BSF(firstNZPosInCG, subCoeffFlag);

                 bool signHidden = (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD);
                 const uint8_t ctxSet = (((subSet > 0) + bIsLuma) & 2) + !(c1 & 3);
@@ -2887,7 +2888,7 @@
     // NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256
     //numBits = g_renormTable[lps] >> 3;
     unsigned long idx;
-    CLZ(idx, lps);
+    BSR(idx, lps);
     X265_CHECK(state != 63 || idx == 1, "state failure\n");
     numBits = 8 - idx;
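The CLZ/CTZ to BSR/BSF rename matches the x86 instructions the macros actually expand to: BSR yields the index of the highest set bit (floor(log2 x)), not a leading-zero count, and BSF yields the lowest set bit. A portable equivalent, assuming a nonzero argument and GCC/Clang builtins (a sketch, not x265 code):

    #include <cstdint>

    inline unsigned long bsr32(uint32_t x) { return 31 - __builtin_clz(x); } // highest set bit index
    inline unsigned long bsf32(uint32_t x) { return __builtin_ctz(x); }      // lowest set bit index
    // e.g. for lps in [2, 255], bsr32(lps) is 1..7, so "numBits = 8 - idx"
    // recovers the renormalization shift used in the CABAC engine above.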
View file
x265-4.1.tar/source/encoder/frameencoder.cpp -> x265-4.2.tar/source/encoder/frameencoder.cpp
Changed
@@ -36,6 +36,8 @@
 #include "nal.h"
 #include "temporalfilter.h"

+#include <iostream>
+
 namespace X265_NS {
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);

@@ -196,10 +198,12 @@
     // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
     {
         unsigned long tmp;
-        CLZ(tmp, (numRows * numCols - 1));
+        BSR(tmp, (numRows * numCols - 1));
         m_sliceAddrBits = (uint16_t)(tmp + 1);
     }

+    m_tmeDeps.resize(m_numRows);
+
     m_retFrameBuffer = X265_MALLOC(Frame*, m_param->numLayers);
     for (int layer = 0; layer < m_param->numLayers; layer++)
         m_retFrameBuffer[layer] = NULL;
@@ -374,7 +378,7 @@
     ++ucode;

     unsigned long idx;
-    CLZ( idx, ucode );
+    BSR( idx, ucode );
     uint32_t length = (uint32_t)idx * 2 + 1;

     return length;
@@ -447,6 +451,8 @@
     m_totalActiveWorkerCount = 0;
     m_activeWorkerCountSamples = 0;
     m_totalWorkerElapsedTime[layer] = 0;
+    m_totalThreadedMETime[layer] = 0;
+    m_totalThreadedMEWait[layer] = 0;
     m_totalNoWorkerTime[layer] = 0;
     m_countRowBlocks = 0;
     m_allRowsAvailableTime[layer] = 0;
@@ -915,7 +921,7 @@
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
      * filters runs behind the CTU compression and reconstruction */

-    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
+    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
         m_rows[m_sliceBaseRow[sliceId]].active = true;

     if (m_param->bEnableWavefront)
@@ -975,8 +981,16 @@
                         m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
                 }
             }
-
+
             enableRowEncoder(m_row_to_idx[row]); /* clear external dependency for this row */
+
+            if (m_top->m_threadedME && !slice->isIntra())
+            {
+                ScopedLock lock(m_tmeDepLock);
+                m_tmeDeps[row].external = true;
+                m_top->m_threadedME->enqueueReadyRows(row, layer, this);
+            }
+
             if (!rowInSlice)
             {
                 m_row0WaitTime[layer] = x265_mdate();
@@ -1022,6 +1036,13 @@
                 }
             }

+            if (m_top->m_threadedME && !slice->isIntra())
+            {
+                ScopedLock lock(m_tmeDepLock);
+                m_tmeDeps[i].external = true;
+                m_top->m_threadedME->enqueueReadyRows(i, layer, this);
+            }
+
             if (!i)
                 m_row0WaitTime[layer] = x265_mdate();
             else if (i == m_numRows - 1)
@@ -1038,6 +1059,11 @@
         vmafFrameLevelScore();
 #endif

+    m_tmeDepLock.acquire();
+    m_tmeDeps.clear();
+    m_tmeDeps.resize(m_numRows);
+    m_tmeDepLock.release();
+
     if (m_param->maxSlices > 1)
     {
         PicYuv *reconPic = m_frame[layer]->m_reconPic[0];
@@ -1470,7 +1496,9 @@
     const uint32_t typeNum = m_idx_to_row[row] & 1;

     if (!typeNum)
+    {
         processRowEncoder(realRow, m_tld[threadId], layer);
+    }
     else
     {
         m_frameFilter.processRow(realRow, layer);
@@ -1600,6 +1628,12 @@
     if (tld.analysis.m_sliceMaxY < tld.analysis.m_sliceMinY)
         tld.analysis.m_sliceMaxY = tld.analysis.m_sliceMinY = 0;

+    if (m_top->m_threadedME && !slice->isIntra())
+    {
+        ScopedLock lock(m_tmeDepLock);
+        m_tmeDeps[row].internal = true;
+        m_top->m_threadedME->enqueueReadyRows(row, layer, this);
+    }

     while (curRow.completed < numCols)
     {
@@ -1609,6 +1643,29 @@
         const uint32_t cuAddr = lineStartCUAddr + col;
         CUData* ctu = curEncData.getPicCTU(cuAddr);
         const uint32_t bLastCuInSlice = (bLastRowInSlice & (col == numCols - 1)) ? 1 : 0;
+
+        /* Must wait for TME to finish before initCTU because both threads
+         * operate on the same CUData; the encoder's initCTU would corrupt
+         * data that deriveMVsForCTU is still reading. */
+        if (m_top->m_threadedME && slice->m_sliceType != I_SLICE)
+        {
+            int64_t waitStart = x265_mdate();
+            bool waited = false;
+
+            while (m_frame[layer]->m_ctuMEFlags[cuAddr].get() == 0)
+            {
+#ifdef DETAILED_CU_STATS
+                tld.analysis.m_stats[m_jpId].countTmeBlockedCTUs++;
+#endif
+                m_frame[layer]->m_ctuMEFlags[cuAddr].waitForChange(0);
+                waited = true;
+            }
+
+            int64_t waitEnd = x265_mdate();
+            if (waited)
+                ATOMIC_ADD(&m_totalThreadedMEWait[layer], waitEnd - waitStart);
+        }
+
         ctu->initCTU(*m_frame[layer], cuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);

         if (!layer && bIsVbv)
@@ -2324,88 +2381,179 @@
     }
 }

+void compute_film_grain_resolution(int width, int height,
+                                   int& apply_units_resolution_log2,
+                                   int& apply_horz_resolution,
+                                   int& apply_vert_resolution)
+{
+    unsigned long log2_width, log2_height;
+    BSF(log2_width, (unsigned long) width);
+    BSF(log2_height, (unsigned long) height);
+
+    int log2 = (log2_width < log2_height) ? log2_width : log2_height;
+    apply_units_resolution_log2 = log2;
+
+    int unit = 1 << log2;
+    apply_horz_resolution = width / unit;
+    apply_vert_resolution = height / unit;
+
+    return;
+}
+
 void FrameEncoder::readAomModel(AomFilmGrainCharacteristics* m_aomFilmGrain, FILE* Aomfilmgrain)
 {
     char const* errorMessage = "Error reading Aom FilmGrain characteristics\n";
     AomFilmGrain m_afg;
     m_afg.m_chroma_scaling_from_luma = 0;

+    int bitCount = 0;
+    bitCount += 4; // payload_less_than_4byte_flag(1) + film_grain_param_set_idx(3)
     x265_fread((char*)&m_aomFilmGrain->m_apply_grain, sizeof(int32_t), 1, Aomfilmgrain, errorMessage);
+    bitCount++;
     x265_fread((char*)&m_aomFilmGrain->m_grain_seed, sizeof(uint16_t), 1, Aomfilmgrain, errorMessage);
+    bitCount += 16;
     x265_fread((char*)&m_aomFilmGrain->m_update_grain, sizeof(int32_t), 1, Aomfilmgrain, errorMessage);
+    bitCount++;
     x265_fread((char*)&m_aomFilmGrain->m_num_y_points, sizeof(int32_t), 1, Aomfilmgrain, errorMessage);
+    bitCount += 4;
+
     if (m_aomFilmGrain->m_num_y_points)
     {
+        m_aomFilmGrain->point_y_value_increment_bits = 8;
+        bitCount += 3;
+        m_aomFilmGrain->point_y_scaling_bits = 8;
+        bitCount += 2;
         for (int i = 0; i < m_aomFilmGrain->m_num_y_points; i++)
         {
             for (int j = 0; j < 2; j++)
             {
                 x265_fread((char*)&m_aomFilmGrain->m_scaling_points_y[i][j], sizeof(int32_t), 1, Aomfilmgrain, errorMessage);
+                bitCount += 8;
             }
         }
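compute_film_grain_resolution deliberately uses BSF (lowest set bit) rather than floor(log2): the chosen apply unit is the largest power of two that divides both dimensions exactly, so the resolution fields never truncate. Worked numbers for 1920x1080:

    // 1920 = 15 << 7  -> BSF gives 7
    // 1080 = 135 << 3 -> BSF gives 3
    // log2 = min(7, 3) = 3, unit = 1 << 3 = 8
    // apply_horz_resolution = 1920 / 8 = 240
    // apply_vert_resolution = 1080 / 8 = 135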
View file
x265-4.1.tar/source/encoder/frameencoder.h -> x265-4.2.tar/source/encoder/frameencoder.h
Changed
@@ -41,6 +41,8 @@
 #include "reference.h"
 #include "nal.h"
 #include "temporalfilter.h"
+#include "threadedme.h"
+#include <queue>

 namespace X265_NS {
 // private x265 namespace
@@ -241,6 +243,9 @@
     int64_t                  m_slicetypeWaitTime[MAX_LAYERS];      // total elapsed time waiting for decided frame
     int64_t                  m_totalWorkerElapsedTime[MAX_LAYERS]; // total elapsed time spent by worker threads processing CTUs
     int64_t                  m_totalNoWorkerTime[MAX_LAYERS];      // total elapsed time without any active worker threads
+    int64_t                  m_totalThreadedMEWait[MAX_LAYERS];    // total time spent waiting by CTUs for ThreadedME
+    int64_t                  m_totalThreadedMETime[MAX_LAYERS];    // total time spent processing by ThreadedME
+
 #if DETAILED_CU_STATS
     CUStats                  m_cuStats;
 #endif
@@ -267,6 +272,19 @@

     int                      m_sLayerId;

+    std::queue<CTUTask>      m_tmeTasks;
+    Lock                     m_tmeTasksLock;
+
+    struct TMEDependencyState
+    {
+        bool internal;
+        bool external;
+        bool isQueued;
+    };
+
+    std::vector<TMEDependencyState> m_tmeDeps;
+    Lock                     m_tmeDepLock;
+
     class WeightAnalysis : public BondedTaskGroup
     {
     public:
View file
x265-4.1.tar/source/encoder/framefilter.cpp -> x265-4.2.tar/source/encoder/framefilter.cpp
Changed
@@ -663,7 +663,7 @@
         uint32_t cuAddr = lineStartCUAddr;
         if (m_param->bEnablePsnr)
         {
-            PicYuv* fencPic = m_frame->m_fencPic;
+            PicYuv* fencPic = m_param->bEnableTemporalFilter ? m_frame->m_mcstffencPic : m_frame->m_fencPic;

             intptr_t stride = reconPic->m_stride;
             uint32_t width  = reconPic->m_picWidth - m_pad[0];
@@ -689,7 +689,7 @@
         if (m_param->bEnableSsim && m_ssimBuf)
         {
             pixel *rec = reconPic->m_picOrg[0];
-            pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
+            pixel *fenc = m_param->bEnableTemporalFilter ? m_frame->m_mcstffencPic->m_picOrg[0] : m_frame->m_fencPic->m_picOrg[0];
             intptr_t stride1 = reconPic->m_stride;
             intptr_t stride2 = m_frame->m_fencPic->m_stride;
             uint32_t bEnd = ((row) == (this->m_numRows - 1));
View file
x265-4.1.tar/source/encoder/level.cpp -> x265-4.2.tar/source/encoder/level.cpp
Changed
@@ -31,7 +31,7 @@
 typedef struct
 {
     uint32_t maxLumaSamples;
-    uint32_t maxLumaSamplesPerSecond;
+    uint64_t maxLumaSamplesPerSecond;
     uint32_t maxBitrateMain;
     uint32_t maxBitrateHigh;
     uint32_t maxCpbSizeMain;
@@ -44,31 +44,26 @@

 LevelSpec levels[] =
 {
-    { 36864,    552960,     128,    MAX_UINT, 350,    MAX_UINT, 2, Level::LEVEL1,   "1",   10 },
-    { 122880,   3686400,    1500,   MAX_UINT, 1500,   MAX_UINT, 2, Level::LEVEL2,   "2",   20 },
-    { 245760,   7372800,    3000,   MAX_UINT, 3000,   MAX_UINT, 2, Level::LEVEL2_1, "2.1", 21 },
-    { 552960,   16588800,   6000,   MAX_UINT, 6000,   MAX_UINT, 2, Level::LEVEL3,   "3",   30 },
-    { 983040,   33177600,   10000,  MAX_UINT, 10000,  MAX_UINT, 2, Level::LEVEL3_1, "3.1", 31 },
-    { 2228224,  66846720,   12000,  30000,    12000,  30000,    4, Level::LEVEL4,   "4",   40 },
-    { 2228224,  133693440,  20000,  50000,    20000,  50000,    4, Level::LEVEL4_1, "4.1", 41 },
-    { 8912896,  267386880,  25000,  100000,   25000,  100000,   6, Level::LEVEL5,   "5",   50 },
-    { 8912896,  534773760,  40000,  160000,   40000,  160000,   8, Level::LEVEL5_1, "5.1", 51 },
-    { 8912896,  1069547520, 60000,  240000,   60000,  240000,   8, Level::LEVEL5_2, "5.2", 52 },
-    { 35651584, 1069547520, 60000,  240000,   60000,  240000,   8, Level::LEVEL6,   "6",   60 },
-    { 35651584, 2139095040, 120000, 480000,   120000, 480000,   8, Level::LEVEL6_1, "6.1", 61 },
-    { 35651584, 4278190080U, 240000, 800000,  240000, 800000,   6, Level::LEVEL6_2, "6.2", 62 },
-    { MAX_UINT, MAX_UINT,   MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 },
+    { 36864,     552960,         128,      MAX_UINT, 350,      MAX_UINT, 2, Level::LEVEL1,   "1",   10 },
+    { 122880,    3686400,        1500,     MAX_UINT, 1500,     MAX_UINT, 2, Level::LEVEL2,   "2",   20 },
+    { 245760,    7372800,        3000,     MAX_UINT, 3000,     MAX_UINT, 2, Level::LEVEL2_1, "2.1", 21 },
+    { 552960,    16588800,       6000,     MAX_UINT, 6000,     MAX_UINT, 2, Level::LEVEL3,   "3",   30 },
+    { 983040,    33177600,       10000,    MAX_UINT, 10000,    MAX_UINT, 2, Level::LEVEL3_1, "3.1", 31 },
+    { 2228224,   66846720,       12000,    30000,    12000,    30000,    4, Level::LEVEL4,   "4",   40 },
+    { 2228224,   133693440,      20000,    50000,    20000,    50000,    4, Level::LEVEL4_1, "4.1", 41 },
+    { 8912896,   267386880,      25000,    100000,   25000,    100000,   6, Level::LEVEL5,   "5",   50 },
+    { 8912896,   534773760,      40000,    160000,   40000,    160000,   8, Level::LEVEL5_1, "5.1", 51 },
+    { 8912896,   1069547520,     60000,    240000,   60000,    240000,   8, Level::LEVEL5_2, "5.2", 52 },
+    { 35651584,  1069547520,     60000,    240000,   60000,    240000,   8, Level::LEVEL6,   "6",   60 },
+    { 35651584,  2139095040,     120000,   480000,   120000,   480000,   8, Level::LEVEL6_1, "6.1", 61 },
+    { 35651584,  4278190080U,    240000,   800000,   240000,   800000,   6, Level::LEVEL6_2, "6.2", 62 },
+    { 80216064,  4812963840ULL,  320000,   1600000,  240000,   1600000,  6, Level::LEVEL6_3, "6.3", 63 },
+    { 142606336, 4812963840ULL,  320000,   1600000,  240000,   1600000,  6, Level::LEVEL7,   "7",   70 },
+    { 142606336, 8556380160ULL,  480000,   3200000,  480000,   3200000,  6, Level::LEVEL7_1, "7.1", 71 },
+    { 142606336, 17112760320ULL, 960000,   6400000,  960000,   6400000,  6, Level::LEVEL7_2, "7.2", 72 },
+    { MAX_UINT,  MAX_UINT64,     MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 },
 };

-static inline int _confirm(x265_param* param, bool bflag, const char* message)
-{
-    if (!bflag)
-        return 0;
-
-    x265_log(param, X265_LOG_ERROR, "%s\n", message);
-    return 1;
-}
-
 /* determine minimum decoder level required to decode the described video */
 void determineLevel(const x265_param &param, VPS& vps)
 {
@@ -153,7 +148,7 @@
 #endif

     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
-    uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
+    uint64_t samplesPerSec = (uint64_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
     uint32_t bitrate = param.rc.vbvMaxBitrate ? param.rc.vbvMaxBitrate : param.rc.bitrate;

     const uint32_t MaxDpbPicBuf = param.bEnableSCC ? 7 : 6;
@@ -164,9 +159,9 @@
     uint32_t i;
     if (param.bLossless)
     {
-        i = 13;
+        i = NumLevels - 1;
         vps.ptl.minCrForLevel = 1;
-        vps.ptl.maxLumaSrForLevel = MAX_UINT;
+        vps.ptl.maxLumaSrForLevel = MAX_UINT64;
         vps.ptl.levelIdc = Level::LEVEL8_5;
         vps.ptl.tierFlag = Level::MAIN;
     }
@@ -401,7 +396,7 @@
     bool allowHighTier = l.maxBitrateHigh < MAX_UINT && param.bHighTier;

     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
-    uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
+    uint64_t samplesPerSec = (uint64_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
     bool ok = true;
     if (lumaSamples > l.maxLumaSamples)
         ok = false;
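Widening maxLumaSamplesPerSecond to uint64_t is forced by the new levels: Level 6.2 tops out at 4,278,190,080 samples/s, which still fits in 32 bits, but Level 7.2 allows 17,112,760,320 (for example 16384x8704 luma at 120 fps), roughly four times the uint32_t ceiling. The same overflow applies to the computed samplesPerSec, hence the matching cast:

    // 16384 * 8704 = 142,606,336 luma samples per picture (the Level 7.x cap)
    // 142,606,336 * 120 fps = 17,112,760,320 samples/s > UINT32_MAX (4,294,967,295)
    uint64_t samplesPerSec = 142606336ULL * 120; // would wrap to 4,227,858,432 in uint32_t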
View file
x265-4.1.tar/source/encoder/motion.cpp -> x265-4.2.tar/source/encoder/motion.cpp
Changed
@@ -628,6 +628,155 @@
     }
 }

+int MotionEstimate::diamondSearch(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, MV& outMV)
+{
+    int bcost = INT_MAX;
+    MV bmv(0, 0);
+    MV omv = bmv;
+
+    ALIGN_VAR_16(int, costs[16]);
+
+    intptr_t stride = ref->lumaStride;
+    pixel* fenc = fencPUYuv.m_buf[0];
+    pixel* fref = ref->fpelPlane[0] + blockOffset;
+
+    for (int16_t dist = 1; dist <= 4; dist <<= 1)
+    {
+        const int32_t top = omv.y - dist;
+        const int32_t bottom = omv.y + dist;
+        const int32_t left = omv.x - dist;
+        const int32_t right = omv.x + dist;
+        const int32_t top2 = omv.y - (dist >> 1);
+        const int32_t bottom2 = omv.y + (dist >> 1);
+        const int32_t left2 = omv.x - (dist >> 1);
+        const int32_t right2 = omv.x + (dist >> 1);
+
+        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
+        {
+            COST_MV_X4(omv.x, top, omv.x, bottom, left, omv.y, right, omv.y);
+            COST_MV_X4(left2, top2, right2, top2, left2, bottom2, right2, bottom2);
+        }
+        else // check border for each mv
+        {
+            if (top >= mvmin.y) // check top
+            {
+                COST_MV(omv.x, top);
+            }
+            if (top2 >= mvmin.y) // check half top
+            {
+                if (left2 >= mvmin.x) // check half left
+                {
+                    COST_MV(left2, top2);
+                }
+                if (right2 <= mvmax.x) // check half right
+                {
+                    COST_MV(right2, top2);
+                }
+            }
+            if (left >= mvmin.x) // check left
+            {
+                COST_MV(left, omv.y);
+            }
+            if (right <= mvmax.x) // check right
+            {
+                COST_MV(right, omv.y);
+            }
+            if (bottom2 <= mvmax.y) // check half bottom
+            {
+                if (left2 >= mvmin.x) // check half left
+                {
+                    COST_MV(left2, bottom2);
+                }
+                if (right2 <= mvmax.x) // check half right
+                {
+                    COST_MV(right2, bottom2);
+                }
+            }
+            if (bottom <= mvmax.y) // check bottom
+            {
+                COST_MV(omv.x, bottom);
+            }
+        }
+    }
+
+    for (int16_t dist = 8; dist <= 64; dist += 8)
+    {
+        const int32_t top = omv.y - dist;
+        const int32_t bottom = omv.y + dist;
+        const int32_t left = omv.x - dist;
+        const int32_t right = omv.x + dist;
+
+        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
+        {
+            COST_MV_X4(omv.x, top, left, omv.y, right, omv.y, omv.x, bottom);
+
+            for (int16_t index = 1; index < 4; index++)
+            {
+                int32_t posYT = top + ((dist >> 2) * index);
+                int32_t posYB = bottom - ((dist >> 2) * index);
+                int32_t posXL = omv.x - ((dist >> 2) * index);
+                int32_t posXR = omv.x + ((dist >> 2) * index);
+
+                COST_MV_X4(posXL, posYT,
+                           posXR, posYT,
+                           posXL, posYB,
+                           posXR, posYB);
+            }
+        }
+        else // check border for each mv
+        {
+            if (top >= mvmin.y) // check top
+            {
+                COST_MV(omv.x, top);
+            }
+            if (left >= mvmin.x) // check left
+            {
+                COST_MV(left, omv.y);
+            }
+            if (right <= mvmax.x) // check right
+            {
+                COST_MV(right, omv.y);
+            }
+            if (bottom <= mvmax.y) // check bottom
+            {
+                COST_MV(omv.x, bottom);
+            }
+            for (int16_t index = 1; index < 4; index++)
+            {
+                int32_t posYT = top + ((dist >> 2) * index);
+                int32_t posYB = bottom - ((dist >> 2) * index);
+                int32_t posXL = omv.x - ((dist >> 2) * index);
+                int32_t posXR = omv.x + ((dist >> 2) * index);
+
+                if (posYT >= mvmin.y) // check top
+                {
+                    if (posXL >= mvmin.x) // check left
+                    {
+                        COST_MV(posXL, posYT);
+                    }
+                    if (posXR <= mvmax.x) // check right
+                    {
+                        COST_MV(posXR, posYT);
+                    }
+                }
+                if (posYB <= mvmax.y) // check bottom
+                {
+                    if (posXL >= mvmin.x) // check left
+                    {
+                        COST_MV(posXL, posYB);
+                    }
+                    if (posXR <= mvmax.x) // check right
+                    {
+                        COST_MV(posXR, posYB);
+                    }
+                }
+            }
+        }
+    }
+    outMV = bmv;
+    return bcost;
+}
+
 void MotionEstimate::refineMV(ReferencePlanes* ref,
                               const MV&        mvmin,
                               const MV&        mvmax,
@@ -1596,6 +1745,14 @@
     // check mv range for slice bound
     X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");

+    // Get a chance to ZeroMv
+    if (bmv.notZero())
+    {
+        int cost = subpelCompare(ref, MV(0, 0), satd) + mvcost(MV(0, 0));
+        if (cost <= bcost)
+            bmv = MV(0, 0);
+    }
+
     x265_emms();
     outQMv = bmv;
     return bcost;
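Two ideas land in motion.cpp: diamondSearch probes a compact diamond at distances 1, 2, 4 (axis points plus half-distance corners) and then coarse rings at 8..64 in steps of 8, clipping every candidate against the legal MV range; and the zero-MV re-check at the end of subpel refinement gives the cheap-to-code (0,0) vector one last chance (the changelog's "Improve Slices feature with check zeroMv"). A schematic of the candidates per ring, as comments only (offsets relative to omv):

    // dist = d in {1, 2, 4}:
    //   axis points   (0,-d) (0,+d) (-d,0) (+d,0)
    //   half corners  (-d/2,-d/2) (+d/2,-d/2) (-d/2,+d/2) (+d/2,+d/2)
    // dist = d in {8, 16, ..., 64}: the four axis points plus, for k = 1..3,
    //   edge points at (+-(d>>2)*k horizontally, top/bottom shifted by (d>>2)*k),
    //   tracing the diagonal edges of the diamond toward its corners.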
View file
x265-4.1.tar/source/encoder/motion.h -> x265-4.2.tar/source/encoder/motion.h
Changed
@@ -99,6 +99,8 @@

     int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);

+    int diamondSearch(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, MV& outMV);
+
 protected:

     inline void StarPatternSearch(ReferencePlanes *ref,
View file
x265-4.1.tar/source/encoder/ratecontrol.cpp -> x265-4.2.tar/source/encoder/ratecontrol.cpp
Changed
@@ -71,7 +71,7 @@
     }\
 }

-inline int calcScale(uint32_t x)
+inline int calcScale(uint64_t x)
 {
     static uint8_t lut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
     int y, z = (((x & 0xffff) - 1) >> 27) & 16;
@@ -254,15 +254,16 @@
     m_relativeComplexity = NULL;

     // vbv initialization
-    m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
-    m_param->rc.vbvMaxBitrate = x265_clip3(0, 2000000, m_param->rc.vbvMaxBitrate);
-    m_param->rc.vbvBufferInit = x265_clip3(0.0, 2000000.0, m_param->rc.vbvBufferInit);
-    m_param->vbvBufferEnd = x265_clip3(0.0, 2000000.0, m_param->vbvBufferEnd);
+    m_param->rc.vbvBufferSize = x265_clip3(0, 8000000, m_param->rc.vbvBufferSize);
+    m_param->rc.vbvMaxBitrate = x265_clip3(0, 8000000, m_param->rc.vbvMaxBitrate);
+    m_param->rc.vbvBufferInit = x265_clip3(0.0, 8000000.0, m_param->rc.vbvBufferInit);
+    m_param->vbvBufferEnd = x265_clip3(0.0, 8000000.0, m_param->vbvBufferEnd);
     m_initVbv = false;
     m_singleFrameVbv = 0;
     m_rateTolerance = 1.0;
     m_encodedSegmentBits = 0;
     m_segDur = 0;
+    m_totalframesInSegment = 0;

     if (m_param->rc.vbvBufferSize)
     {
@@ -321,6 +322,7 @@
     m_leadingNoBSatd = 0;
     m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor);
     m_pbOffset = 6.0 * X265_LOG2(m_param->rc.pbFactor);
+    m_iBits = 0;

     for (int i = 0; i < QP_MAX_MAX; i++)
         m_qpToEncodedBits[i] = 0;
@@ -336,6 +338,8 @@
         m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
         m_lmin[i] = x265_qp2qScale(m_param->rc.qpMin);
         m_lmax[i] = x265_qp2qScale(m_param->rc.qpMax);
+        m_frameCountSeg[i] = 0;
+        m_movingSumComplexitySeg[i] = 0;
     }

     if (m_param->rc.rateControlMode == X265_RC_CQP)
@@ -407,8 +411,8 @@
             x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
                      m_param->rc.vbvBufferSize);
         }
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
+        uint64_t vbvBufferSize = m_param->rc.vbvBufferSize * 1000ULL;
+        uint64_t vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000ULL;

         if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
         {
@@ -416,9 +420,9 @@
             vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
             vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
         }
-        m_bufferRate = vbvMaxBitrate / m_fps;
-        m_vbvMaxRate = vbvMaxBitrate;
-        m_bufferSize = vbvBufferSize;
+        m_bufferRate = static_cast<double>(vbvMaxBitrate) / m_fps;
+        m_vbvMaxRate = static_cast<double>(vbvMaxBitrate);
+        m_bufferSize = static_cast<double>(vbvBufferSize);
         m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;

         if (m_param->rc.vbvBufferInit > 1.)
@@ -492,7 +496,11 @@
     m_accumPNorm = .01;
     m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm;
-
+    for (int i = 0; i < 3; i++)
+    {
+        m_frameCountSeg[i] = 0;
+        m_movingSumComplexitySeg[i] = 0;
+    }
     /* Frame Predictors used in vbv */
     initFramePredictors();

     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
@@ -664,7 +672,6 @@
     }

     /* read stats */
     p = statsIn;
-    double totalQpAq = 0;
     for (int i = 0; i < m_numEntries; i++)
     {
         RateControlEntry *rce, *rcePocOrder;
@@ -729,7 +736,6 @@
             return false;
         }
         rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
-        totalQpAq += qpAq;
         rce->qpNoVbv = qNoVbv;
         rce->qpaRc = qpRc;
         rce->qpAq = qpAq;
@@ -828,8 +834,8 @@
 {
     if (m_isVbv)
     {
-        m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
-        m_param->rc.vbvMaxBitrate = x265_clip3(0, 2000000, m_param->rc.vbvMaxBitrate);
+        m_param->rc.vbvBufferSize = x265_clip3(0, 8000000, m_param->rc.vbvBufferSize);
+        m_param->rc.vbvMaxBitrate = x265_clip3(0, 8000000, m_param->rc.vbvMaxBitrate);
         if (m_param->reconfigWindowSize)
             m_param->rc.vbvMaxBitrate = (int)(m_param->rc.vbvMaxBitrate * (double)(m_fps / m_param->reconfigWindowSize));
         if (m_param->rc.vbvMaxBitrate < m_param->rc.bitrate &&
@@ -845,11 +851,11 @@
             x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
                      m_param->rc.vbvBufferSize);
         }
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
-        m_bufferRate = vbvMaxBitrate / m_fps;
-        m_vbvMaxRate = vbvMaxBitrate;
-        m_bufferSize = vbvBufferSize;
+        uint64_t vbvBufferSize = m_param->rc.vbvBufferSize * 1000ULL;
+        uint64_t vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000ULL;
+        m_bufferRate = static_cast<double>(vbvMaxBitrate) / m_fps;
+        m_vbvMaxRate = static_cast<double>(vbvMaxBitrate);
+        m_bufferSize = static_cast<double>(vbvBufferSize);
         m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
     }
     if (m_param->rc.rateControlMode == X265_RC_CRF)
@@ -891,8 +897,8 @@

 void RateControl::initHRD(SPS& sps)
 {
-    int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
-    int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
+    uint64_t vbvBufferSize = m_param->rc.vbvBufferSize * 1000ULL;
+    uint64_t vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000ULL;

     // Init HRD
     HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
@@ -904,12 +910,12 @@

     // normalize HRD size and rate to the value / scale notation
     hrd->bitRateScale = x265_clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT);
-    hrd->bitRateValue = (vbvMaxBitrate >> (hrd->bitRateScale + BR_SHIFT));
+    hrd->bitRateValue = static_cast<uint32_t>(vbvMaxBitrate >> (hrd->bitRateScale + BR_SHIFT));

     hrd->cpbSizeScale = x265_clip3(0, 15, calcScale(vbvBufferSize) - CPB_SHIFT);
-    hrd->cpbSizeValue = (vbvBufferSize >> (hrd->cpbSizeScale + CPB_SHIFT));
-    int bitRateUnscale = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
-    int cpbSizeUnscale = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
+    hrd->cpbSizeValue = static_cast<uint32_t>(vbvBufferSize >> (hrd->cpbSizeScale + CPB_SHIFT));
+    uint64_t bitRateUnscale = (uint64_t)hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
+    uint64_t cpbSizeUnscale = (uint64_t)hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);

     // arbitrary
     #define MAX_DURATION 0.5
@@ -1108,7 +1114,7 @@

 bool RateControl::initPass2()
 {
-    uint64_t allConstBits = 0, allCodedBits = 0;
+    uint64_t allConstBits = 0;
     uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
     int startIndex, endIndex;
     int fps = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
@@ -1127,7 +1133,6 @@
     for (endIndex = m_start; endIndex < m_numEntries; endIndex++)
     {
         allConstBits += m_rce2Pass[endIndex].miscBits;
-        allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
     }

     if (allAvailableBits < allConstBits)
@@ -1225,14 +1230,13 @@
     int t0, t1;
     double qScaleMin = x265_qp2qScale(m_param->rc.qpMin);
     double qScaleMax = x265_qp2qScale(m_param->rc.qpMax);
-    int iterations = 0, adjMin, adjMax;
+    int adjMin, adjMax;
     CHECKED_MALLOC(fills, double, m_numEntries + 1);
     fills++;

     /* adjust overall stream size */
     do
     {
-        iterations++;
         prevBits = expectedBits;

         if (expectedBits)
@@ -1370,6 +1374,14 @@
         //Reset SBRC buffer
         m_encodedSegmentBits = 0;
         m_segDur = 0;
+        m_iBits = 0;
+        m_totalframesInSegment = m_param->totalFrames - m_framesDone;
+
+        for (int i = 0; i < 3; i++)
+        {
+            m_frameCountSeg[i] = 0;
+            m_movingSumComplexitySeg[i] = 0;
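The VBV widening is plain integer-overflow hygiene: the clamp ceiling rises from 2,000,000 to 8,000,000 kbit, and once converted to bits the old int arithmetic would wrap. Worked numbers (the 60 fps figure is just an example):

    // old ceiling: 2,000,000 kbit * 1000 = 2.0e9 bits, just under INT32_MAX (2,147,483,647)
    // new ceiling: 8,000,000 kbit * 1000 = 8.0e9 bits, which needs 64 bits
    uint64_t vbvMaxBitrate = 8000000ULL * 1000;                    // 8,000,000,000
    double bufferRate = static_cast<double>(vbvMaxBitrate) / 60.0; // per-frame budget at 60 fps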
View file
x265-4.1.tar/source/encoder/ratecontrol.h -> x265-4.2.tar/source/encoder/ratecontrol.h
Changed
@@ -178,7 +178,7 @@
     int m_qpConstant[3];
     int m_lastNonBPictType;
     int m_framesDone;        /* # of frames passed through RateCotrol already */
-
+    int64_t m_iBits;
     double m_cplxrSum;       /* sum of bits*qscale/rceq */
     double m_wantedBitsWindow; /* target bitrate * window */
     double m_accumPQp;       /* for determining I-frame quant */
@@ -194,6 +194,8 @@
     int64_t m_totalBits;     /* total bits used for already encoded frames (after ammortization) */
     int64_t m_encodedBits;   /* bits used for encoded frames (without ammortization) */
     int64_t m_encodedSegmentBits; /* bits used for encoded frames in a segment*/
+    double m_movingSumComplexitySeg[3];
+    int m_frameCountSeg[3];
     double m_segDur;
     double m_fps;
     int64_t m_satdCostWindow[50];
@@ -201,6 +203,7 @@
     int m_sliderPos;
     int64_t m_lastRemovedSatdCost;
     double m_movingAvgSum;
+    int m_totalframesInSegment;

     /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
     int64_t m_lastBsliceSatdCost;
@@ -302,8 +305,7 @@
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
     double tuneAbrQScaleFromFeedback(double qScale);
     double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
-    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
-    double tuneQscaleToUpdatedBitrate(Frame* curFrame, double q); // Tune qScale according to updated bitrate
+    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
     void accumPQpUpdate();
     int getPredictorType(int lowresSliceType, int sliceType);
View file
x265-4.1.tar/source/encoder/rdcost.h -> x265-4.2.tar/source/encoder/rdcost.h
Changed
@@ -76,18 +76,13 @@
             qpCr = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[1] + slice.m_chromaQpOffset[1]);
         }

-        if (slice.m_sps->chromaFormatIdc == X265_CSP_I444)
-        {
-            int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
-            uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-            m_chromaDistWeight[0] = lambdaOffset;
+        int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
+        uint16_t lambdaOffset = x265_chroma_lambda2_offset_tab[chroma_offset_idx];
+        m_chromaDistWeight[0] = lambdaOffset;

-            chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
-            lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-            m_chromaDistWeight[1] = lambdaOffset;
-        }
-        else
-            m_chromaDistWeight[0] = m_chromaDistWeight[1] = 256;
+        chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
+        lambdaOffset = x265_chroma_lambda2_offset_tab[chroma_offset_idx];
+        m_chromaDistWeight[1] = lambdaOffset;
     }

     void setLambda(double lambda2, double lambda)
View file
x265-4.1.tar/source/encoder/sao.cpp -> x265-4.2.tar/source/encoder/sao.cpp
Changed
@@ -733,7 +733,6 @@
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
     const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);
-    const pixel* fenc;
     const pixel* rec;
     intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
     uint32_t picWidth = m_param->sourceWidth;
@@ -863,7 +862,6 @@
             skipR = 5;
         }

-        fenc = fenc0;
         rec = rec0;

         startX = !lpelx;
@@ -873,7 +871,6 @@
         endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
         if (startY)
         {
-            fenc += stride;
             rec += stride;
         }
@@ -888,7 +885,6 @@
             skipB = 4;
             skipR = 5;
         }
-        fenc = fenc0;
         rec = rec0;

         startX = !lpelx;
         endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
@@ -898,7 +894,6 @@

         if (startY)
         {
-            fenc += stride;
             rec += stride;
         }
View file
x265-4.1.tar/source/encoder/search.cpp -> x265-4.2.tar/source/encoder/search.cpp
Changed
@@ -33,6 +33,7 @@

 #include "analysis.h"  // TLD
 #include "framedata.h"
+#include "encoder.h"

 using namespace X265_NS;

@@ -222,6 +223,340 @@
     return quantQP;
 }

+void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData& cu, PicYuv* fencPic, int puOffset, PartSize part, int areaIdx, int finalIdx, bool isMVP, const int* neighborIdx)
+{
+#ifdef DETAILED_CU_STATS
+    m_stats[cu.m_encData->m_frameEncoderID].countMotionEstimate++;
+#endif
+
+    int satdCost = 0;
+    int numPredDir = slice->isInterP() ? 1 : 2;
+    int searchRange = isMVP ? 32 : m_param->searchRange;
+
+    MV mvp(0, 0);
+    MV mvzero(0, 0);
+
+    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
+    MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
+
+    MotionData bestME[2];
+    bestME[0].cost = MAX_UINT;
+    bestME[1].cost = MAX_UINT;
+
+    int numPart = cu.getNumPartInter(0);
+    uint32_t lastMode = 0;
+
+    int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
+    int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
+
+    int numMvc = 0;
+    for (int puIdx = 0; puIdx < numPart; puIdx++)
+    {
+        PredictionUnit pu(cu, cuGeom, puIdx);
+
+        int pos = finalIdx + puIdx * puOffset;
+        int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;
+
+        InterNeighbourMV neighbours[6];
+        if (!isMVP)
+            cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, neighbours);
+
+        for (int list = 0; list < numPredDir; list++)
+        {
+            int numIdx = slice->m_numRefIdx[list];
+            for (int ref = 0; ref < numIdx; ref++)
+            {
+                getBlkBits(part, slice->isInterP(), puIdx, lastMode, m_listSelBits);
+                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
+                bits += getTUBits(ref, numIdx);
+
+                MV mvmin, mvmax, outmv, mvp_lowres;
+                mvp = !isMVP ? m_areaBestMV[areaIdx][list][ref] : mvp;
+
+                MV zeroMV[2] = { 0, 0 };
+                const MV* amvp = zeroMV;
+                int mvpIdx = 0;
+
+                bool bLowresMVP = false;
+                if (!isMVP)
+                {
+                    for (int dir = MD_LEFT; dir <= MD_ABOVE_LEFT; dir++)
+                    {
+                        int neighIdx = neighborIdx[dir];
+                        if (neighIdx >= 0)
+                        {
+                            MEData& neighborData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + neighIdx];
+                            for (int i = 0; i < 2; i++)
+                            {
+                                neighbours[dir].mv[i] = neighborData.mv[i];
+                                neighbours[dir].refIdx[i] = neighborData.ref[i];
+                            }
+                            neighbours[dir].isAvailable = (neighborData.ref[0] >= 0 || neighborData.ref[1] >= 0);
+                        }
+                        else
+                        {
+                            for (int i = 0; i < 2; i++)
+                                neighbours[dir].refIdx[i] = -1;
+                            neighbours[dir].isAvailable = false;
+                        }
+                    }
+
+                    numMvc = cu.getPMV(neighbours, list, ref, amvpCand[list][ref], mvc);
+                    if (numMvc > 0)
+                    {
+                        amvp = amvpCand[list][ref];
+                        mvpIdx = selectMVP(cu, pu, amvp, list, ref);
+                        mvp = amvp[mvpIdx];
+                    }
+                    else if (slice->m_refFrameList[list][ref]->m_encData->m_slice->m_sliceType != I_SLICE)
+                    {
+                        MEData meData = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
+
+                        bool bi = (meData.ref[0] >= 0 && meData.ref[1] >= 0);
+                        bool uniL0 = (meData.ref[0] >= 0 && meData.ref[1] == REF_NOT_VALID);
+                        bool uniL1 = (meData.ref[1] >= 0 && meData.ref[0] == REF_NOT_VALID);
+
+                        if (uniL0)
+                            mvp = meData.mv[0];
+                        else if (uniL1)
+                            mvp = meData.mv[1];
+                        else if (bi)
+                            mvp = meData.mv[list];
+                    }
+                }
+
+                m_me.setMVP(mvp);
+
+                if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad))
+                {
+                    uint32_t blockX = cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + (pu.width >> 1);
+                    uint32_t blockY = cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + (pu.height >> 1);
+
+                    if (blockX < m_slice->m_sps->picWidthInLumaSamples && blockY < m_slice->m_sps->picHeightInLumaSamples)
+                    {
+                        MV lmv = getLowresMV(cu, pu, list, ref);
+                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
+                        if (lmv.notZero() && !layer)
+                        {
+                            mvc[numMvc++] = lmv;
+                            bLowresMVP = true;
+                        }
+                        mvp_lowres = lmv;
+                    }
+                }
+
+                PicYuv* recon = slice->m_mref[list][ref].reconPic;
+                int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0);
+
+                if (m_param->searchMethod == X265_SEA)
+                {
+                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                        m_me.integral[planes] = slice->m_refFrameList[list][ref]->m_encData->m_meIntegral[planes] + offset;
+                }
+
+                m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
+                setSearchRange(cu, mvp, searchRange, mvmin, mvmax);
+
+                if (isMVP)
+                {
+                    satdCost = m_me.diamondSearch(&slice->m_mref[list][ref], mvmin, mvmax, outmv);
+                    m_areaBestMV[areaIdx][list][ref] = outmv;
+                }
+                else
+                {
+                    m_vertRestriction = slice->m_refPOCList[list][ref] == slice->m_poc;
+                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
+                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+
+                    if (bLowresMVP && mvp_lowres.notZero() && mvp_lowres != mvp)
+                    {
+                        MV outmv_lowres;
+                        bLowresMVP = false;
+                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+                            m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+
+                        if (lowresMvCost < satdCost)
+                        {
+                            outmv = outmv_lowres;
+                            satdCost = lowresMvCost;
+                            bLowresMVP = true;
+                        }
+                    }
+                }
+
+                bits += m_me.bitcost(outmv);
+                uint32_t mvCost = m_me.mvcost(outmv);
+                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+
+                if (!isMVP)
+                {
+                    if (bLowresMVP)
+                        updateMVP(mvp, outmv, bits, cost, mvp_lowres);
+
+                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
+                }
+                if (cost < bestME[list].cost)
+                {
+                    bestME[list].mv = outmv;
+                    bestME[list].mvp = mvp;
+                    bestME[list].mvpIdx = 0;
+                    bestME[list].cost = cost;
+                    bestME[list].bits = bits;
+                    bestME[list].mvCost = mvCost;
+                    bestME[list].ref = ref;
+                }
+            }
+        }
+
+        if (isMVP)
+            return;
x265-4.1.tar/source/encoder/search.h -> x265-4.2.tar/source/encoder/search.h
Changed
@@ -179,6 +179,7 @@
     int64_t  pmodeBlockTime;    // elapsed worker time blocked for pmode batch completion
     int64_t  weightAnalyzeTime; // elapsed worker time analyzing reference weights
     int64_t  totalCTUTime;      // elapsed worker time in compressCTU (includes pmode master)
+    int64_t  tmeTime;           // elapsed worker time in threadedME

     uint32_t skippedMotionReferences[NUM_CU_DEPTH];
     uint32_t totalMotionReferences[NUM_CU_DEPTH];
@@ -195,6 +196,8 @@
     uint64_t countPModeTasks;
     uint64_t countPModeMasters;
     uint64_t countWeightAnalyze;
+    uint64_t countTmeTasks;
+    uint64_t countTmeBlockedCTUs;
     uint64_t totalCTUs;

     CUStats() { clear(); }
@@ -227,6 +230,7 @@
         pmodeBlockTime += other.pmodeBlockTime;
         weightAnalyzeTime += other.weightAnalyzeTime;
         totalCTUTime += other.totalCTUTime;
+        tmeTime += other.tmeTime;

         countIntraAnalysis += other.countIntraAnalysis;
         countMotionEstimate += other.countMotionEstimate;
@@ -236,6 +240,8 @@
         countPModeTasks += other.countPModeTasks;
         countPModeMasters += other.countPModeMasters;
         countWeightAnalyze += other.countWeightAnalyze;
+        countTmeTasks += other.countTmeTasks;
+        countTmeBlockedCTUs += other.countTmeBlockedCTUs;
         totalCTUs += other.totalCTUs;

         other.clear();
@@ -288,6 +294,8 @@

     bool m_vertRestriction;

+    MV m_areaBestMV[5][2][MAX_NUM_REF];
+
 #if ENABLE_SCC_EXT
     int m_ibcEnabled;
     int m_numBVs;
@@ -341,6 +349,16 @@

     MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);

+    /**
+     * @brief Run motion estimation for one PU partition shape and persist the best ME result.
+     *
+     * Used by Analysis threaded-ME flow. With isMVP=true this bootstraps area MVPs,
+     * and with isMVP=false it performs full PU ME using spatial/temporal neighbors
+     * and stores results into per-CTU MV slots addressed by finalIdx/puOffset.
+     */
+    void puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData& ctu, PicYuv* fencPic, int puOffset, PartSize part, int areaIdx, int finalIdx,
+                            bool isMVP, const int* neighborIdx = NULL);
+
 #if ENABLE_SCC_EXT
     void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2], MV* iMVCandList = NULL);
     bool     predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
View file
x265-4.1.tar/source/encoder/sei.h -> x265-4.2.tar/source/encoder/sei.h
Changed
@@ -204,83 +204,171 @@ int32_t m_chroma_scaling_from_luma; int32_t m_grain_scale_shift; uint16_t m_grain_seed; + bool subsamplingX; + bool subsamplingY; + bool predict_scaling_flag; + bool predict_y_scaling_flag; + bool predict_cb_scaling_flag; + bool predict_cr_scaling_flag; + int units_resolution_log2; + int horz_resolution; + int vert_resolution; + bool luma_only_flag; + int point_y_value_increment_bits; + int point_y_scaling_bits; + int point_cb_value_increment_bits; + int point_cb_scaling_bits; + int point_cr_value_increment_bits; + int point_cr_scaling_bits; + int cb_scaling_offset; + int cr_scaling_offset; + int payload_size; + int payload_bits; - void writeSEI(const SPS&) + void writeSEI(const SPS& sps) { - WRITE_CODE(0x26, 8, "country_code"); + WRITE_CODE(0xB5, 8, "country_code"); WRITE_CODE(0x5890, 16, "provider_code"); - WRITE_CODE(0x0001, 16, "provider_oriented_code"); - WRITE_FLAG(m_apply_grain, "afgs1_enable_flag"); - WRITE_CODE(m_grain_seed, 16, "grain_seed"); + WRITE_CODE(0x0001, 8, "provider_oriented_code"); + WRITE_FLAG(1, "afgs1_enable_flag"); + WRITE_CODE(0, 4, "reserved_4bits"); + WRITE_CODE(0, 3, "num_film_grain_sets_minus1"); + WRITE_FLAG(payload_size < 4 ? 1 : 0, "payload_less_than_4byte_flag"); + WRITE_CODE(payload_size, payload_bits, "payload_size"); + WRITE_CODE(0, 3, "film_grain_param_set_idx"); - WRITE_CODE(m_update_grain, 1, "update_grain"); - WRITE_CODE(m_num_y_points, 4, "num_y_points"); - if (m_num_y_points) + WRITE_FLAG(m_apply_grain, "apply_grain_flag"); + WRITE_CODE(m_grain_seed, 16, "grain_seed"); + WRITE_FLAG(1, "update_grain_flag"); + WRITE_CODE(units_resolution_log2, 4, "apply_units_resolution_log2"); + WRITE_CODE(horz_resolution, 12, "apply_horz_resolution"); + WRITE_CODE(vert_resolution, 12, "apply_vert_resolution"); + WRITE_FLAG(luma_only_flag, "luma_only_flag"); + if (!luma_only_flag) { + WRITE_FLAG(subsamplingX, "subsampling_x"); + WRITE_FLAG(subsamplingY, "subsampling_y"); + } + WRITE_FLAG(sps.vuiParameters.videoSignalTypePresentFlag, "video_signal_characteristics_flag"); + if (sps.vuiParameters.videoSignalTypePresentFlag) { - for (int i = 0; i < m_num_y_points; i++) + WRITE_CODE(m_bitDepth - 8, 3, "bit_depth_minus8"); + WRITE_FLAG(sps.vuiParameters.colourDescriptionPresentFlag, "cicp_info_present_flag"); + if (sps.vuiParameters.colourDescriptionPresentFlag) { - for (int j = 0; j < 2; j++) - { - WRITE_CODE(m_scaling_points_yij, 8, "scaling_points_yij"); - } + WRITE_CODE(sps.vuiParameters.colourPrimaries, 8, "colour_primaries"); + WRITE_CODE(sps.vuiParameters.transferCharacteristics, 8, "transfer_characteristics"); + WRITE_CODE(sps.vuiParameters.matrixCoefficients, 8, "matrix_coefficients"); + WRITE_FLAG(sps.vuiParameters.videoFullRangeFlag, "video_full_range_flag"); } } - WRITE_FLAG(m_num_cb_points == 0 && m_num_cr_points == 0, "luma_only_flag"); - WRITE_FLAG(0, "chroma_scaling_from_luma"); - WRITE_CODE(m_num_cb_points, 4, "num_cb_points"); - if (m_num_cb_points) - { - for (int i = 0; i < m_num_cb_points; i++) - { - for (int j = 0; j < 2; j++) - { - WRITE_CODE(m_scaling_points_cbij, 8, "scaling_points_cbij"); - } + WRITE_FLAG(predict_scaling_flag, "predict_scaling_flag"); + if (predict_scaling_flag) { + WRITE_FLAG(predict_y_scaling_flag, "predict_y_scaling_flag"); + } + + WRITE_CODE(m_num_y_points, 4, "num_y_points"); + if (m_num_y_points) { + WRITE_CODE(point_y_value_increment_bits - 1, 3, "point_y_value_increment_bits_minus1"); + int bitsIncr = point_y_value_increment_bits; + WRITE_CODE(point_y_scaling_bits - 5, 2, 
"point_y_scaling_bits_minus5"); + int bitsScal = point_y_scaling_bits; + for (int i = 0; i < m_num_y_points; i++) { + if (i) + WRITE_CODE(m_scaling_points_yi0 - m_scaling_points_yi - 10, bitsIncr, "point_y_value_incrementi"); + else + WRITE_CODE(m_scaling_points_yi0, bitsIncr, "point_y_value_incrementi"); + WRITE_CODE(m_scaling_points_yi1, bitsScal, "point_y_scalingi"); } } - WRITE_CODE(m_num_cr_points, 4, "num_cr_points"); - if (m_num_cr_points) - { - for (int i = 0; i < m_num_cr_points; i++) - { - for (int j = 0; j < 2; j++) - { - WRITE_CODE(m_scaling_points_crij, 8, "scaling_points_crij"); + + if (!luma_only_flag) + WRITE_FLAG(m_chroma_scaling_from_luma, "chroma_scaling_from_luma"); + + if (luma_only_flag || m_chroma_scaling_from_luma) { + m_num_cb_points = 0; + m_num_cr_points = 0; + } + else { + if (predict_scaling_flag) + WRITE_FLAG(predict_cb_scaling_flag, "predict_cb_scaling_flag"); + + WRITE_CODE(m_num_cb_points, 4, "num_cb_points"); + if (m_num_cb_points) { + WRITE_CODE(point_cb_value_increment_bits - 1, 3, "point_cb_value_increment_bits_minus1"); + int bitsIncr = point_cb_value_increment_bits; + WRITE_CODE(point_cb_scaling_bits - 5, 2, "point_cb_scaling_bits_minus5"); + int bitsScal = point_cb_scaling_bits; + WRITE_CODE(cb_scaling_offset, 8, "cb_scaling_offset"); + + for (int i = 0; i < m_num_cb_points; i++) { + if (i) + WRITE_CODE(m_scaling_points_cbi0 - m_scaling_points_cbi - 10, bitsIncr, "point_cb_value_incrementi"); + else + WRITE_CODE(m_scaling_points_cbi0, bitsIncr, "point_cb_value_incrementi"); + WRITE_CODE(m_scaling_points_cbi1, bitsScal, "point_cb_scalingi"); + } + } + + if (predict_scaling_flag) + WRITE_FLAG(predict_cr_scaling_flag, "predict_cr_scaling_flag"); + + WRITE_CODE(m_num_cr_points, 4, "num_cr_points"); + if (m_num_cr_points) { + WRITE_CODE(point_cr_value_increment_bits - 1, 3, "point_cr_value_increment_bits_minus1"); + int bitsIncr = point_cr_value_increment_bits; + WRITE_CODE(point_cr_scaling_bits - 5, 2, "point_cr_scaling_bits_minus5"); + int bitsScal = point_cr_scaling_bits; + WRITE_CODE(cr_scaling_offset, 8, "cr_scaling_offset"); + + for (int i = 0; i < m_num_cr_points; i++) { + if (i) + WRITE_CODE(m_scaling_points_cri0 - m_scaling_points_cri - 10, bitsIncr, "point_cr_value_incrementi"); + else + WRITE_CODE(m_scaling_points_cri0, bitsIncr, "point_cr_value_incrementi"); + WRITE_CODE(m_scaling_points_cri1, bitsScal, "point_cr_scalingi"); } } } - WRITE_CODE(m_scaling_shift - 8, 2, "scaling_shift"); + + WRITE_CODE(m_scaling_shift - 8, 2, "grain_scaling_minus8"); WRITE_CODE(m_ar_coeff_lag, 2, "ar_coeff_lag"); - if (m_num_y_points) + int bits_per_ar_coef = 8; + if (m_num_y_points || predict_y_scaling_flag) { + WRITE_CODE(bits_per_ar_coef - 5, 2, "bits_per_ar_coeff_y_minus5"); + for (int i = 0; i < 24; i++) { - WRITE_CODE(m_ar_coeffs_yi + 128, 8, "ar_coeff_yi"); + WRITE_CODE(m_ar_coeffs_yi + (1 << (bits_per_ar_coef - 1)), bits_per_ar_coef, "ar_coeff_yi"); } } - if (m_num_cb_points || m_chroma_scaling_from_luma) + if (m_num_cb_points || m_chroma_scaling_from_luma || predict_cb_scaling_flag) { + WRITE_CODE(bits_per_ar_coef - 5, 2, "bits_per_ar_coeff_cb_minus5"); + for (int i = 0; i < 25; i++) { - WRITE_CODE(m_ar_coeffs_cbi + 128, 8, "ar_coeff_cbi"); + WRITE_CODE(m_ar_coeffs_cbi + (1 << (bits_per_ar_coef - 1)), bits_per_ar_coef, "ar_coeff_cbi"); } } - if (m_num_cr_points || m_chroma_scaling_from_luma) + if (m_num_cr_points || m_chroma_scaling_from_luma || predict_cr_scaling_flag) { + WRITE_CODE(bits_per_ar_coef - 5, 2, "bits_per_ar_coeff_cr_minus5"); + for (int i 
= 0; i < 25; i++) { - WRITE_CODE(m_ar_coeffs_cri + 128, 8, "ar_coeff_cri"); + WRITE_CODE(m_ar_coeffs_cri + (1 << (bits_per_ar_coef - 1)), bits_per_ar_coef, "ar_coeff_cri"); } } WRITE_CODE(m_ar_coeff_shift - 6, 2, "ar_coeff_shift");
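The hunk above signals each film-grain scaling-point list as one absolute value followed by increments, so a monotonically increasing list costs only bitsIncr bits per point. A minimal standalone sketch of that delta scheme (writeCode is a hypothetical stand-in for x265's WRITE_CODE macro and only counts bits; real SEI writing would append the bits to the payload):

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    static uint64_t g_bits = 0;

    // Hypothetical stand-in for WRITE_CODE(value, bits, name).
    static void writeCode(uint32_t value, int bits, const char* name)
    {
        (void)value; (void)name;
        g_bits += bits;
    }

    // Delta-code (value, scaling) points the way the writer above codes
    // point_y_value_increment[i] / point_y_scaling[i].
    static void writeScalingPoints(const std::vector<std::pair<uint32_t, uint32_t> >& pts,
                                   int bitsIncr, int bitsScal)
    {
        for (size_t i = 0; i < pts.size(); i++)
        {
            uint32_t incr = i ? pts[i].first - pts[i - 1].first : pts[i].first;
            writeCode(incr, bitsIncr, "point_value_increment[i]");
            writeCode(pts[i].second, bitsScal, "point_scaling[i]");
        }
    }

    int main()
    {
        writeScalingPoints({ { 0, 20 }, { 64, 40 }, { 128, 60 } }, 8, 8);
        printf("bits consumed: %llu\n", (unsigned long long)g_bits);
        return 0;
    }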
View file
x265-4.1.tar/source/encoder/slicetype.cpp -> x265-4.2.tar/source/encoder/slicetype.cpp
Changed
@@ -937,7 +937,7 @@
         if (mindenom > 0 && !(minscale & 1))
         {
             unsigned long idx;
-            CTZ(idx, minscale);
+            BSF(idx, minscale);
             int shift = X265_MIN((int)idx, mindenom);
             mindenom -= shift;
             minscale >>= shift;
         }
     }
@@ -1284,7 +1284,10 @@
         slicetypeDecide();

         m_inputLock.acquire();
-        if (m_outputSignalRequired)
+        m_sliceTypeBusy = false;
+        m_helpWanted = true;
+
+        if (m_outputSignalRequired && m_outputQueue.size())
         {
             m_outputSignal.trigger();
             m_outputSignalRequired = false;
@@ -1473,7 +1476,6 @@
     uint64_t picTotVariance = 0;
     uint32_t variance;
-    uint64_t blockXY = 0;
     pixel* src = curFrame->m_fencPic->m_picOrg[0];

     for (int blockY = 0; blockY < maxRow; blockY += 8)
@@ -1489,7 +1491,6 @@
                 blockOffsetLuma, 0);
             rowVariance += variance;
-            blockXY++;
         }
         picTotVariance += (uint16_t)(rowVariance / maxCol);
     }
@@ -1520,7 +1521,6 @@
                 blockOffsetChroma, 1);
             rowVariance += variance;
-            blockXY++;
         }
         picTotVariance += (uint16_t)(rowVariance / maxColChroma);
     }
@@ -1544,7 +1544,6 @@
                 blockOffsetChroma, 2);
             rowVariance += variance;
-            blockXY++;
         }
         picTotVariance += (uint16_t)(rowVariance / maxColChroma);
     }
@@ -1566,13 +1565,18 @@
 {
     *sum = 0;
+    uint8_t shift = X265_DEPTH - 8;
     for (uint32_t verticalIdx = 0; verticalIdx < inputHeight; verticalIdx += dsFactor)
     {
         for (uint32_t horizontalIdx = 0; horizontalIdx < inputWidth; horizontalIdx += dsFactor)
         {
-            ++(histogram[inputSrc[horizontalIdx]]);
-            *sum += inputSrc[horizontalIdx];
+            pixel val = inputSrc[horizontalIdx] >> shift;
+#if HIGH_BIT_DEPTH
+            X265_CHECK(val < HISTOGRAM_NUMBER_OF_BINS, "Pixel value out of allocated histogram range. This will lead to memory corruption.\n");
+#endif
+            ++(histogram[val]);
+            *sum += val;
         }
         inputSrc += (stride << (dsFactor >> 1));
     }
@@ -2759,27 +2763,33 @@
             break;
     }
-    if (!framecnt)
+    if (!framecnt && m_param->analysisLoadReuseLevel != 1)
     {
         if (m_param->rc.cuTree)
             cuTree(frames, 0, bKeyframe);
         return;
     }
-    frames[framecnt + 1] = NULL;
-    if (m_param->bResetZoneConfig)
+    if (framecnt)
     {
-        for (int i = 0; i < m_param->rc.zonefileCount; i++)
+
+        frames[framecnt + 1] = NULL;
+
+        if (m_param->bResetZoneConfig)
         {
-            int curZoneStart = m_param->rc.zones[i].startFrame, nextZoneStart = 0;
-            curZoneStart += curZoneStart ? m_param->rc.zones[i].zoneParam->radl : 0;
-            nextZoneStart += (i + 1 < m_param->rc.zonefileCount) ? m_param->rc.zones[i + 1].startFrame + m_param->rc.zones[i + 1].zoneParam->radl : m_param->totalFrames;
-            if (curZoneStart <= frames[0]->frameNum && nextZoneStart > frames[0]->frameNum)
-                m_param->keyframeMax = nextZoneStart - curZoneStart;
-            if (m_param->rc.zones[m_param->rc.zonefileCount - 1].startFrame <= frames[0]->frameNum && nextZoneStart == 0)
-                m_param->keyframeMax = m_param->rc.zones[0].keyframeMax;
+            for (int i = 0; i < m_param->rc.zonefileCount; i++)
+            {
+                int curZoneStart = m_param->rc.zones[i].startFrame, nextZoneStart = 0;
+                curZoneStart += curZoneStart ? m_param->rc.zones[i].zoneParam->radl : 0;
+                nextZoneStart += (i + 1 < m_param->rc.zonefileCount) ? m_param->rc.zones[i + 1].startFrame + m_param->rc.zones[i + 1].zoneParam->radl : m_param->totalFrames;
+                if (curZoneStart <= frames[0]->frameNum && nextZoneStart > frames[0]->frameNum)
+                    m_param->keyframeMax = nextZoneStart - curZoneStart;
+                if (m_param->rc.zones[m_param->rc.zonefileCount - 1].startFrame <= frames[0]->frameNum && nextZoneStart == 0)
+                    m_param->keyframeMax = m_param->rc.zones[0].keyframeMax;
+            }
         }
     }
+
     int keylimit = m_param->keyframeMax;
     if (frames[0]->frameNum < m_param->chunkEnd)
     {
@@ -2798,266 +2808,281 @@
         keyintLimit = keyFrameLimit;

     origNumFrames = numFrames = m_param->bIntraRefresh ? framecnt : X265_MIN(framecnt, keyintLimit);
-    if (bIsVbvLookahead)
-        numFrames = framecnt;
-    else if (m_param->bOpenGOP && numFrames < framecnt)
-        numFrames++;
-    else if (numFrames == 0)
-    {
-        frames[1]->sliceType = X265_TYPE_I;
-        return;
-    }
-    if (m_bBatchMotionSearch)
+    if (framecnt)
     {
-        /* pre-calculate all motion searches, using many worker threads */
-        CostEstimateGroup estGroup(*this, frames);
-        for (int b = 2; b < numFrames; b++)
+        if (bIsVbvLookahead)
+            numFrames = framecnt;
+        else if (m_param->bOpenGOP && numFrames < framecnt)
+            numFrames++;
+        else if (numFrames == 0)
         {
-            for (int i = 1; i <= m_param->bframes + 1; i++)
-            {
-                int p0 = b - i;
-                if (p0 < 0)
-                    continue;
-
-                /* Skip search if already done */
-                if (frames[b]->lowresMvs[0][i][0].x != 0x7FFF)
-                    continue;
-
-                /* perform search to p1 at same distance, if possible */
-                int p1 = b + i;
-                if (p1 >= numFrames || frames[b]->lowresMvs[1][i][0].x != 0x7FFF)
-                    p1 = b;
-
-                estGroup.add(p0, p1, b);
-            }
+            frames[1]->sliceType = X265_TYPE_I;
+            return;
         }
-        /* auto-disable after the first batch if pool is small */
-        m_bBatchMotionSearch &= m_pool->m_numWorkers >= 4;
-        estGroup.finishBatch();

-        if (m_bBatchFrameCosts)
+        if (m_bBatchMotionSearch)
         {
-            /* pre-calculate all frame cost estimates, using many worker threads */
+            /* pre-calculate all motion searches, using many worker threads */
+            CostEstimateGroup estGroup(*this, frames);
             for (int b = 2; b < numFrames; b++)
             {
                 for (int i = 1; i <= m_param->bframes + 1; i++)
                 {
-                    if (b < i)
+                    int p0 = b - i;
+                    if (p0 < 0)
                         continue;

-                    /* only measure frame cost in this pass if motion searches
-                     * are already done */
-                    if (frames[b]->lowresMvs[0][i][0].x == 0x7FFF)
+                    /* Skip search if already done */
+                    if (frames[b]->lowresMvs[0][i][0].x != 0x7FFF)
                         continue;

-                    int p0 = b - i;
+                    /* perform search to p1 at same distance, if possible */
+                    int p1 = b + i;
+                    if (p1 >= numFrames || frames[b]->lowresMvs[1][i][0].x != 0x7FFF)
+                        p1 = b;

-                    for (int j = 0; j <= m_param->bframes; j++)
-                    {
-                        int p1 = b + j;
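The computeHistogram change above is the high-bit-depth hist-scenecut fix: samples are downshifted by X265_DEPTH - 8 before they index the histogram, so the bin index can no longer run past the allocated table. A minimal sketch of the same guard, assuming a 256-entry table (256 as the value of HISTOGRAM_NUMBER_OF_BINS is an assumption here):

    #include <cassert>
    #include <cstdint>

    static const int kBins = 256; // assumed histogram size

    // Shifting by (depth - 8) maps any 10/12/16-bit sample into [0, 255],
    // which is what prevents the out-of-bounds increment fixed above.
    void accumulate(const uint16_t* src, int n, int depth,
                    uint32_t hist[], uint64_t* sum)
    {
        const int shift = depth - 8;
        for (int i = 0; i < n; i++)
        {
            uint16_t val = (uint16_t)(src[i] >> shift);
            assert(val < kBins && "bin index out of range");
            hist[val]++;
            *sum += val;
        }
    }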
View file
x265-4.2.tar/source/encoder/threadedme.cpp
Added
@@ -0,0 +1,260 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2025 MulticoreWare, Inc
+ *
+ * Authors: Shashank Pathipati <shashank.pathipati@multicorewareinc.com>
+ *          Somu Vineela <somu@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "threadedme.h"
+#include "encoder.h"
+#include "frameencoder.h"
+
+#include <iostream>
+#include <sstream>
+
+namespace X265_NS {
+int g_puStartIdx[128][8] = {0};
+
+bool ThreadedME::create()
+{
+    m_active = true;
+    m_tldCount = m_pool->m_numWorkers;
+    m_tld = new ThreadLocalData[m_tldCount];
+    for (int i = 0; i < m_tldCount; i++)
+    {
+        m_tld[i].analysis.initSearch(*m_param, m_enc.m_scalingList);
+        m_tld[i].analysis.create(m_tld);
+    }
+
+    initPuStartIdx();
+
+    /* start sequence at zero */
+    m_enqueueSeq = 0ULL;
+
+    return true;
+}
+
+void ThreadedME::initPuStartIdx()
+{
+    int startIdx = 0;
+    uint32_t ctuSize = m_param->maxCUSize;
+
+    for (uint32_t puIdx = 0; puIdx < MAX_NUM_PU_SIZES; ++puIdx)
+    {
+        const PUBlock& pu = g_puLookup[puIdx];
+
+        if (pu.width > ctuSize || pu.height > ctuSize)
+            continue;
+
+        int indexWidth = pu.isAmp ? X265_MAX(pu.width, pu.height) : pu.width;
+        int indexHeight = pu.isAmp ? indexWidth : pu.height;
+
+        int numPUs = (ctuSize / indexWidth) * (ctuSize / indexHeight);
+        int partIdx = static_cast<int>(pu.partsize);
+
+        g_puStartIdx[pu.width + pu.height][partIdx] = startIdx;
+
+        startIdx += pu.isAmp ? 2 * numPUs : numPUs;
+    }
+}
+
+void ThreadedME::enqueueCTUBlock(int row, int col, int width, int height, int layer, FrameEncoder* frameEnc)
+{
+    frameEnc->m_tmeTasksLock.acquire();
+
+    Frame* frame = frameEnc->m_frame[layer];
+
+    CTUTask task;
+    task.seq = ATOMIC_ADD(&m_enqueueSeq, 1ULL);
+    task.row = row;
+    task.col = col;
+    task.width = width;
+    task.height = height;
+    task.layer = layer;
+
+    task.frame = frame;
+    task.frameEnc = frameEnc;
+
+    frameEnc->m_tmeTasks.push(task);
+    frameEnc->m_tmeTasksLock.release();
+
+    m_taskEvent.trigger();
+}
+
+void ThreadedME::enqueueReadyRows(int row, int layer, FrameEncoder* frameEnc)
+{
+    int bufRow = X265_MIN(row + m_param->tmeNumBufferRows, static_cast<int>(frameEnc->m_numRows));
+
+    for (int r = 0; r < bufRow; r++)
+    {
+        if (frameEnc->m_tmeDeps[r].isQueued)
+            continue;
+
+        bool isInitialRow = r < m_param->tmeNumBufferRows;
+        bool isExternalDepResolved = frameEnc->m_tmeDeps[r].external;
+
+        int prevRow = X265_MAX(0, r - m_param->tmeNumBufferRows);
+        bool isInternalDepResolved = frameEnc->m_tmeDeps[prevRow].internal;
+
+        if ((isInitialRow && isExternalDepResolved) ||
+            (!isInitialRow && isExternalDepResolved && isInternalDepResolved))
+        {
+            int cols = static_cast<int>(frameEnc->m_numCols);
+            for (int c = 0; c < cols; c += m_param->tmeTaskBlockSize)
+            {
+                int blockWidth = X265_MIN(m_param->tmeTaskBlockSize, cols - c);
+                enqueueCTUBlock(r, c, blockWidth, 1, layer, frameEnc);
+            }
+            frameEnc->m_tmeDeps[r].isQueued = true;
+        }
+    }
+}
+
+void ThreadedME::threadMain()
+{
+    while (m_active)
+    {
+        int newCTUsPushed = 0;
+
+        for (int i = 0; i < m_param->frameNumThreads; i++)
+        {
+            FrameEncoder* frameEnc = m_enc.m_frameEncoder[i];
+            frameEnc->m_tmeTasksLock.acquire();
+
+            while (!frameEnc->m_tmeTasks.empty())
+            {
+                CTUTask task = frameEnc->m_tmeTasks.front();
+                frameEnc->m_tmeTasks.pop();
+
+                m_taskQueueLock.acquire();
+                m_taskQueue.push(task);
+                m_taskQueueLock.release();
+
+                newCTUsPushed++;
+                tryWakeOne();
+            }
+
+            frameEnc->m_tmeTasksLock.release();
+        }
+
+        if (newCTUsPushed == 0)
+            m_taskEvent.wait();
+    }
+}
+
+void ThreadedME::findJob(int workerThreadId)
+{
+    m_taskQueueLock.acquire();
+    if (m_taskQueue.empty())
+    {
+        m_helpWanted = false;
+        m_taskQueueLock.release();
+        return;
+    }
+
+    m_helpWanted = true;
+    int64_t stime = x265_mdate();
+
+#ifdef DETAILED_CU_STATS
+    ScopedElapsedTime tmeTime(m_tld[workerThreadId].analysis.m_stats[m_jpId].tmeTime);
+    m_tld[workerThreadId].analysis.m_stats[m_jpId].countTmeTasks++;
+#endif
+
+    CTUTask task = m_taskQueue.top();
+    m_taskQueue.pop();
+    m_taskQueueLock.release();
+
+    int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
+    Frame* frame = task.frame;
+
+    for (int i = 0; i < task.height; i++)
+    {
+        for (int j = 0; j < task.width; j++)
+        {
+
+            int ctuAddr = (task.row + i) * numCols + (task.col + j);
+            CUData* ctu = frame->m_encData->getPicCTU(ctuAddr);
+            ctu->m_slice = frame->m_encData->m_slice;
+
+            task.ctu = ctu;
+            task.geom = &task.frameEnc->m_cuGeoms[task.frameEnc->m_ctuGeomMap[ctuAddr]];
+
+            frame->m_encData->m_cuStat[ctuAddr].baseQp = frame->m_encData->m_avgQpRc;
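enqueueCTUBlock above stamps every CTUTask with ATOMIC_ADD(&m_enqueueSeq, 1ULL), giving each task a unique monotonic sequence number even when several FrameEncoders enqueue concurrently. A sketch of the same pattern using standard C++ atomics in place of x265's ATOMIC_ADD macro (Task and makeTask are illustrative names, not x265 API):

    #include <atomic>
    #include <cstdint>

    struct Task
    {
        uint64_t seq; // stamped like CTUTask::seq above
        int row, col;
    };

    static std::atomic<uint64_t> g_enqueueSeq{0};

    // fetch_add hands every caller a distinct, increasing value, so a
    // consumer can later replay tasks in their original enqueue order.
    Task makeTask(int row, int col)
    {
        Task t;
        t.seq = g_enqueueSeq.fetch_add(1, std::memory_order_relaxed);
        t.row = row;
        t.col = col;
        return t;
    }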
View file
x265-4.2.tar/source/encoder/threadedme.h
Added
@@ -0,0 +1,249 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2025 MulticoreWare, Inc
+ *
+ * Authors: Shashank Pathipati <shashank.pathipati@multicorewareinc.com>
+ *          Somu Vineela <somu@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef THREADED_ME_H
+#define THREADED_ME_H
+
+#include "common.h"
+#include "threading.h"
+#include "threadpool.h"
+#include "cudata.h"
+#include "lowres.h"
+#include "frame.h"
+#include "analysis.h"
+#include "mv.h"
+
+#include <queue>
+#include <vector>
+#include <fstream>
+
+namespace X265_NS {
+
+extern int g_puStartIdx[128][8];
+
+class Encoder;
+class Analysis;
+class FrameEncoder;
+
+struct PUBlock {
+    uint32_t width;
+    uint32_t height;
+    PartSize partsize;
+    bool isAmp;
+};
+
+const PUBlock g_puLookup[MAX_NUM_PU_SIZES] = {
+    { 8, 4, SIZE_2NxN, 0 },
+    { 4, 8, SIZE_Nx2N, 0 },
+    { 8, 8, SIZE_2Nx2N, 0 },
+    { 16, 4, SIZE_2NxnU, 1 },
+    { 16, 12, SIZE_2NxnD, 1 },
+    { 4, 16, SIZE_nLx2N, 1 },
+    { 12, 16, SIZE_nRx2N, 1 },
+    { 16, 8, SIZE_2NxN, 0 },
+    { 8, 16, SIZE_Nx2N, 0 },
+    { 16, 16, SIZE_2Nx2N, 0 },
+    { 32, 8, SIZE_2NxnU, 1 },
+    { 32, 24, SIZE_2NxnD, 1 },
+    { 8, 32, SIZE_nLx2N, 1 },
+    { 24, 32, SIZE_nRx2N, 1 },
+    { 32, 16, SIZE_2NxN, 0 },
+    { 16, 32, SIZE_Nx2N, 0 },
+    { 32, 32, SIZE_2Nx2N, 0 },
+    { 64, 16, SIZE_2NxnU, 1 },
+    { 64, 48, SIZE_2NxnD, 1 },
+    { 16, 64, SIZE_nLx2N, 1 },
+    { 48, 64, SIZE_nRx2N, 1 },
+    { 64, 32, SIZE_2NxN, 0 },
+    { 32, 64, SIZE_Nx2N, 0 },
+    { 64, 64, SIZE_2Nx2N, 0 }
+};
+
+struct CTUTaskData
+{
+    CUData& ctuData;
+    CUGeom& ctuGeom;
+    Frame& frame;
+};
+
+struct CTUBlockTask
+{
+    int row;
+    int col;
+    int width;
+    int height;
+    Frame* frame;
+    class FrameEncoder* frameEnc;
+    unsigned long long seq; /* monotonic sequence to preserve enqueue order */
+};
+
+struct PUData
+{
+    PartSize part;
+    const CUGeom* cuGeom;
+    int puOffset;
+    int areaId;
+    int finalIdx;
+    int qp;
+};
+
+struct MEData
+{
+    MV mv[2];
+    MV mvp[2];
+    uint32_t mvCost[2];
+    int ref[2];
+    int bits;
+    uint32_t cost;
+};
+
+struct CTUTask
+{
+    uint64_t seq;
+    int row;
+    int col;
+    int width;
+    int height;
+    int layer;
+
+    CUData* ctu;
+    CUGeom* geom;
+    Frame* frame;
+    FrameEncoder* frameEnc;
+};
+
+
+struct CompareCTUTask {
+    bool operator()(const CTUTask& a, const CTUTask& b) const {
+        if (a.frame->m_poc == b.frame->m_poc)
+        {
+            int a_pos = a.row + a.col;
+            int b_pos = b.row + b.col;
+            if (a_pos != b_pos) return a_pos > b_pos;
+        }
+
+        /* Compare by sequence number to preserve FIFO enqueue order.
+         * priority_queue in C++ is a max-heap, so return true when a.seq > b.seq
+         * to make smaller seq (earlier enqueue) the top() element. */
+        return a.seq > b.seq;
+    }
+};
+
+/**
+ * @brief Threaded motion-estimation module that schedules CTU blocks across worker threads.
+ *
+ * Owns per-worker analysis state (ThreadLocalData), manages the CTU task queues,
+ * and exposes a JobProvider interface for the thread pool to execute MVP
+ * derivation and ME searches in parallel.
+ */
+class ThreadedME: public JobProvider, public Thread
+{
+public:
+    x265_param* m_param;
+    Encoder& m_enc;
+
+    std::priority_queue<CTUTask, std::vector<CTUTask>, CompareCTUTask> m_taskQueue;
+    Lock m_taskQueueLock;
+    Event m_taskEvent;
+
+    volatile bool m_active;
+    unsigned long long m_enqueueSeq;
+
+    ThreadLocalData* m_tld;
+    int m_tldCount;
+
+#ifdef DETAILED_CU_STATS
+    CUStats m_cuStats;
+#endif
+
+    /**
+     * @brief Construct the ThreadedME manager; call create() before use.
+     */
+    ThreadedME(x265_param* param, Encoder& enc): m_param(param), m_enc(enc) {};
+
+    /**
+     * @brief Creates threadpool, thread local data and registers itself as a job provider
+     */
+    bool create();
+
+    /**
+     * @brief Initialize lookup table used to index PU offsets for all valid CTU sizes.
+     */
+    void initPuStartIdx();
+
+    /**
+     * @brief Enqueue a block of CTUs for motion estimation.
+     *
+     * Blocks are queued per FrameEncoder and later moved into the global
+     * priority queue consumed by worker threads.
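The CompareCTUTask comment above captures a detail that is easy to get backwards: std::priority_queue is a max-heap, so a comparator returning a.seq > b.seq makes the smallest sequence number the top() element, i.e. FIFO order. A toy demonstration with a stripped-down element type:

    #include <cstdio>
    #include <queue>
    #include <vector>

    struct Item { unsigned long long seq; };

    // "Greater" comparator inverts the max-heap: smallest seq pops first.
    struct BySeq
    {
        bool operator()(const Item& a, const Item& b) const { return a.seq > b.seq; }
    };

    int main()
    {
        std::priority_queue<Item, std::vector<Item>, BySeq> q;
        q.push({ 2 });
        q.push({ 0 });
        q.push({ 1 });
        while (!q.empty())
        {
            printf("%llu ", q.top().seq); // prints: 0 1 2
            q.pop();
        }
        printf("\n");
        return 0;
    }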
View file
x265-4.1.tar/source/encoder/weightPrediction.cpp -> x265-4.2.tar/source/encoder/weightPrediction.cpp
Changed
@@ -459,14 +459,18 @@
         if (mindenom > 0 && !(minscale & 1))
         {
             unsigned long idx;
-            CTZ(idx, minscale);
+            BSF(idx, minscale);
             int shift = X265_MIN((int)idx, mindenom);
             mindenom -= shift;
             minscale >>= shift;
         }
     }

-    if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
+    int predTemp = (128 - ((128 * minscale) >> (mindenom)));
+    int deltaChromaTemp = minoff - predTemp;
+
+    if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f ||
+        (plane && (deltaChromaTemp < -512 || deltaChromaTemp > 511)))
     {
         SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
     }
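This is the weighted-prediction delta_chroma_offset fix: the slice header can only carry a bounded offset delta, so the encoder now predicts the offset as 128 - ((128 * minscale) >> mindenom) and falls back to the default weight whenever minoff minus that prediction leaves [-512, 511]. A small sketch of the representability test (the function name is illustrative):

    #include <cstdio>

    // Mirrors the check added above for chroma planes.
    bool chromaOffsetRepresentable(int minoff, int minscale, int mindenom)
    {
        int pred = 128 - ((128 * minscale) >> mindenom);
        int delta = minoff - pred;
        return delta >= -512 && delta <= 511;
    }

    int main()
    {
        printf("%d\n", chromaOffsetRepresentable(600, 64, 6)); // 0: weight must be dropped
        printf("%d\n", chromaOffsetRepresentable(10, 64, 6));  // 1: weight can be signalled
        return 0;
    }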
View file
x265-4.1.tar/source/input/input.h -> x265-4.2.tar/source/input/input.h
Changed
@@ -25,9 +25,9 @@
 #define X265_INPUT_H

 #define MIN_FRAME_WIDTH 64
-#define MAX_FRAME_WIDTH 8192
+#define MAX_FRAME_WIDTH 16384
 #define MIN_FRAME_HEIGHT 64
-#define MAX_FRAME_HEIGHT 4320
+#define MAX_FRAME_HEIGHT 8704
 #define MIN_FRAME_RATE 1
 #define MAX_FRAME_RATE 300
View file
x265-4.1.tar/source/input/y4m.cpp -> x265-4.2.tar/source/input/y4m.cpp
Changed
@@ -30,6 +30,7 @@
 #define ENABLE_THREADING 1

 #if _WIN32
+#define strncasecmp _strnicmp
 #include <io.h>
 #include <fcntl.h>
 #if defined(_MSC_VER)
@@ -109,7 +110,11 @@
     info.frameCount = -1;
     size_t estFrameSize = framesize + sizeof(header) + 1; /* assume basic FRAME\n headers */
     /* try to estimate frame count, if this is not stdin */
+#if _WIN32
+    if (ifs != stdin && strncasecmp(info.filename, "\\\\.\\pipe\\", 9))
+#else
     if (ifs != stdin)
+#endif
     {
         int64_t cur = ftello(ifs);
         if (cur >= 0)
@@ -123,7 +128,11 @@
     }
     if (info.skipFrames)
     {
+#if _WIN32
+        if (ifs != stdin && strncasecmp(info.filename, "\\\\.\\pipe\\", 9))
+#else
         if (ifs != stdin)
+#endif
             fseeko(ifs, (int64_t)estFrameSize * info.skipFrames, SEEK_CUR);
         else
             for (int i = 0; i < info.skipFrames; i++)
View file
x265-4.1.tar/source/input/yuv.cpp -> x265-4.2.tar/source/input/yuv.cpp
Changed
@@ -30,6 +30,7 @@
 #define ENABLE_THREADING 1

 #if _WIN32
+#define strncasecmp _strnicmp
 #include <io.h>
 #include <fcntl.h>
 #if defined(_MSC_VER)
@@ -53,6 +54,11 @@
     threadActive = false;
     ifs = NULL;

+    if (colorSpace < 0 || colorSpace >= X265_CSP_MAX)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "Invalid color space: %d\n", colorSpace);
+        return;
+    }
     uint32_t pixelbytes = depth > 8 ? 2 : 1;
     framesize = 0;
     for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
@@ -99,7 +105,11 @@
     info.frameCount = -1;

     /* try to estimate frame count, if this is not stdin */
+#if _WIN32
+    if (ifs != stdin && strncasecmp(info.filename, "\\\\.\\pipe\\", 9))
+#else
     if (ifs != stdin)
+#endif
     {
         int64_t cur = ftello(ifs);
         if (cur >= 0)
@@ -113,7 +123,11 @@
     }
     if (info.skipFrames)
     {
+#if _WIN32
+        if (ifs != stdin && strncasecmp(info.filename, "\\\\.\\pipe\\", 9))
+#else
         if (ifs != stdin)
+#endif
             fseeko(ifs, (int64_t)framesize * info.skipFrames, SEEK_CUR);
         else
             for (int i = 0; i < info.skipFrames; i++)
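Both the y4m.cpp and yuv.cpp hunks above gate the ftello/fseeko frame-count estimation on the same test: a Windows named pipe is not seekable, and it is recognized purely by its \\.\pipe\ path prefix. A sketch of that detection, with strncasecmp mapped to _strnicmp on Windows exactly as in the hunks:

    #if _WIN32
    #include <string.h>
    #define strncasecmp _strnicmp
    #else
    #include <strings.h>
    #endif

    // A named pipe must not be seeked; skip frame-count estimation for it.
    bool isWindowsNamedPipe(const char* filename)
    {
        // The escaped literal below is the 9-character prefix \\.\pipe\ .
        return strncasecmp(filename, "\\\\.\\pipe\\", 9) == 0;
    }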
View file
x265-4.1.tar/source/output/reconplay.cpp -> x265-4.2.tar/source/output/reconplay.cpp
Changed
@@ -73,7 +73,7 @@
     if (outputPipe)
     {
         const char* csp = (colorSpace >= X265_CSP_I444) ? "444" : (colorSpace >= X265_CSP_I422) ? "422" : "420";
-        const char* depth = (param.internalBitDepth == 10) ? "p10" : "";
+        const char* depth = (param.internalBitDepth == 10) ? "p10" : (param.internalBitDepth == 12) ? "p12" : "";

         fprintf(outputPipe, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s%s\n", width, height, param.fpsNum, param.fpsDenom, csp, depth);
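With this change a 12-bit reconstruction pipe advertises its real depth instead of silently claiming 8-bit. For a 1920x1080 stream at 25/1 fps in 4:2:0, the fprintf above would now emit:

    YUV4MPEG2 W1920 H1080 F25:1 Ip C420p12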
View file
x265-4.1.tar/source/output/yuv.cpp -> x265-4.2.tar/source/output/yuv.cpp
Changed
@@ -57,8 +57,7 @@
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");

-    if (inputDepth > 8)
-    {
+#if HIGH_BIT_DEPTH
     if (depth == 8)
     {
         int shift = pic.bitDepth - 8;
@@ -89,9 +88,7 @@
             }
         }
     }
-    }
-    else
-    {
+#else
     ofs.seekp((std::streamoff)fileOffset);
     for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
     {
@@ -102,7 +99,7 @@
             src += pic.stride[i] / sizeof(*src);
         }
     }
-    }
+#endif
     return true;
 }
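The HIGH_BIT_DEPTH branch above still downshifts when an 8-bit output file is requested from a high-bit-depth encode. A minimal sketch of that per-plane conversion, assuming 16-bit internal samples and a packed 8-bit destination (names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // Convert one reconstruction plane to 8 bits by discarding the extra
    // precision, as the depth == 8 path above does with shift = bitDepth - 8.
    void downshiftPlane(const uint16_t* src, uint8_t* dst,
                        int width, int height, ptrdiff_t srcStride, int bitDepth)
    {
        const int shift = bitDepth - 8;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint8_t)(src[x] >> shift);
            src += srcStride;
            dst += width;
        }
    }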
View file
x265-4.1.tar/source/x265.cpp -> x265-4.2.tar/source/x265.cpp
Changed
@@ -208,7 +208,7 @@
     {
         argv[argc] = strPool;
         strPool += strlen(token) + 1;
-        strPoolSize -= (int)strlen(token) + 1;
+        strPoolSize = strPoolSize - (int)strlen(token) + 1;
         strcpy(argv[argc], token);
         token = strtok(NULL, " ");
         argc++;
@@ -269,6 +269,7 @@

 int main(int argc, char **argv)
 {
+
 #if HAVE_VLD
     // This uses Microsoft's proprietary WCHAR type, but this only builds on Windows to start with
     VLDSetReportOptions(VLD_OPT_REPORT_TO_DEBUGGER | VLD_OPT_REPORT_TO_FILE, L"x265_leaks.txt");
View file
x265-4.1.tar/source/x265.h -> x265-4.2.tar/source/x265.h
Changed
@@ -276,6 +276,8 @@
     double decideWaitTime;
     double row0WaitTime;
     double wallTime;
+    int64_t tmeTime;
+    int64_t tmeWaitTime;
     double refWaitWallTime;
     double totalCTUTime;
     double stallTime;
@@ -555,9 +557,14 @@
 #define X265_CPU_NEON (1 << 1) /* ARM NEON */
 #define X265_CPU_FAST_NEON_MRC (1 << 2) /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 #define X265_CPU_SVE2 (1 << 3) /* AArch64 SVE2 */
-#define X265_CPU_SVE (1 << 4) /* AArch64 SVE2 */
+#define X265_CPU_SVE (1 << 4) /* AArch64 SVE */
 #define X265_CPU_NEON_DOTPROD (1 << 5) /* AArch64 Neon DotProd */
 #define X265_CPU_NEON_I8MM (1 << 6) /* AArch64 Neon I8MM */
+#define X265_CPU_SVE2_BITPERM (1 << 7) /* AArch64 SVE2 BitPerm */
+
+/* RISCV */
+#define X265_CPU_RVV (1 << 0) /* RISCV vector */
+#define X265_CPU_ZBB (1 << 1) /* RISCV zbb */

 /* IBM Power8 */
 #define X265_CPU_ALTIVEC 0x0000001
@@ -746,7 +753,7 @@
     "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", "smpte2084", "smpte428", "arib-std-b67", 0 };
 static const char * const x265_colmatrix_names[] = { "gbr", "bt709", "unknown", "", "fcc", "bt470bg", "smpte170m", "smpte240m",
-    "ycgco", "bt2020nc", "bt2020c", "smpte2085", "chroma-derived-nc", "chroma-derived-c", "ictcp", 0 };
+    "ycgco", "bt2020nc", "bt2020c", "smpte2085", "chroma-derived-nc", "chroma-derived-c", "ictcp", "ipt-pq-c2", 0 };
 static const char * const x265_sar_names[] = { "unknown", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11",
     "32:11", "80:33", "18:11", "15:11", "64:33", "160:99", "4:3", "3:2", "2:1", 0 };
 static const char * const x265_interlace_names[] = { "prog", "tff", "bff", 0 };
@@ -1167,6 +1174,15 @@
      * win, particularly in video sequences with low motion. Default disabled */
     int bDistributeMotionEstimation;

+    /* Use a dedicated threadpool to pre-process motion estimation. Evaluates all
+     * PU combinations for CTUs in parallel. Dependencies between CTUs in inter
+     * frames are broken to allow for more parallelism, and as a result this may
+     * cause a drop in compression efficiency. Recommended for many-core CPUs and
+     * for cases where a loss in compression efficiency is acceptable in exchange
+     * for faster encoding. Default disabled.
+     */
+    int bThreadedME;
+
     /*== Logging Features ==*/

     /* Enable analysis and logging distribution of CUs. Now deprecated */
@@ -2324,6 +2340,13 @@
     int searchRangeForLayer1;
     int searchRangeForLayer2;

+    /* Threaded ME */
+    /* Number of CTUs processed at once when a worker thread picks up a task from ThreadedME. */
+    int tmeTaskBlockSize;
+
+    /* Number of rows up to which ThreadedME processes tasks ahead of WPP */
+    int tmeNumBufferRows;
+
     /*SBRC*/
     int bEnableSBRC;
     int mcstfFrameRange;
@@ -2344,6 +2367,9 @@
     /*Frame level RateControl Configuration*/
     int bConfigRCFrame;
     int isAbrLadderEnable;
+
+    /*tune*/
+    const char* tune;
 } x265_param;

 /* x265_param_alloc:
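Library users reach the new flag through the normal string-option path rather than writing to x265_param fields directly. A minimal sketch, assuming a libx265 built from 4.2 so that "threaded-me" is a recognized option name (it matches the long option added in x265cli.h below):

    #include <cstdio>
    #include "x265.h"

    int main()
    {
        x265_param* param = x265_param_alloc();
        if (!param)
            return 1;
        x265_param_default_preset(param, "medium", NULL);

        // Equivalent to --threaded-me on the CLI; a NULL value enables
        // a boolean option in x265_param_parse().
        if (x265_param_parse(param, "threaded-me", NULL))
            fprintf(stderr, "threaded-me not recognized by this libx265\n");

        x265_param_free(param);
        return 0;
    }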
View file
x265-4.1.tar/source/x265_config.h.in -> x265-4.2.tar/source/x265_config.h.in
Changed
@@ -30,5 +30,8 @@
  * the shared library SONAME on platforms which support it. It also
  * prevents linking against a different version of the static lib */
 #define X265_BUILD ${X265_BUILD}
+#define ENABLE_ALPHA ${ENABLE_ALPHA}
+#define ENABLE_MULTIVIEW ${ENABLE_MULTIVIEW}
+#define ENABLE_SCC_EXT ${ENABLE_SCC_EXT}

 #endif
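Exporting these CMake switches through the generated x265_config.h lets client code detect at compile time whether the installed libx265 was built with alpha, multiview, or SCC support. A sketch of how a client might test them:

    #include <cstdio>
    #include "x265.h" // pulls in the generated x265_config.h

    int main()
    {
    #if ENABLE_ALPHA
        printf("libx265 built with alpha encoding support\n");
    #else
        printf("libx265 built without alpha encoding support\n");
    #endif
    #if ENABLE_MULTIVIEW
        printf("libx265 built with multiview support\n");
    #endif
        return 0;
    }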
View file
x265-4.1.tar/source/x265cli.cpp -> x265-4.2.tar/source/x265cli.cpp
Changed
@@ -103,8 +103,9 @@
     H0("-F/--frame-threads <integer>   Number of concurrently encoded frames. 0: auto-determined by core count\n");
     H0("   --[no-]wpp                  Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
     H0("   --[no-]slices <integer>     Enable Multiple Slices feature. Default %d\n", param->maxSlices);
-    H0("   --[no-]pmode                Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
-    H0("   --[no-]pme                  Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
+    H0("   --[no-]pmode                Parallel mode analysis. Deprecated from release 4.1. Default %s\n", OPT(param->bDistributeModeAnalysis));
+    H0("   --[no-]pme                  Parallel motion estimation. Deprecated from release 4.1. Default %s\n", OPT(param->bDistributeMotionEstimation));
+    H0("   --[no-]threaded-me          Enables standalone multi-threaded module for motion estimation at CTU level. Default %s\n", OPT(param->bThreadedME));
     H0("   --[no-]asm <bool|int|string> Override CPU detection. Default: auto\n");
     H0("\nPresets:\n");
     H0("-p/--preset <string>           Trade off performance for compression efficiency. Default medium\n");
@@ -327,7 +328,7 @@
     H0("   smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
     H0("   bt2020-10, bt2020-12, smpte2084, smpte428, arib-std-b67. Default unknown\n");
     H1("   --colormatrix <string>      Specify color matrix setting from unknown, bt709, fcc, bt470bg, smpte170m,\n");
-    H1("   smpte240m, gbr, ycgco, bt2020nc, bt2020c, smpte2085, chroma-derived-nc, chroma-derived-c, ictcp. Default unknown\n");
+    H1("   smpte240m, gbr, ycgco, bt2020nc, bt2020c, smpte2085, chroma-derived-nc, chroma-derived-c, ictcp, ipt-pq-c2. Default unknown\n");
     H1("   --chromaloc <integer>       Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
     H0("   --master-display <string>   SMPTE ST 2086 master display color volume info SEI (HDR)\n");
     H0("      format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
@@ -480,7 +481,21 @@
     if (output)
         output->release();
     output = NULL;
-}
+    if (param && api)
+    {
+        api->param_free(param);
+        param = NULL;
+    }
+    // Free dynamically allocated input filenames
+    for (int i = 0; i < MAX_VIEWS; i++)
+    {
+        if (inputfn[i])
+        {
+            X265_FREE(inputfn[i]);
+            inputfn[i] = NULL;
+        }
+    }
+}

 void CLIOptions::printStatus(uint32_t frameNum)
 {
@@ -617,12 +632,17 @@

 bool CLIOptions::parse(int argc, char **argv)
 {
+    if (argc <= 1)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "No input file. Run x265 --help for a list of options.\n");
+        return true;
+    }
+
     bool bError = false;
     int bShowHelp = false;
     int inputBitDepth = 8;
     int outputBitDepth = 0;
     int reconFileBitDepth = 0;
-    char* inputfn[MAX_VIEWS] = { NULL };
     for (int view = 0; view < MAX_VIEWS; view++)
     {
         inputfn[view] = X265_MALLOC(char, sizeof(char) * 1024);
@@ -636,12 +656,6 @@
     int svtEnabled = 0;
     argCnt = argc;

-    if (argc <= 1)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "No input file. Run x265 --help for a list of options.\n");
-        return true;
-    }
-
     /* Presets are applied before all other options. */
     for (optind = 0;;)
     {
View file
x265-4.1.tar/source/x265cli.h -> x265-4.2.tar/source/x265cli.h
Changed
@@ -398,6 +398,8 @@
     { "aom-film-grain", required_argument, NULL, 0 },
     { "frame-rc", no_argument, NULL, 0 },
     { "no-frame-rc", no_argument, NULL, 0 },
+    { "threaded-me", no_argument, NULL, 0 },
+    { "no-threaded-me", no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -434,6 +436,7 @@
     char** orgArgv;
     char** argString;
     char *stringPool;
+    char* inputfn[MAX_VIEWS];

     /* ABR ladder settings */
     bool isAbrLadderConfig;
@@ -454,6 +457,8 @@
             input[i] = NULL;
         for (int i = 0; i < MAX_LAYERS; i++)
             recon[i] = NULL;
+        for (int i = 0; i < MAX_VIEWS; i++)
+            inputfn[i] = NULL;
         output = NULL;
         qpfile = NULL;
         zoneFile = NULL;
View file
x265-4.1.tar/x265Version.txt -> x265-4.2.tar/x265Version.txt
Changed
@@ -1,4 +1,4 @@
 #Attribute: Values
-repositorychangeset: 1d117be
+repositorychangeset: 6318f22
 releasetagdistance: 1
-releasetag: 4.1
+releasetag: 4.2