[SCM] x265/master: Imported Upstream version 1.5
sramacher at users.alioth.debian.org
sramacher at users.alioth.debian.org
Wed Feb 18 01:09:16 UTC 2015
The following commit has been merged in the master branch:
commit 45477f81bc13b7333f082ead6f87b53ed23c0112
Author: Sebastian Ramacher <sramacher at debian.org>
Date: Wed Feb 18 01:01:26 2015 +0100
Imported Upstream version 1.5
diff --git a/.hg_archival.txt b/.hg_archival.txt
index 39aab44..7806951 100644
--- a/.hg_archival.txt
+++ b/.hg_archival.txt
@@ -1,4 +1,4 @@
repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 5e604833c5aa605d0b6efbe5234492b5e7d8ac61
+node: 9f0324125f53a12f766f6ed6f98f16e2f42337f4
branch: stable
-tag: 1.4
+tag: 1.5
diff --git a/.hgtags b/.hgtags
index 42d4ebd..78b9509 100644
--- a/.hgtags
+++ b/.hgtags
@@ -1,6 +1,3 @@
-681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD1
-3ec4837e6f6c7159f438e1f537dff117c93ee139 LASTKNOWNGOOD2
-9a6800e84295db446fdce2e7f27059ec8ae838a7 LASTKNOWNGOOD
99fab2ef92be051cd3b3b2d817064cead282b42c 0.1
b3471d9009f5cd487b23c8c61a6bfff8980e54f2 0.2
3767fbfa970ff4b2dc2e8647db0274168727147e 0.3
@@ -15,3 +12,4 @@ cea97c4d79456842e00ade6be6fd5ec34610e5f8 1.0
ae9609aeebdc3271114168ece003679e9b1dca1b 1.1
d6257335c5370ee54317a0426a12c1f0724b18b9 1.2
c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3
+5e604833c5aa605d0b6efbe5234492b5e7d8ac61 1.4
diff --git a/build/README.txt b/build/README.txt
index c087349..d131884 100644
--- a/build/README.txt
+++ b/build/README.txt
@@ -11,26 +11,27 @@ Note: MSVC12 requires cmake 2.8.11 or later
1. Yasm 1.2.0 or later, to compile assembly primitives (performance)
- For Windows, download
- http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win32.exe or
- http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win64.exe
- depending on your O/S and copy the EXE into C:\Windows or somewhere else
- in your %PATH% that a 32-bit app (cmake) can find it. If it is not in the
- path, you must manually tell cmake where to find it.
+ For Windows, download the latest yasm executable
+ http://yasm.tortall.net/Download.html and copy the EXE into
+ C:\Windows or somewhere else in your %PATH% that a 32-bit app (cmake)
+ can find it. If it is not in the path, you must manually tell cmake
+ where to find it. Note: you do not need the vsyasm packages, x265
+ does not use them. You only need the yasm executable itself.
- For Linux, yasm-1.2.0 is likely too new to be packaged for your system so you
- will need get http://www.tortall.net/projects/yasm/releases/yasm-1.2.0.tar.gz
- compile, and install it.
+ On Linux, the packaged yasm may be older than 1.2, in which case
+ you will need to get the latest source and build it yourself.
Once YASM is properly installed, run cmake to regenerate projects. If you
do not see the below line in the cmake output, YASM is not in the PATH.
- -- Found Yasm 1.2.0 to build assembly primitives
+ -- Found Yasm 1.3.0 to build assembly primitives
- Now build the encoder and run x265 -V. If you see "assembly" on this
- line, you have YASM properly installed:
+ Now build the encoder and run x265 -V:
- x265 [info]: performance primitives: intrinsic assembly
+ x265 [info]: using cpu capabilities: MMX, SSE2, ...
+
+ If the cpu capabilities line says 'none!', then the encoder was built
+ without yasm.
2. VisualLeakDetector (Windows Only)
diff --git a/build/icl32/build-all.bat b/build/icl32/build-all.bat
deleted file mode 100644
index cbe9a59..0000000
--- a/build/icl32/build-all.bat
+++ /dev/null
@@ -1,14 +0,0 @@
-@echo off
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
- msg "%username%" "Intel C++ 2013 not detected"
- exit 1
-)
-if not exist Makefile (
- call make-makefile.bat
-)
-if exist Makefile (
- call "%ICL%\bin\compilervars.bat" ia32
- nmake
-)
diff --git a/build/icl32/make-makefile.bat b/build/icl32/make-makefile.bat
deleted file mode 100644
index 799344e..0000000
--- a/build/icl32/make-makefile.bat
+++ /dev/null
@@ -1,15 +0,0 @@
-@echo off
-::
-:: run this batch file to create an Intel C++ 2013 NMake makefile for this project.
-:: See the cmake documentation for other generator targets
-::
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
- msg "%username%" "Intel C++ 2013 not detected"
- exit 1
-)
-call "%ICL%\bin\compilervars.bat" ia32
-set CC=icl
-set CXX=icl
-cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source
diff --git a/build/icl64/build-all.bat b/build/icl64/build-all.bat
deleted file mode 100644
index d1d6b8d..0000000
--- a/build/icl64/build-all.bat
+++ /dev/null
@@ -1,14 +0,0 @@
-@echo off
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
- msg "%username%" "Intel C++ 2013 not detected"
- exit 1
-)
-if not exist Makefile (
- call make-makefile.bat
-)
-if exist Makefile (
- call "%ICL%\bin\compilervars.bat" intel64
- nmake
-)
diff --git a/build/icl64/make-makefile.bat b/build/icl64/make-makefile.bat
deleted file mode 100644
index 2d3f629..0000000
--- a/build/icl64/make-makefile.bat
+++ /dev/null
@@ -1,17 +0,0 @@
-@echo off
-::
-:: run this batch file to create an Intel C++ 2013 NMake makefile for this project.
-:: See the cmake documentation for other generator targets
-::
-if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" )
-if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" )
-if "%ICL%" == "" (
- msg "%username%" "Intel C++ 2013 not detected"
- pause
- exit 1
-)
-call "%ICL%\bin\compilervars.bat" intel64
-set CC=icl
-set CXX=icl
-cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source
-pause
diff --git a/doc/intra/intra-4x4.txt b/doc/intra/intra-4x4.txt
index 1609d0d..18ab7ce 100644
--- a/doc/intra/intra-4x4.txt
+++ b/doc/intra/intra-4x4.txt
@@ -1,123 +1,123 @@
---- 4x4, Mode= 2 ---
-[ 0]: Fact= 0: -2, -3, -4, -5, *
-[ 1]: Fact= 0: -3, -4, -5, -6, *
-[ 2]: Fact= 0: -4, -5, -6, -7, *
-[ 3]: Fact= 0: -5, -6, -7, -8, *
+--- 4x4, Mode= 2 --- [refPix]
+[ 0]: Fact= 0: 10, 11, 12, 13 *
+[ 1]: Fact= 0: 11, 12, 13, 14 *
+[ 2]: Fact= 0: 12, 13, 14, 15 *
+[ 3]: Fact= 0: 13, 14, 15, 16 *
--- 4x4, Mode= 3 ---
-[ 0]: Fact=26: -1, -2, -3, -4, -5, *
-[ 1]: Fact=20: -2, -3, -4, -5, -6, *
-[ 2]: Fact=14: -3, -4, -5, -6, -7, *
-[ 3]: Fact= 8: -4, -5, -6, -7, -8, *
+[ 0]: Fact=26: 9, 10, 11, 12, 13 *
+[ 1]: Fact=20: 10, 11, 12, 13, 14 *
+[ 2]: Fact=14: 11, 12, 13, 14, 15 *
+[ 3]: Fact= 8: 12, 13, 14, 15, 16 *
--- 4x4, Mode= 4 ---
-[ 0]: Fact=21: -1, -2, -3, -4, -5, *
-[ 1]: Fact=10: -2, -3, -4, -5, -6, *
-[ 2]: Fact=31: -2, -3, -4, -5, -6, *
-[ 3]: Fact=20: -3, -4, -5, -6, -7, *
+[ 0]: Fact=21: 9, 10, 11, 12, 13 *
+[ 1]: Fact=10: 10, 11, 12, 13, 14 *
+[ 2]: Fact=31: 10, 11, 12, 13, 14 *
+[ 3]: Fact=20: 11, 12, 13, 14, 15 *
--- 4x4, Mode= 5 ---
-[ 0]: Fact=17: -1, -2, -3, -4, -5, *
-[ 1]: Fact= 2: -2, -3, -4, -5, -6, *
-[ 2]: Fact=19: -2, -3, -4, -5, -6, *
-[ 3]: Fact= 4: -3, -4, -5, -6, -7, *
+[ 0]: Fact=17: 9, 10, 11, 12, 13 *
+[ 1]: Fact= 2: 10, 11, 12, 13, 14 *
+[ 2]: Fact=19: 10, 11, 12, 13, 14 *
+[ 3]: Fact= 4: 11, 12, 13, 14, 15 *
--- 4x4, Mode= 6 ---
-[ 0]: Fact=13: -1, -2, -3, -4, -5, *
-[ 1]: Fact=26: -1, -2, -3, -4, -5, *
-[ 2]: Fact= 7: -2, -3, -4, -5, -6, *
-[ 3]: Fact=20: -2, -3, -4, -5, -6, *
+[ 0]: Fact=13: 9, 10, 11, 12, 13 *
+[ 1]: Fact=26: 9, 10, 11, 12, 13 *
+[ 2]: Fact= 7: 10, 11, 12, 13, 14 *
+[ 3]: Fact=20: 10, 11, 12, 13, 14 *
--- 4x4, Mode= 7 ---
-[ 0]: Fact= 9: -1, -2, -3, -4, -5, *
-[ 1]: Fact=18: -1, -2, -3, -4, -5, *
-[ 2]: Fact=27: -1, -2, -3, -4, -5, *
-[ 3]: Fact= 4: -2, -3, -4, -5, -6, *
+[ 0]: Fact= 9: 9, 10, 11, 12, 13 *
+[ 1]: Fact=18: 9, 10, 11, 12, 13 *
+[ 2]: Fact=27: 9, 10, 11, 12, 13 *
+[ 3]: Fact= 4: 10, 11, 12, 13, 14 *
--- 4x4, Mode= 8 ---
-[ 0]: Fact= 5: -1, -2, -3, -4, -5, *
-[ 1]: Fact=10: -1, -2, -3, -4, -5, *
-[ 2]: Fact=15: -1, -2, -3, -4, -5, *
-[ 3]: Fact=20: -1, -2, -3, -4, -5, *
+[ 0]: Fact= 5: 9, 10, 11, 12, 13 *
+[ 1]: Fact=10: 9, 10, 11, 12, 13 *
+[ 2]: Fact=15: 9, 10, 11, 12, 13 *
+[ 3]: Fact=20: 9, 10, 11, 12, 13 *
--- 4x4, Mode= 9 ---
-[ 0]: Fact= 2: -1, -2, -3, -4, -5, *
-[ 1]: Fact= 4: -1, -2, -3, -4, -5, *
-[ 2]: Fact= 6: -1, -2, -3, -4, -5, *
-[ 3]: Fact= 8: -1, -2, -3, -4, -5, *
---- 4x4, Mode=10 ---
-[ 0]: Fact= 0: -1, -2, -3, -4, *
-[ 1]: Fact= 0: -1, -2, -3, -4, *
-[ 2]: Fact= 0: -1, -2, -3, -4, *
-[ 3]: Fact= 0: -1, -2, -3, -4, *
+[ 0]: Fact= 2: 9, 10, 11, 12, 13 *
+[ 1]: Fact= 4: 9, 10, 11, 12, 13 *
+[ 2]: Fact= 6: 9, 10, 11, 12, 13 *
+[ 3]: Fact= 8: 9, 10, 11, 12, 13 *
+--- 4x4, Mode=10 --- filtPix
+[ 0]: Fact= 0: 9, 10, 11, 12 *
+[ 1]: Fact= 0: 9, 10, 11, 12 *
+[ 2]: Fact= 0: 9, 10, 11, 12 *
+[ 3]: Fact= 0: 9, 10, 11, 12 *
--- 4x4, Mode=11 ---
-[ 0]: Fact=30: 0, -1, -2, -3, -4, *
-[ 1]: Fact=28: 0, -1, -2, -3, -4, *
-[ 2]: Fact=26: 0, -1, -2, -3, -4, *
-[ 3]: Fact=24: 0, -1, -2, -3, -4, *
+[ 0]: Fact=30: 0, 9, 10, 11, 12 *
+[ 1]: Fact=28: 0, 9, 10, 11, 12 *
+[ 2]: Fact=26: 0, 9, 10, 11, 12 *
+[ 3]: Fact=24: 0, 9, 10, 11, 12 *
--- 4x4, Mode=12 ---
-[ 0]: Fact=27: 0, -1, -2, -3, -4, *
-[ 1]: Fact=22: 0, -1, -2, -3, -4, *
-[ 2]: Fact=17: 0, -1, -2, -3, -4, *
-[ 3]: Fact=12: 0, -1, -2, -3, -4, *
+[ 0]: Fact=27: 0, 9, 10, 11, 12 *
+[ 1]: Fact=22: 0, 9, 10, 11, 12 *
+[ 2]: Fact=17: 0, 9, 10, 11, 12 *
+[ 3]: Fact=12: 0, 9, 10, 11, 12 *
--- 4x4, Mode=13 ---
-[ 0]: Fact=23: 0, -1, -2, -3, -4, *
-[ 1]: Fact=14: 0, -1, -2, -3, -4, *
-[ 2]: Fact= 5: 0, -1, -2, -3, -4, *
-[ 3]: Fact=28: 4, 0, -1, -2, -3, *
+[ 0]: Fact=23: 0, 9, 10, 11, 12 *
+[ 1]: Fact=14: 0, 9, 10, 11, 12 *
+[ 2]: Fact= 5: 0, 9, 10, 11, 12 *
+[ 3]: Fact=28: 4, 0, 9, 10, 11 *
--- 4x4, Mode=14 ---
-[ 0]: Fact=19: 0, -1, -2, -3, -4, *
-[ 1]: Fact= 6: 0, -1, -2, -3, -4, *
-[ 2]: Fact=25: 2, 0, -1, -2, -3, *
-[ 3]: Fact=12: 2, 0, -1, -2, -3, *
+[ 0]: Fact=19: 0, 9, 10, 11, 12 *
+[ 1]: Fact= 6: 0, 9, 10, 11, 12 *
+[ 2]: Fact=25: 2, 0, 9, 10, 11 *
+[ 3]: Fact=12: 2, 0, 9, 10, 11 *
--- 4x4, Mode=15 ---
-[ 0]: Fact=15: 0, -1, -2, -3, -4, *
-[ 1]: Fact=30: 2, 0, -1, -2, -3, *
-[ 2]: Fact=13: 2, 0, -1, -2, -3, *
-[ 3]: Fact=28: 4, 2, 0, -1, -2, *
+[ 0]: Fact=15: 0, 9, 10, 11, 12 *
+[ 1]: Fact=30: 2, 0, 9, 10, 11 *
+[ 2]: Fact=13: 2, 0, 9, 10, 11 *
+[ 3]: Fact=28: 4, 2, 0, 9, 10 *
--- 4x4, Mode=16 ---
-[ 0]: Fact=11: 0, -1, -2, -3, -4, *
-[ 1]: Fact=22: 2, 0, -1, -2, -3, *
-[ 2]: Fact= 1: 2, 0, -1, -2, -3, *
-[ 3]: Fact=12: 3, 2, 0, -1, -2, *
+[ 0]: Fact=11: 0, 9, 10, 11, 12 *
+[ 1]: Fact=22: 2, 0, 9, 10, 11 *
+[ 2]: Fact= 1: 2, 0, 9, 10, 11 *
+[ 3]: Fact=12: 3, 2, 0, 9, 10 *
--- 4x4, Mode=17 ---
-[ 0]: Fact= 6: 0, -1, -2, -3, -4, *
-[ 1]: Fact=12: 1, 0, -1, -2, -3, *
-[ 2]: Fact=18: 2, 1, 0, -1, -2, *
-[ 3]: Fact=24: 4, 2, 1, 0, -1, *
+[ 0]: Fact= 6: 0, 9, 10, 11, 12 *
+[ 1]: Fact=12: 1, 0, 9, 10, 11 *
+[ 2]: Fact=18: 2, 1, 0, 9, 10 *
+[ 3]: Fact=24: 4, 2, 1, 0, 9 *
--- 4x4, Mode=18 ---
-[ 0]: Fact= 0: 0, 1, 2, 3, *
-[ 1]: Fact= 0: -1, 0, 1, 2, *
-[ 2]: Fact= 0: -2, -1, 0, 1, *
-[ 3]: Fact= 0: -3, -2, -1, 0, *
+[ 0]: Fact= 0: 0, 1, 2, 3 *
+[ 1]: Fact= 0: 9, 0, 1, 2 *
+[ 2]: Fact= 0: 10, 9, 0, 1 *
+[ 3]: Fact= 0: 11, 10, 9, 0 *
--- 4x4, Mode=19 ---
-[ 0]: Fact= 6: 0, 1, 2, 3, 4, *
-[ 1]: Fact=12: -1, 0, 1, 2, 3, *
-[ 2]: Fact=18: -2, -1, 0, 1, 2, *
-[ 3]: Fact=24: -4, -2, -1, 0, 1, *
+[ 0]: Fact= 6: 0, 1, 2, 3, 4 *
+[ 1]: Fact=12: 9, 0, 1, 2, 3 *
+[ 2]: Fact=18: 10, 9, 0, 1, 2 *
+[ 3]: Fact=24: 12, 10, 9, 0, 1 *
--- 4x4, Mode=20 ---
-[ 0]: Fact=11: 0, 1, 2, 3, 4, *
-[ 1]: Fact=22: -2, 0, 1, 2, 3, *
-[ 2]: Fact= 1: -2, 0, 1, 2, 3, *
-[ 3]: Fact=12: -3, -2, 0, 1, 2, *
+[ 0]: Fact=11: 0, 1, 2, 3, 4 *
+[ 1]: Fact=22: 10, 0, 1, 2, 3 *
+[ 2]: Fact= 1: 10, 0, 1, 2, 3 *
+[ 3]: Fact=12: 11, 10, 0, 1, 2 *
--- 4x4, Mode=21 ---
-[ 0]: Fact=15: 0, 1, 2, 3, 4, *
-[ 1]: Fact=30: -2, 0, 1, 2, 3, *
-[ 2]: Fact=13: -2, 0, 1, 2, 3, *
-[ 3]: Fact=28: -4, -2, 0, 1, 2, *
+[ 0]: Fact=15: 0, 1, 2, 3, 4 *
+[ 1]: Fact=30: 10, 0, 1, 2, 3 *
+[ 2]: Fact=13: 10, 0, 1, 2, 3 *
+[ 3]: Fact=28: 12, 10, 0, 1, 2 *
--- 4x4, Mode=22 ---
-[ 0]: Fact=19: 0, 1, 2, 3, 4, *
-[ 1]: Fact= 6: 0, 1, 2, 3, 4, *
-[ 2]: Fact=25: -2, 0, 1, 2, 3, *
-[ 3]: Fact=12: -2, 0, 1, 2, 3, *
+[ 0]: Fact=19: 0, 1, 2, 3, 4 *
+[ 1]: Fact= 6: 0, 1, 2, 3, 4 *
+[ 2]: Fact=25: 10, 0, 1, 2, 3 *
+[ 3]: Fact=12: 10, 0, 1, 2, 3 *
--- 4x4, Mode=23 ---
-[ 0]: Fact=23: 0, 1, 2, 3, 4, *
-[ 1]: Fact=14: 0, 1, 2, 3, 4, *
-[ 2]: Fact= 5: 0, 1, 2, 3, 4, *
-[ 3]: Fact=28: -4, 0, 1, 2, 3, *
+[ 0]: Fact=23: 0, 1, 2, 3, 4 *
+[ 1]: Fact=14: 0, 1, 2, 3, 4 *
+[ 2]: Fact= 5: 0, 1, 2, 3, 4 *
+[ 3]: Fact=28: 12, 0, 1, 2, 3 *
--- 4x4, Mode=24 ---
-[ 0]: Fact=27: 0, 1, 2, 3, 4, *
-[ 1]: Fact=22: 0, 1, 2, 3, 4, *
-[ 2]: Fact=17: 0, 1, 2, 3, 4, *
-[ 3]: Fact=12: 0, 1, 2, 3, 4, *
+[ 0]: Fact=27: 0, 1, 2, 3, 4 *
+[ 1]: Fact=22: 0, 1, 2, 3, 4 *
+[ 2]: Fact=17: 0, 1, 2, 3, 4 *
+[ 3]: Fact=12: 0, 1, 2, 3, 4 *
--- 4x4, Mode=25 ---
-[ 0]: Fact=30: 0, 1, 2, 3, 4, *
-[ 1]: Fact=28: 0, 1, 2, 3, 4, *
-[ 2]: Fact=26: 0, 1, 2, 3, 4, *
-[ 3]: Fact=24: 0, 1, 2, 3, 4, *
+[ 0]: Fact=30: 0, 1, 2, 3, 4 *
+[ 1]: Fact=28: 0, 1, 2, 3, 4 *
+[ 2]: Fact=26: 0, 1, 2, 3, 4 *
+[ 3]: Fact=24: 0, 1, 2, 3, 4 *
--- 4x4, Mode=26 ---
[ 0]: Fact= 0: 1, 2, 3, 4, *
[ 1]: Fact= 0: 1, 2, 3, 4, *
diff --git a/doc/reST/Makefile b/doc/reST/Makefile
index 6b1d44c..b2d1c3d 100644
--- a/doc/reST/Makefile
+++ b/doc/reST/Makefile
@@ -13,7 +13,7 @@ PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-.PHONY: help clean html web pickle htmlhelp qthelp qhc latex changes linkcheck
+.PHONY: help clean distclean html web pickle htmlhelp qthelp qhc latex changes linkcheck
help:
@echo "Please use \`make <target>' where <target> is one of"
@@ -24,12 +24,16 @@ help:
@echo " qthelp to make HTML files and a qthelp project"
@echo " qhc to make QHC file"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " man to make manpages"
@echo " changes to make an overview over all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
clean:
-rm -rf build/*
+distclean: clean
+ -rmdir build/
+
html:
mkdir -p build/html build/doctrees
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html
@@ -83,6 +87,14 @@ latex:
@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
"run these through (pdf)latex."
+man:
+ mkdir -p build/man build/doctrees
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) build/man
+ @echo
+ @echo "Build finished; the manpages are in build/man."
+ @echo "Run \`man -l build/man/x265.1' or \`man -l build/man/libx265.3'" \
+ "to view them."
+
changes:
mkdir -p build/changes build/doctrees
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes
diff --git a/doc/reST/api.rst b/doc/reST/api.rst
index 214881a..f15212d 100644
--- a/doc/reST/api.rst
+++ b/doc/reST/api.rst
@@ -32,6 +32,12 @@ library was compiled (it will contain a value of 8 or 10). Further,
x265 which was compiled, and **x265_build_info_str** is a pointer to a
string identifying the compiler and build options.
+.. Note::
+
+ **x265_version_str** is only updated when **cmake** runs. If you are
+ making binaries for others to use, it is recommended to run
+ **cmake** prior to **make** in your build scripts.
+
x265 will accept input pixels of any depth between 8 and 16 bits
regardless of the depth of its internal pixels (8 or 10). It will shift
and mask input pixels as required to reach the internal depth. If
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index 324b83a..84a6783 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -28,10 +28,10 @@ consider this an error and abort.
Generally, when an option expects a string value from a list of strings
the user may specify the integer ordinal of the value they desire. ie:
-:option:`--log-level` 3 is equivalent to :option:`--log-level` debug.
+:option:`--log-level` 4 is equivalent to :option:`--log-level` debug.
-Standalone Executable Options
-=============================
+Executable Options
+==================
.. option:: --help, -h
@@ -45,6 +45,109 @@ Standalone Executable Options
**CLI ONLY**
+Command line executable return codes::
+
+ 0. encode successful
+ 1. unable to parse command line
+ 2. unable to open encoder
+ 3. unable to generate stream headers
+ 4. encoder abort
+
+Logging/Statistic Options
+=========================
+
+.. option:: --log-level <integer|string>
+
+ Logging level. Debug level enables per-frame QP, metric, and bitrate
+ logging. If a CSV file is being generated, frame level makes the log
+ be per-frame rather than per-encode. Full level enables hash and
+ weight logging. -1 disables all logging, except certain fatal
+ errors, and can be specified by the string "none".
+
+ 0. error
+ 1. warning
+ 2. info **(default)**
+ 3. frame
+ 4. debug
+ 5. full
+
+.. option:: --no-progress
+
+ Disable periodic progress reports from the CLI
+
+ **CLI ONLY**
+
+.. option:: --csv <filename>
+
+ Writes encoding results to a comma separated value log file. Creates
+ the file if it doesn't already exist, else adds one line per run. If
+ :option:`--log-level` is frame or above, it writes one line per
+ frame. Default none
+
+ When frame level logging is enabled, several frame performance
+ statistics are listed:
+
+ **DecideWait ms** number of milliseconds the frame encoder had to
+ wait, since the previous frame was retrieved by the API thread,
+ before a new frame has been given to it. This is the latency
+ introduced by slicetype decisions (lookahead).
+
+ **Row0Wait ms** number of milliseconds since the frame encoder
+ received a frame to encode before its first row of CTUs is allowed
+ to begin compression. This is the latency introduced by reference
+ frames making reconstructed and filtered rows available.
+
+ **Wall time ms** number of milliseconds between the first CTU
+ being ready to be compressed and the entire frame being compressed
+ and the output NALs being completed.
+
+ **Ref Wait Wall ms** number of milliseconds between the first
+ reference row being available and the last reference row becoming
+ available.
+
+ **Total CTU time ms** the total time (measured in milliseconds)
+ spent by worker threads compressing and filtering CTUs for this
+ frame.
+
+ **Stall Time ms** the number of milliseconds of the reported wall
+ time that were spent with zero worker threads, aka all compression
+ was completely stalled.
+
+ **Avg WPP** the average number of worker threads working on this
+ frame, at any given time. This value is sampled at the completion of
+ each CTU. This shows the effectiveness of Wavefront Parallel
+ Processing.
+
+ **Row Blocks** the number of times a worker thread had to abandon
+ the row of CTUs it was encoding because the row above it was not far
+ enough ahead for the necessary reference data to be available. This
+ is more of a problem for P frames where some blocks are much more
+ expensive than others.
+
+
+.. option:: --cu-stats, --no-cu-stats
+
+ Records statistics on how each CU was coded (split depths and other
+ mode decisions) and reports those statistics at the end of the
+ encode. Default disabled
+
+.. option:: --ssim, --no-ssim
+
+ Calculate and report Structural Similarity values. It is
+ recommended to use :option:`--tune` ssim if you are measuring ssim,
+ else the results should not be used for comparison purposes.
+ Default disabled
+
+.. option:: --psnr, --no-psnr
+
+ Calculate and report Peak Signal to Noise Ratio. It is recommended
+ to use :option:`--tune` psnr if you are measuring PSNR, else the
+ results should not be used for comparison purposes. Default
+ disabled
+
+Performance Options
+===================
+
.. option:: --asm <integer:false:string>, --no-asm
x265 will use all detected CPU SIMD architectures by default. You can
@@ -57,13 +160,24 @@ Standalone Executable Options
One may also directly supply the CPU capability bitmap as an integer.
+.. option:: --frame-threads, -F <integer>
+
+ Number of concurrently encoded frames. Using a single frame thread
+ gives a slight improvement in compression, since the entire reference
+ frames are always available for motion compensation, but it has
+ severe performance implications. Default is an autodetected count
+ based on the number of CPU cores and whether WPP is enabled or not.
+
+ Over-allocation of frame threads will not improve performance, it
+ will generally just increase memory use.
+
.. option:: --threads <integer>
Number of threads to allocate for the worker thread pool This pool
is used for WPP and for distributed analysis and motion search:
:option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
- If :option:`--threads`=1 is specified, then no thread pool is
+ If :option:`--threads` 1 is specified, then no thread pool is
created. When no thread pool is created, all the thread pool
features are implicitly disabled. If all the pool features are
disabled by the user, then the pool is implicitly disabled.
@@ -71,13 +185,24 @@ Standalone Executable Options
Default 0, one thread is allocated per detected hardware thread
(logical CPU cores)
+.. option:: --wpp, --no-wpp
+
+ Enable Wavefront Parallel Processing. The encoder may begin encoding
+ a row as soon as the row above it is at least two CTUs ahead in the
+ encode process. This gives a 3-5x gain in parallelism for about 1%
+ overhead in compression efficiency.
+
+ This feature is implicitly disabled when no thread pool is present.
+
+ Default: Enabled
+
.. option:: --pmode, --no-pmode
Parallel mode decision, or distributed mode analysis. When enabled
the encoder will distribute the analysis work of each CU (merge,
inter, intra) across multiple worker threads. Only recommended if
x265 is not already saturating the CPU cores. In RD levels 3 and 4
- it will be most effective if --rect was enabled. At RD levels 5 and
+ it will be most effective if --rect is enabled. At RD levels 5 and
6 there is generally always enough work to distribute to warrant the
overhead, assuming your CPUs are not already saturated.
@@ -85,7 +210,8 @@ Standalone Executable Options
efficiency. In fact, since the modes are all measured in parallel it
makes certain early-outs impractical and thus you usually get
slightly better compression when it is enabled (at the expense of
- not skipping improbable modes).
+ not skipping improbable modes). This bypassing of early-outs can
+ cause pmode to slow down encodes, especially at faster presets.
This feature is implicitly disabled when no thread pool is present.
@@ -113,7 +239,8 @@ Standalone Executable Options
Sets parameters to preselected values, trading off compression efficiency against
encoding speed. These parameters are applied before all other input parameters are
- applied, and so you can override any parameters that these values control.
+ applied, and so you can override any parameters that these values control. See
+ :ref:`presets <presets>` for more detail.
0. ultrafast
1. superfast
@@ -129,84 +256,18 @@ Standalone Executable Options
.. option:: --tune, -t <string>
Tune the settings for a particular type of source or situation. The changes will
- be applied after :option:`--preset` but before all other parameters. Default none
-
- **Values:** psnr, ssim, zero-latency, fast-decode.
-
-.. option:: --frame-threads, -F <integer>
-
- Number of concurrently encoded frames. Using a single frame thread
- gives a slight improvement in compression, since the entire reference
- frames are always available for motion compensation, but it has
- severe performance implications. Default is an autodetected count
- based on the number of CPU cores and whether WPP is enabled or not.
-
- Over-allocation of frame threads will not improve performance, it
- will generally just increase memory use.
-
-.. option:: --log-level <integer|string>
-
- Logging level. Debug level enables per-frame QP, metric, and bitrate
- logging. If a CSV file is being generated, debug level makes the log
- be per-frame rather than per-encode. Full level enables hash and
- weight logging. -1 disables all logging, except certain fatal
- errors, and can be specified by the string "none".
-
- 0. error
- 1. warning
- 2. info **(default)**
- 3. debug
- 4. full
-
-.. option:: --csv <filename>
-
- Writes encoding results to a comma separated value log file. Creates
- the file if it doesnt already exist, else adds one line per run. if
- :option:`--log-level` is debug or above, it writes one line per
- frame. Default none
-
-.. option:: --cu-stats, --no-cu-stats
-
- Records statistics on how each CU was coded (split depths and other
- mode decisions) and reports those statistics at the end of the
- encode. Default disabled
-
-.. option:: --output, -o <filename>
-
- Bitstream output file name. If there are two extra CLI options, the
- first is implicitly the input filename and the second is the output
- filename, making the :option:`--output` option optional.
-
- The output file will always contain a raw HEVC bitstream, the CLI
- does not support any container file formats.
-
- **CLI ONLY**
-
-.. option:: --no-progress
-
- Disable CLI periodic progress reports
+ be applied after :option:`--preset` but before all other parameters. Default none.
+ See :ref:`tunings <tunings>` for more detail.
- **CLI ONLY**
+ **Values:** psnr, ssim, grain, zero-latency, fast-decode.
-Quality reporting metrics
+Input/Output File Options
=========================
-.. option:: --ssim, --no-ssim
-
- Calculate and report Structural Similarity values. It is
- recommended to use :option:`--tune` ssim if you are measuring ssim,
- else the results should not be used for comparison purposes.
- Default disabled
-
-.. option:: --psnr, --no-psnr
-
- Calculate and report Peak Signal to Noise Ratio. It is recommended
- to use :option:`--tune` psnr if you are measuring PSNR, else the
- results should not be used for comparison purposes. Default
- disabled
-
-Input Options
-=============
+These options all describe the input video sequence or, in the case of
+:option:`--dither`, operations that are performed on the sequence prior
+to encode. All options dealing with files (names, formats, offsets or
+frame counts) are only applicable to the CLI application.
.. option:: --input <filename>
@@ -242,21 +303,6 @@ Input Options
**CLI ONLY**
-.. option:: --nr <integer>
-
- Noise reduction - an adaptive deadzone applied after DCT
- (subtracting from DCT coefficients), before quantization, on inter
- blocks. It does no pixel-level filtering, doesn't cross DCT block
- boundaries, has no overlap, doesn't affect intra blocks. The higher
- the strength value parameter, the more aggressively it will reduce
- noise.
-
- Enabling noise reduction will make outputs diverge between different
- numbers of frame threads. Outputs will be deterministic but the
- outputs of -F2 will no longer match the outputs of -F3, etc.
-
- **Values:** any value in range of 100 to 1000. Default disabled.
-
.. option:: --input-res <wxh>
YUV only: Source picture size [w x h]
@@ -285,8 +331,6 @@ Input Options
.. option:: --interlaceMode <false|tff|bff>, --no-interlaceMode
- **EXPERIMENTAL** Specify interlace type of source pictures.
-
0. progressive pictures **(default)**
1. top field first
2. bottom field first
@@ -305,61 +349,20 @@ Input Options
.. option:: --frames, -f <integer>
- Number of frames to be encoded. Default 0 (all)
+ Number of frames of input sequence to be encoded. Default 0 (all)
**CLI ONLY**
-.. option:: --qpfile <filename>
-
- Specify a text file which contains frametypes and QPs for some or
- all frames. The format of each line is:
-
- framenumber frametype QP
-
- Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
- **b** is an unreferenced B frame. **I** is a keyframe (random
- access point) while **i** is a I frame that is not a keyframe
- (references are not broken).
-
- Specifying QP (integer) is optional, and if specified they are
- clamped within the encoder to qpmin/qpmax.
-
-.. option:: --scaling-list <filename>
-
- Quantization scaling lists. HEVC supports 6 quantization scaling
- lists to be defined; one each for Y, Cb, Cr for intra prediction and
- one each for inter prediction.
-
- x265 does not use scaling lists by default, but this can also be
- made explicit by :option:`--scaling-list` *off*.
-
- HEVC specifies a default set of scaling lists which may be enabled
- without requiring them to be signaled in the SPS. Those scaling
- lists can be enabled via :option:`--scaling-list` *default*.
-
- All other strings indicate a filename containing custom scaling
- lists in the HM format. The encode will abort if the file is not
- parsed correctly. Custom lists must be signaled in the SPS
+.. option:: --output, -o <filename>
-.. option:: --lambda-file <filename>
+ Bitstream output file name. If there are two extra CLI options, the
+ first is implicitly the input filename and the second is the output
+ filename, making the :option:`--output` option optional.
- Specify a text file containing values for x265_lambda_tab and
- x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
- values.
-
- The text file syntax is simple. Comma is considered to be
- white-space. All white-space is ignored. Lines must be less than 2k
- bytes in length. Content following hash (#) characters are ignored.
- The values read from the file are logged at :option:`--log-level`
- debug.
+ The output file will always contain a raw HEVC bitstream, the CLI
+ does not support any container file formats.
- Note that the lambda tables are process-global and so the new values
- affect all encoders running in the same process.
-
- Lambda values affect encoder mode decisions, the lower the lambda
- the more bits it will try to spend on signaling information (motion
- vectors and splits) and less on residual. This feature is intended
- for experimentation.
+ **CLI ONLY**
Profile, Level, Tier
====================
@@ -417,15 +420,41 @@ Profile, Level, Tier
parameters to meet those requirements but it will never raise
them.
-Quad-Tree analysis
-==================
+Mode decision / Analysis
+========================
-.. option:: --wpp, --no-wpp
+.. option:: --rd <0..6>
- Enable Wavefront Parallel Processing. The encoder may begin encoding
- a row as soon as the row above it is at least two CTUs ahead in the
- encode process. This gives a 3-5x gain in parallelism for about 1%
- overhead in compression efficiency. Default: Enabled
+ Level of RDO in mode decision. The higher the value, the more
+ exhaustive the analysis and the more rate distortion optimization is
+ used. The lower the value the faster the encode, the higher the
+ value the smaller the bitstream (in general). Default 3
+
+ Note that this table aims for accuracy, but is not necessarily our
+ final target behavior for each mode.
+
+ +-------+---------------------------------------------------------------+
+ | Level | Description |
+ +=======+===============================================================+
+ | 0 | sa8d mode and split decisions, intra w/ source pixels |
+ +-------+---------------------------------------------------------------+
+ | 1 | recon generated (better intra), RDO merge/skip selection |
+ +-------+---------------------------------------------------------------+
+ | 2 | RDO splits and merge/skip selection |
+ +-------+---------------------------------------------------------------+
+ | 3 | RDO mode and split decisions, chroma residual used for sa8d |
+ +-------+---------------------------------------------------------------+
+ | 4 | Adds RDO Quant |
+ +-------+---------------------------------------------------------------+
+ | 5 | Adds RDO prediction decisions |
+ +-------+---------------------------------------------------------------+
+ | 6 | Currently same as 5 |
+ +-------+---------------------------------------------------------------+
+
+ **Range of values:** 0: least .. 6: full RDO analysis
+
+Options which affect the coding unit quad-tree, sometimes referred to as
+the prediction quad-tree.
.. option:: --ctu, -s <64|32|16>
@@ -436,6 +465,108 @@ Quad-Tree analysis
and less frame parallelism as well. Because of this the faster
presets use a CU size of 32. Default: 64
+.. option:: --rect, --no-rect
+
+ Enable analysis of rectangular motion partitions Nx2N and 2NxN
+ (50/50 splits, two directions). Default disabled
+
+.. option:: --amp, --no-amp
+
+ Enable analysis of asymmetric motion partitions (75/25 splits, four
+ directions). At RD levels 0 through 4, AMP partitions are only
+ considered at CU sizes 32x32 and below. At RD levels 5 and 6, it
+ will only consider AMP partitions as merge candidates (no motion
+ search) at 64x64, and as merge or inter candidates below 64x64.
+
+ The AMP partitions which are searched are derived from the current
+ best inter partition. If Nx2N (vertical rectangular) is the best
+ current prediction, then left and right asymmetrical splits will be
+ evaluated. If 2NxN (horizontal rectangular) is the best current
+ prediction, then top and bottom asymmetrical splits will be
+ evaluated. If 2Nx2N is the best prediction, and the block is not a
+ merge/skip, then all four AMP partitions are evaluated.
+
+ This setting has no effect if rectangular partitions are disabled.
+ Default disabled
+
+.. option:: --early-skip, --no-early-skip
+
+ Measure full CU size (2Nx2N) merge candidates first; if no residual
+ is found the analysis is short circuited. Default disabled
+
+.. option:: --fast-cbf, --no-fast-cbf
+
+ Short circuit analysis if a prediction is found that does not set
+ the coded block flag (aka: no residual was encoded). It prevents
+ the encoder from perhaps finding other predictions that also have no
+ residual but require less signaling bits or have less distortion.
+ Only applicable for RD levels 5 and 6. Default disabled
+
+.. option:: --fast-intra, --no-fast-intra
+
+ Perform an initial scan of every fifth intra angular mode, then
+ check modes +/- 2 distance from the best mode, then +/- 1 distance
+ from the best mode, effectively performing a gradient descent. When
+ enabled 10 modes in total are checked. When disabled all 33 angular
+ modes are checked. Only applicable for :option:`--rd` levels 4 and
+ below (medium preset and faster).
+
+.. option:: --b-intra, --no-b-intra
+
+ Enables the evaluation of intra modes in B slices. Default disabled.
+
+.. option:: --cu-lossless, --no-cu-lossless
+
+ For each CU, evaluate lossless (transform and quant bypass) encode
+ of the best non-lossless mode option as a potential rate distortion
+ optimization. If the global option :option:`--lossless` has been
+ specified, all CUs will be encoded as lossless unconditionally
+ regardless of whether this option was enabled. Default disabled.
+
+ Only effective at RD levels 3 and above, which perform RDO mode
+ decisions.
+
+.. option:: --tskip, --no-tskip
+
+ Enable evaluation of transform skip (bypass DCT but still use
+ quantization) coding for 4x4 TU coded blocks.
+
+ Only effective at RD levels 3 and above, which perform RDO mode
+ decisions. Default disabled
+
+.. option:: --tskip-fast, --no-tskip-fast
+
+ Only evaluate transform skip for NxN intra predictions (4x4 blocks).
+ Only applicable if transform skip is enabled. For chroma, only
+ evaluate if luma used tskip. Inter block tskip analysis is
+ unmodified. Default disabled
+
+Analysis re-use options, to improve performance when encoding the same
+sequence multiple times (presumably at varying bitrates). The encoder
+will not reuse analysis if the resolution and slice type parameters do
+not match.
+
+.. option:: --analysis-mode <string|int>
+
+ Specify whether analysis information of each frame is output by encoder
+ or input for reuse. By reading the analysis data written by an
+ earlier encode of the same sequence, substantial redundant work may
+ be avoided.
+
+ The following data may be stored and reused:
+ I frames - split decisions and luma intra directions of all CUs.
+ P/B frames - motion vectors are dumped at each depth for all CUs.
+
+ **Values:** off(0), save(1): dump analysis data, load(2): read analysis data
+
+.. option:: --analysis-file <filename>
+
+ Specify a filename for analysis data (see :option:`--analysis-mode`)
+ If no filename is specified, x265_analysis.dat is used.
+
+Options which affect the transform unit quad-tree, sometimes referred to
+as the residual quad-tree (RQT).
+
.. option:: --tu-intra-depth <1..4>
The transform unit (residual) quad-tree begins with the same depth
@@ -508,143 +639,40 @@ Temporal / motion search options
| 7 | 2 | 8 | 2 | 8 | true |
+----+------------+-----------+------------+-----------+-----------+
-.. option:: --merange <integer>
-
- Motion search range. Default 57
-
- The default is derived from the default CTU size (64) minus the luma
- interpolation half-length (4) minus maximum subpel distance (2)
- minus one extra pixel just in case the hex search method is used. If
- the search range were any larger than this, another CTU row of
- latency would be required for reference frames.
-
- **Range of values:** an integer from 0 to 32768
-
-.. option:: --max-merge <1..5>
-
- Maximum number of neighbor (spatial and temporal) candidate blocks
- that the encoder may consider for merging motion predictions. If a
- merge candidate results in no residual, it is immediately selected
- as a "skip". Otherwise the merge candidates are tested as part of
- motion estimation when searching for the least cost inter option.
- The max candidate number is encoded in the SPS and determines the
- bit cost of signaling merge CUs. Default 2
-
-.. option:: --temporal-mvp, --no-temporal-mvp
-
- Enable temporal motion vector predictors in P and B slices.
- This enables the use of the motion vector from the collocated block
- in the previous frame to be used as a predictor. Default is enabled
-
-Spatial/intra options
-=====================
-
-.. option:: --rdpenalty <0..2>
-
- When set to 1, transform units of size 32x32 are given a 4x bit cost
- penalty compared to smaller transform units, in intra coded CUs in P
- or B slices.
-
- When set to 2, transform units of size 32x32 are not even attempted,
- unless otherwise required by the maximum recursion depth. For this
- option to be effective with 32x32 intra CUs,
- :option:`--tu-intra-depth` must be at least 2. For it to be
- effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
- at least 3.
-
- Note that in HEVC an intra transform unit (a block of the residual
- quad-tree) is also a prediction unit, meaning that the intra
- prediction signal is generated for each TU block, the residual
- subtracted and then coded. The coding unit simply provides the
- prediction modes that will be used when predicting all of the
- transform units within the CU. This means that when you prevent
- 32x32 intra transform units, you are preventing 32x32 intra
- predictions.
-
- Default 0, disabled.
-
- **Values:** 0:disabled 1:4x cost penalty 2:force splits
-
-.. option:: --b-intra, --no-b-intra
-
- Enables the evaluation of intra modes in B slices. Default disabled.
-
-.. option:: --tskip, --no-tskip
-
- Enable evaluation of transform skip (bypass DCT but still use
- quantization) coding for 4x4 TU coded blocks.
-
- Only effective at RD levels 3 and above, which perform RDO mode
- decisions. Default disabled
-
-.. option:: --tskip-fast, --no-tskip-fast
-
- Only evaluate transform skip for NxN intra predictions (4x4 blocks).
- Only applicable if transform skip is enabled. For chroma, only
- evaluate if luma used tskip. Inter block tskip analysis is
- unmodified. Default disabled
-
-.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing
-
- Enable strong intra smoothing for 32x32 intra blocks. Default enabled
-
-.. option:: --constrained-intra, --no-constrained-intra
-
- Constrained intra prediction. When generating intra predictions for
- blocks in inter slices, only intra-coded reference pixels are used.
- Inter-coded reference pixels are replaced with intra-coded neighbor
- pixels or default values. The general idea is to block the
- propagation of reference errors that may have resulted from lossy
- signals. Default disabled
-
-Mode decision / Analysis
-========================
-
-.. option:: --rect, --no-rect
-
- Enable analysis of rectangular motion partitions Nx2N and 2NxN
- (50/50 splits, two directions). Default disabled
-
-.. option:: --amp, --no-amp
-
- Enable analysis of asymmetric motion partitions (75/25 splits, four
- directions). At RD levels 0 through 4, AMP partitions are only
- considered at CU sizes 32x32 and below. At RD levels 5 and 6, it
- will only consider AMP partitions as merge candidates (no motion
- search) at 64x64, and as merge or inter candidates below 64x64.
+ At --subme values larger than 2, chroma residual cost is included
+ in all subpel refinement steps and chroma residual is included in
+ all motion estimation decisions (selecting the best reference
+ picture in each list, and choosing between merge, uni-directional
+ motion and bi-directional motion). The 'slow' preset is the first
+ preset to enable the use of chroma residual.
- The AMP partitions which are searched are derived from the current
- best inter partition. If Nx2N (vertical rectangular) is the best
- current prediction, then left and right asymmetrical splits will be
- evaluated. If 2NxN (horizontal rectangular) is the best current
- prediction, then top and bottom asymmetrical splits will be
- evaluated, If 2Nx2N is the best prediction, and the block is not a
- merge/skip, then all four AMP partitions are evaluated.
+.. option:: --merange <integer>
- This setting has no effect if rectangular partitions are disabled.
- Default disabled
+ Motion search range. Default 57
-.. option:: --early-skip, --no-early-skip
+ The default is derived from the default CTU size (64) minus the luma
+ interpolation half-length (4) minus maximum subpel distance (2)
+ minus one extra pixel just in case the hex search method is used. If
+ the search range were any larger than this, another CTU row of
+ latency would be required for reference frames.
- Measure full CU size (2Nx2N) merge candidates first; if no residual
- is found the analysis is short circuited. Default disabled
+ **Range of values:** an integer from 0 to 32768
-.. option:: --fast-cbf, --no-fast-cbf
+.. option:: --max-merge <1..5>
- Short circuit analysis if a prediction is found that does not set
- the coded block flag (aka: no residual was encoded). It prevents
- the encoder from perhaps finding other predictions that also have no
- residual but require less signaling bits or have less distortion.
- Only applicable for RD levels 5 and 6. Default disabled
+ Maximum number of neighbor (spatial and temporal) candidate blocks
+ that the encoder may consider for merging motion predictions. If a
+ merge candidate results in no residual, it is immediately selected
+ as a "skip". Otherwise the merge candidates are tested as part of
+ motion estimation when searching for the least cost inter option.
+ The max candidate number is encoded in the SPS and determines the
+ bit cost of signaling merge CUs. Default 2
-.. option:: --fast-intra, --no-fast-intra
+.. option:: --temporal-mvp, --no-temporal-mvp
- Perform an initial scan of every fifth intra angular mode, then
- check modes +/- 2 distance from the best mode, then +/- 1 distance
- from the best mode, effectively performing a gradient descent. When
- enabled 10 modes in total are checked. When disabled all 33 angular
- modes are checked. Only applicable for :option:`--rd` levels 3 and
- below (medium preset and faster).
+ Enable temporal motion vector predictors in P and B slices.
+ This enables the use of the motion vector from the collocated block
+ in the previous frame to be used as a predictor. Default is enabled
.. option:: --weightp, -w, --no-weightp
@@ -660,54 +688,48 @@ Mode decision / Analysis
Enable weighted prediction in B slices. Default disabled
-.. option:: --rd <0..6>
+Spatial/intra options
+=====================
- Level of RDO in mode decision. The higher the value, the more
- exhaustive the analysis and the more rate distortion optimization is
- used. The lower the value the faster the encode, the higher the
- value the smaller the bitstream (in general). Default 3
+.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing
- Note that this table aims for accuracy, but is not necessarily our
- final target behavior for each mode.
+ Enable strong intra smoothing for 32x32 intra blocks. Default enabled
- +-------+---------------------------------------------------------------+
- | Level | Description |
- +=======+===============================================================+
- | 0 | sa8d mode and split decisions, intra w/ source pixels |
- +-------+---------------------------------------------------------------+
- | 1 | recon generated (better intra), RDO merge/skip selection |
- +-------+---------------------------------------------------------------+
- | 2 | RDO splits and merge/skip selection |
- +-------+---------------------------------------------------------------+
- | 3 | RDO mode and split decisions |
- +-------+---------------------------------------------------------------+
- | 4 | Adds RDO Quant |
- +-------+---------------------------------------------------------------+
- | 5 | Adds RDO prediction decisions |
- +-------+---------------------------------------------------------------+
- | 6 | Currently same as 5 |
- +-------+---------------------------------------------------------------+
+.. option:: --constrained-intra, --no-constrained-intra
- **Range of values:** 0: least .. 6: full RDO analysis
+ Constrained intra prediction. When generating intra predictions for
+ blocks in inter slices, only intra-coded reference pixels are used.
+ Inter-coded reference pixels are replaced with intra-coded neighbor
+ pixels or default values. The general idea is to block the
+ propagation of reference errors that may have resulted from lossy
+ signals. Default disabled
-.. option:: --cu-lossless, --no-cu-lossless
+.. option:: --rdpenalty <0..2>
- For each CU, evaluate lossless (transform and quant bypass) encode
- of the best non-lossless mode option as a potential rate distortion
- optimization. If the global option :option:`--lossless` has been
- specified, all CUs will be encoded as lossless unconditionally
- regardless of whether this option was enabled. Default disabled.
+ When set to 1, transform units of size 32x32 are given a 4x bit cost
+ penalty compared to smaller transform units, in intra coded CUs in P
+ or B slices.
- Only effective at RD levels 3 and above, which perform RDO mode
- decisions.
+ When set to 2, transform units of size 32x32 are not even attempted,
+ unless otherwise required by the maximum recursion depth. For this
+ option to be effective with 32x32 intra CUs,
+ :option:`--tu-intra-depth` must be at least 2. For it to be
+ effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
+ at least 3.
-.. option:: --signhide, --no-signhide
+ Note that in HEVC an intra transform unit (a block of the residual
+ quad-tree) is also a prediction unit, meaning that the intra
+ prediction signal is generated for each TU block, the residual
+ subtracted and then coded. The coding unit simply provides the
+ prediction modes that will be used when predicting all of the
+ transform units within the CU. This means that when you prevent
+ 32x32 intra transform units, you are preventing 32x32 intra
+ predictions.
+
+ Default 0, disabled.
+
+ **Values:** 0:disabled 1:4x cost penalty 2:force splits
- Hide sign bit of one coeff per TU (rdo). The last sign is implied.
- This requires analyzing all the coefficients to determine if a sign
- must be toggled, and then to determine which one can be toggled with
- the least amount of distortion. Default enabled
-
Psycho-visual options
=====================
@@ -753,16 +775,24 @@ quality and begin introducing artifacts and increase bitrate, which may
force rate control to increase global QP. Finding the optimal
psycho-visual parameters for a given video requires experimentation. Our
recommended defaults (1.0 for both) are generally on the low end of the
-spectrum. And generally the lower the bitrate, the lower the optimal
-psycho-visual settings.
+spectrum.
+
+The lower the bitrate, the lower the optimal psycho-visual settings. If
+the bitrate is too low for the psycho-visual settings, you will begin to
+see temporal artifacts (motion judder). This is caused when the encoder
+is forced to code skip blocks (no residual) in areas of difficult motion
+because it is the best option psycho-visually (they have great amounts
+of energy and no residual cost). One can lower psy-rd settings when
+judder is happening, and allow the encoder to use some blur in these
+areas of high motion.
.. option:: --psy-rd <float>
Influence rate distortion optimizated mode decision to preserve the
energy of the source image in the encoded image at the expense of
compression efficiency. It only has effect on presets which use
- RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
- typical value. Default disabled. Experimental
+ RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
+ typical value. Default 0.3
**Range of values:** 0 .. 2.0
@@ -772,9 +802,9 @@ psycho-visual settings.
energy in the reconstructed image. This generally improves perceived
visual quality at the cost of lower quality metric scores. It only
has effect on slower presets which use RDO Quantization
- (:option:`--rd` 4, 5 and 6). 1.0 is a typical value. Default
- disabled. High values can be beneficial in preserving high-frequency
- detail like film grain. Experimental
+ (:option:`--rd` 4, 5 and 6). 1.0 is a typical value. High values can
+ be beneficial in preserving high-frequency detail like film grain.
+ Default: 1.0
**Range of values:** 0 .. 50.0
@@ -880,9 +910,7 @@ Quality, rate control and rate distortion options
.. option:: --crf-min <0..51.0>
Specify an lower limit to the rate factor which may be assigned to
- any given frame (ensuring a min QP). This is dangerous when CRF is
- used in combination with VBV as it may result in buffer underruns.
- Default disabled
+ any given frame (ensuring a min compression factor).
.. option:: --vbv-bufsize <integer>
@@ -904,8 +932,8 @@ Quality, rate control and rate distortion options
between 0 and 1, or in kbits. In other words these two option pairs
are equivalent::
- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900
- :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9
+ --vbv-bufsize 1000 --vbv-init 900
+ --vbv-bufsize 1000 --vbv-init 0.9
Default 0.9
@@ -923,18 +951,6 @@ Quality, rate control and rate distortion options
**Range of values:** an integer from 0 to 51
-.. option:: --ipratio <float>
-
- QP ratio factor between I and P slices. This ratio is used in all of
- the rate control modes. Some :option:`--tune` options may change the
- default value. It is not typically manually specified. Default 1.4
-
-.. option:: --pbratio <float>
-
- QP ratio factor between P and B slices. This ratio is used in all of
- the rate control modes. Some :option:`--tune` options may change the
- default value. It is not typically manually specified. Default 1.3
-
.. option:: --lossless, --no-lossless
Enables true lossless coding by bypassing scaling, transform,
@@ -954,8 +970,8 @@ Quality, rate control and rate distortion options
and not enough in flat areas.
0. disabled
- 1. AQ enabled
- 2. AQ enabled with auto-variance **(default)**
+ 1. AQ enabled **(default)**
+ 2. AQ enabled with auto-variance
.. option:: --aq-strength <float>
@@ -974,25 +990,23 @@ Quality, rate control and rate distortion options
less bits. This tends to improve detail in the backgrounds of video
with less detail in areas of high motion. Default enabled
-.. option:: --cbqpoffs <integer>
-
- Offset of Cb chroma QP from the luma QP selected by rate control.
- This is a general way to spend more or less bits on the chroma
- channel. Default 0
-
- **Range of values:** -12 to 12
+.. option:: --nr-intra <integer>, --nr-inter <integer>
-.. option:: --crqpoffs <integer>
+ Noise reduction - an adaptive deadzone applied after DCT
+ (subtracting from DCT coefficients), before quantization. It does
+ no pixel-level filtering, doesn't cross DCT block boundaries, has no
+ overlap. The higher the strength value parameter, the more
+ aggressively it will reduce noise.
- Offset of Cr chroma QP from the luma QP selected by rate control.
- This is a general way to spend more or less bits on the chroma
- channel. Default 0
+ Enabling noise reduction will make outputs diverge between different
+ numbers of frame threads. Outputs will be deterministic but the
+ outputs of -F2 will no longer match the outputs of -F3, etc.
- **Range of values:** -12 to 12
+ **Values:** any value in range of 0 to 2000. Default 0 (disabled).
.. option:: --pass <integer>
- Enable multipass rate control mode. Input is encoded multiple times,
+ Enable multi-pass rate control mode. Input is encoded multiple times,
storing the encoded information of each pass in a stats file from which
the consecutive pass tunes the qp of each frame to improve the quality
of the output. Default disabled
@@ -1003,12 +1017,17 @@ Quality, rate control and rate distortion options
**Range of values:** 1 to 3
+.. option:: --stats <filename>
+
+ Specify file name of the multi-pass stats file. If unspecified
+ the encoder will use x265_2pass.log
+
.. option:: --slow-firstpass, --no-slow-firstpass
- Enable a slow and more detailed first pass encode in Multipass rate
+ Enable a slow and more detailed first pass encode in multi-pass rate
control mode. Speed of the first pass encode is slightly lesser and
quality midly improved when compared to the default settings in a
- multipass encode. Default disabled (turbo mode enabled)
+ multi-pass encode. Default disabled (turbo mode enabled)
When **turbo** first pass is not disabled, these options are
set on the first pass to improve performance:
@@ -1023,30 +1042,165 @@ Quality, rate control and rate distortion options
* :option:`--subme` = MIN(2, :option:`--subme`)
* :option:`--rd` = MIN(2, :option:`--rd`)
-.. option:: --analysis-mode <string|int>
+.. option:: --strict-cbr, --no-strict-cbr
+
+ Enables stricter conditions to control bitrate deviance from the
+ target bitrate in CBR mode. Bitrate adherence is prioritised
+ over quality. Rate tolerance is reduced to 50%. Default disabled.
+
+ This option is for use-cases which require the final average bitrate
+ to be within very strict limits of the target - preventing overshoots
+ completely, and achieve bitrates within 5% of target bitrate,
+ especially in short segment encodes. Typically, the encoder stays
+ conservative, waiting until there is enough feedback in terms of
+ encoded frames to control QP. strict-cbr allows the encoder to be
+ more aggressive in hitting the target bitrate even for short segment
+ videos. Experimental.
+
+.. option:: --cbqpoffs <integer>
- Specify whether analysis information of each frame is output by encoder
- or input for reuse. By reading the analysis data writen by an
- earlier encode of the same sequence, substantial redundant work may
- be avoided.
+ Offset of Cb chroma QP from the luma QP selected by rate control.
+ This is a general way to spend more or less bits on the chroma
+ channel. Default 0
- The following data may be stored and reused:
- I frames - split decisions and luma intra directions of all CUs.
- P/B frames - motion vectors are dumped at each depth for all CUs.
+ **Range of values:** -12 to 12
- **Values:** off(0), save(1): dump analysis data, load(2): read analysis data
+.. option:: --crqpoffs <integer>
-.. option:: --analysis-file <filename>
+ Offset of Cr chroma QP from the luma QP selected by rate control.
+ This is a general way to spend more or less bits on the chroma
+ channel. Default 0
- Specify a filename for analysis data (see :option:`--analysis-mode`)
- If no filename is specified, x265_analysis.dat is used.
+ **Range of values:** -12 to 12
+
+.. option:: --ipratio <float>
+
+ QP ratio factor between I and P slices. This ratio is used in all of
+ the rate control modes. Some :option:`--tune` options may change the
+ default value. It is not typically manually specified. Default 1.4
+
+.. option:: --pbratio <float>
+
+ QP ratio factor between P and B slices. This ratio is used in all of
+ the rate control modes. Some :option:`--tune` options may change the
+ default value. It is not typically manually specified. Default 1.3
+
+.. option:: --qcomp <float>
+
+ qComp sets the quantizer curve compression factor. It weights the
+ frame quantizer based on the complexity of residual (measured by
+ lookahead). Default value is 0.6. Increasing it to 1 will
+ effectively generate CQP
+
+.. option:: --qstep <integer>
+
+ The maximum single adjustment in QP allowed to rate control. Default
+ 4
+
+.. option:: --qblur <float>
+
+ Temporally blur quants. Default 0.5
+
+.. option:: --cplxblur <float>
+
+ Temporally blur complexity. Default 20
+
+.. option:: --zones <zone0>/<zone1>/...
+
+ Tweak the bitrate of regions of the video. Each zone takes the form:
+
+ <start frame>,<end frame>,<option> where <option> is either q=<integer>
+ (force QP) or b=<float> (bitrate multiplier).
+
+ If zones overlap, whichever comes later in the list takes precedence.
+ Default none
+
+Quantization Options
+====================
+
+Note that rate-distortion optimized quantization (RDOQ) is enabled
+implicitly at :option:`--rd` 4, 5, and 6 and disabled implicitly at all
+other levels.
+
+.. option:: --signhide, --no-signhide
+
+ Hide sign bit of one coeff per TU (rdo). The last sign is implied.
+ This requires analyzing all the coefficients to determine if a sign
+ must be toggled, and then to determine which one can be toggled with
+ the least amount of distortion. Default enabled
+
+.. option:: --qpfile <filename>
+
+ Specify a text file which contains frametypes and QPs for some or
+ all frames. The format of each line is:
+
+ framenumber frametype QP
+
+ Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
+ **b** is an unreferenced B frame. **I** is a keyframe (random
+ access point) while **i** is an I frame that is not a keyframe
+ (references are not broken).
+
+ Specifying QP (integer) is optional, and if specified they are
+ clamped within the encoder to qpmin/qpmax.
+
+.. option:: --scaling-list <filename>
+
+ Quantization scaling lists. HEVC supports 6 quantization scaling
+ lists to be defined; one each for Y, Cb, Cr for intra prediction and
+ one each for inter prediction.
+
+ x265 does not use scaling lists by default, but this can also be
+ made explicit by :option:`--scaling-list` *off*.
+
+ HEVC specifies a default set of scaling lists which may be enabled
+ without requiring them to be signaled in the SPS. Those scaling
+ lists can be enabled via :option:`--scaling-list` *default*.
+
+ All other strings indicate a filename containing custom scaling
+ lists in the HM format. The encode will abort if the file is not
+ parsed correctly. Custom lists must be signaled in the SPS
+
+.. option:: --lambda-file <filename>
+
+ Specify a text file containing values for x265_lambda_tab and
+ x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
+ values.
+
+ The text file syntax is simple. Comma is considered to be
+ white-space. All white-space is ignored. Lines must be less than 2k
+ bytes in length. Content following hash (#) characters are ignored.
+ The values read from the file are logged at :option:`--log-level`
+ debug.
+
+ Note that the lambda tables are process-global and so the new values
+ affect all encoders running in the same process.
+
+ Lambda values affect encoder mode decisions, the lower the lambda
+ the more bits it will try to spend on signaling information (motion
+ vectors and splits) and less on residual. This feature is intended
+ for experimentation.
Loop filters
============
-.. option:: --lft, --no-lft
+.. option:: --deblock=<int>:<int>, --no-deblock
+
+ Toggle deblocking loop filter, optionally specify deblocking
+ strength offsets.
+
+ <int>:<int> - parsed as tC offset and Beta offset
+ <int>,<int> - parsed as tC offset and Beta offset
+ <int> - both tC and Beta offsets assigned the same value
+
+ If unspecified, the offsets default to 0. The offsets must be in a
+ range of -6 (lowest strength) to 6 (highest strength).
+
+ To disable the deblocking filter entirely, use --no-deblock or
+ --deblock=false. Default enabled, with both offsets defaulting to 0
- Toggle deblocking loop filter, default enabled
+ If deblocking is disabled, or the offsets are non-zero, these
+ changes from the default configuration are signaled in the PPS.
.. option:: --sao, --no-sao
@@ -1172,7 +1326,7 @@ VUI fields must be manually specified.
9. bt2020nc
10. bt2020c
-.. option:: --chromalocs <0..5>
+.. option:: --chromaloc <0..5>
Specify chroma sample location for 4:2:0 inputs. Consult the HEVC
specification for a description of these values. Default undefined
@@ -1206,7 +1360,7 @@ Bitstream options
.. option:: --aud, --no-aud
Emit an access unit delimiter NAL at the start of each slice access
- unit. If option:`--repeat-headers` is not enabled (indicating the
+ unit. If :option:`--repeat-headers` is not enabled (indicating the
user will be writing headers manually at the start of the stream)
the very first AUD will be skipped since it cannot be placed at the
start of the access unit, where it belongs. Default disabled
diff --git a/doc/reST/conf.py b/doc/reST/conf.py
index 561f7d0..eea837f 100644
--- a/doc/reST/conf.py
+++ b/doc/reST/conf.py
@@ -15,3 +15,12 @@ copyright = u'2014 MulticoreWare Inc'
# -- Options for HTML output ---------------------------------------------------
html_theme = "default"
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'libx265', 'Full x265 Documentation',
+ ['MulticoreWare Inc'], 3),
+ ('x265', 'x265', 'x265 CLI Documentation',
+ ['MulticoreWare Inc'], 1)
+]
diff --git a/doc/reST/introduction.rst b/doc/reST/introduction.rst
index 1d953f4..c503946 100644
--- a/doc/reST/introduction.rst
+++ b/doc/reST/introduction.rst
@@ -75,7 +75,7 @@ responsible for understanding the laws in your country, and for
licensing all applicable patent rights needed for use or distribution of
software applications created from the x265 source code. A good place
to start is with the `Motion Picture Experts Group - Licensing Authority
-- HEVC Licensing Program<http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
+- HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is
a trademark of MulticoreWare, and may only be used with explicit written
diff --git a/doc/reST/presets.rst b/doc/reST/presets.rst
index 99085a2..7e236fa 100644
--- a/doc/reST/presets.rst
+++ b/doc/reST/presets.rst
@@ -1,11 +1,11 @@
Preset Options
--------------
+.. _presets:
+
Presets
=======
-.. _preset-tune-ref:
-
x265 has a number of predefined :option:`--preset` options that make
trade-offs between encode speed (encoded frames per second) and
compression efficiency (quality per bit in the bitstream). The default
@@ -66,8 +66,6 @@ The presets adjust encoder parameters to affect these trade-offs.
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
| rdLevel | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 6 | 6 | 6 |
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| lft | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
| tu-intra | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 |
+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
| tu-inter | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 |
@@ -75,6 +73,8 @@ The presets adjust encoder parameters to affect these trade-offs.
Placebo mode enables transform-skip prediction evaluation.
+.. _tunings:
+
Tuning
======
@@ -97,7 +97,87 @@ after the preset.
+--------------+-----------------------------------------------------+
| ssim | enables adaptive quant auto-mode, disables psy-rd |
+--------------+-----------------------------------------------------+
+| grain | improves retention of film grain. more below |
++--------------+-----------------------------------------------------+
| fastdecode | no loop filters, no weighted pred, no intra in B |
+--------------+-----------------------------------------------------+
| zerolatency | no lookahead, no B frames, no cutree |
+--------------+-----------------------------------------------------+
+
+
+
+Film Grain Retention
+~~~~~~~~~~~~~~~~~~~~
+
+:option:`--tune` *grain* tries to improve the retention of film grain in
+the reconstructed output. It helps rate distortion optimizations select
+modes which preserve high frequency noise:
+
+ * :option:`--psy-rd` 0.5
+ * :option:`--psy-rdoq` 30
+
+.. Note::
+
+ --psy-rdoq is only effective when RDOQuant is enabled, which is at
+ RD levels 4, 5, and 6 (presets slow and below).
+
+It lowers the strength of adaptive quantization, so residual energy can
+be more evenly distributed across the (noisy) picture:
+
+ * :option:`--aq-mode` 1
+ * :option:`--aq-strength` 0.3
+
+And it similarly tunes rate control to prevent the slice QP from
+swinging too wildly from frame to frame:
+
+ * :option:`--ipratio` 1.1
+ * :option:`--pbratio` 1.1
+ * :option:`--qcomp` 0.8
+
+And lastly it reduces the strength of deblocking to prevent grain being
+blurred on block boundaries:
+
+ * :option:`--deblock` -2
+
+Fast Decode
+~~~~~~~~~~~
+
+:option:`--tune` *fastdecode* disables encoder features which tend to be
+bottlenecks for the decoder. It is intended for use with 4K content at
+high bitrates which can cause decoders to struggle. It disables both
+HEVC loop filters, which tend to be process bottlenecks:
+
+ * :option:`--no-deblock`
+ * :option:`--no-sao`
+
+It disables weighted prediction, which tend to be bandwidth bottlenecks:
+
+ * :option:`--no-weightp`
+ * :option:`--no-weightb`
+
+And it disables intra blocks in B frames with :option:`--no-b-intra`
+since intra predicted blocks cause serial dependencies in the decoder.
+
+Zero Latency
+~~~~~~~~~~~~
+
+There are two halves to the latency problem. There is latency at the
+decoder and latency at the encoder. :option:`--tune` *zerolatency*
+removes latency from both sides. The decoder latency is removed by:
+
+ * :option:`--bframes` 0
+
+Encoder latency is removed by:
+
+ * :option:`--b-adapt` 0
+ * :option:`--rc-lookahead` 0
+ * :option:`--no-scenecut`
+ * :option:`--no-cutree`
+ * :option:`--frame-threads` 1
+
+With all of these settings x265_encoder_encode() will run synchronously,
+the picture passed as pic_in will be encoded and returned as NALs. These
+settings disable frame parallelism, which is an important component for
+x265 performance. If you can tolerate any latency on the encoder, you
+can increase performance by increasing the number of frame threads. Each
+additional frame thread adds one frame of latency.
diff --git a/doc/reST/threading.rst b/doc/reST/threading.rst
index cbb851d..e9a4dca 100644
--- a/doc/reST/threading.rst
+++ b/doc/reST/threading.rst
@@ -172,7 +172,7 @@ count, but may be manually specified via :option:`--frame-threads`
+-------+--------+
| Cores | Frames |
+=======+========+
- | > 32 | 6 |
+ | > 32 | 6..8 |
+-------+--------+
| >= 16 | 5 |
+-------+--------+
diff --git a/doc/reST/x265.rst b/doc/reST/x265.rst
new file mode 100644
index 0000000..32a416d
--- /dev/null
+++ b/doc/reST/x265.rst
@@ -0,0 +1,49 @@
+x265 CLI Documentation
+######################
+
+
+SYNOPSIS
+========
+
+**x265** [options] infile [-o] outfile
+
+Bit depth: 8
+
+
+**x265-10bit** [options] infile [-o] outfile
+
+Bit depth: 10
+
+
+infile can be YUV or Y4M
+
+outfile is raw HEVC bitstream
+
+
+DESCRIPTION
+===========
+
+.. toctree::
+ :maxdepth: 2
+
+ introduction
+
+
+OPTIONS
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ cli
+ presets
+ lossless
+
+
+SEE ALSO
+========
+
+**libx265**\(3)
+
+Online documentation: http://x265.readthedocs.org/en/default/cli.html
+
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index ba63f81..4f52cf7 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -21,7 +21,7 @@ include(CheckSymbolExists)
include(CheckCXXCompilerFlag)
# X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 35)
+set(X265_BUILD 43)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -56,13 +56,19 @@ else()
endif()
if(UNIX)
- SET(PLATFORM_LIBS pthread)
+ list(APPEND PLATFORM_LIBS pthread)
find_library(LIBRT rt)
if(LIBRT)
- set(PLATFORM_LIBS ${PLATFORM_LIBS} rt)
+ list(APPEND PLATFORM_LIBS rt)
endif()
endif(UNIX)
+if(X64 AND NOT WIN32)
+ option(ENABLE_PIC "Enable Position Independent Code" ON)
+else()
+ option(ENABLE_PIC "Enable Position Independent Code" OFF)
+endif(X64 AND NOT WIN32)
+
# Compiler detection
if(CMAKE_GENERATOR STREQUAL "Xcode")
set(XCODE 1)
@@ -121,9 +127,9 @@ endif()
if(GCC)
add_definitions(-Wall -Wextra -Wshadow)
add_definitions(-D__STDC_LIMIT_MACROS=1)
- if(X64 AND NOT WIN32)
- add_definitions(-fPIC)
- endif(X64 AND NOT WIN32)
+ if(ENABLE_PIC)
+ add_definitions(-fPIC)
+ endif(ENABLE_PIC)
if(X86 AND NOT X64)
add_definitions(-march=i686)
endif()
@@ -190,24 +196,13 @@ if(WARNINGS_AS_ERRORS)
endif()
endif(WARNINGS_AS_ERRORS)
-
-option(ENABLE_PPA "Enable PPA profiling instrumentation" OFF)
-if(ENABLE_PPA)
- add_definitions(-DENABLE_PPA)
- add_subdirectory(PPA)
- SET(PLATFORM_LIBS ${PLATFORM_LIBS} PPA)
- if(UNIX)
- SET(PLATFORM_LIBS ${PLATFORM_LIBS} dl)
- endif(UNIX)
-endif(ENABLE_PPA)
-
if (WIN32)
# Visual leak detector
find_package(VLD QUIET)
if(VLD_FOUND)
add_definitions(-DHAVE_VLD)
include_directories(${VLD_INCLUDE_DIRS})
- set(PLATFORM_LIBS ${PLATFORM_LIBS} ${VLD_LIBRARIES})
+ list(APPEND PLATFORM_LIBS ${VLD_LIBRARIES})
link_directories(${VLD_LIBRARY_DIRS})
endif()
option(WINXP_SUPPORT "Make binaries compatible with Windows XP" OFF)
@@ -220,6 +215,31 @@ endif()
include(version) # determine X265_VERSION and X265_LATEST_TAG
include_directories(. common encoder "${PROJECT_BINARY_DIR}")
+
+option(ENABLE_PPA "Enable PPA profiling instrumentation" OFF)
+if(ENABLE_PPA)
+ add_definitions(-DENABLE_PPA)
+ list(APPEND PLATFORM_LIBS PPA)
+ if(UNIX)
+ list(APPEND PLATFORM_LIBS dl)
+ endif(UNIX)
+ add_subdirectory(profile/PPA)
+endif(ENABLE_PPA)
+
+option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
+if(ENABLE_VTUNE)
+ add_definitions(-DENABLE_VTUNE)
+ include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
+ list(APPEND PLATFORM_LIBS vtune)
+ link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
+ if(WIN32)
+ list(APPEND PLATFORM_LIBS libittnotify.lib)
+ else()
+ list(APPEND PLATFORM_LIBS libittnotify.a dl)
+ endif()
+ add_subdirectory(profile/vtune)
+endif(ENABLE_VTUNE)
+
add_subdirectory(encoder)
add_subdirectory(common)
@@ -323,7 +343,7 @@ if(X265_LATEST_TAG)
# Produce a pkg-config file
configure_file("x265.pc.in" "x265.pc" @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/x265.pc"
- DESTINATION "${CMAKE_INSTALL_PREFIX}/${LIB_INSTALL_DIR}/pkgconfig")
+ DESTINATION "${LIB_INSTALL_DIR}/pkgconfig")
endif()
if(NOT WIN32)
diff --git a/source/PPA/CMakeLists.txt b/source/PPA/CMakeLists.txt
deleted file mode 100644
index de13ddf..0000000
--- a/source/PPA/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_library(PPA ppa.h ppaApi.h ppaCPUEvents.h ppa.cpp)
diff --git a/source/PPA/ppa.h b/source/PPA/ppa.h
deleted file mode 100644
index 42f43b8..0000000
--- a/source/PPA/ppa.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Steve Borho <steve at borho.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef _PPA_H_
-#define _PPA_H_
-
-#if !defined(ENABLE_PPA)
-
-#define PPA_INIT()
-#define PPAStartCpuEventFunc(e)
-#define PPAStopCpuEventFunc(e)
-#define PPAScopeEvent(e)
-
-#else
-
-/* declare enum list of users CPU events */
-#define PPA_REGISTER_CPU_EVENT(x) x,
-enum PPACpuEventEnum
-{
-#include "ppaCPUEvents.h"
- PPACpuGroupNums
-};
-
-#undef PPA_REGISTER_CPU_EVENT
-
-#define PPA_INIT() initializePPA()
-#define PPAStartCpuEventFunc(e) if (ppabase) ppabase->triggerStartEvent(ppabase->getEventId(e))
-#define PPAStopCpuEventFunc(e) if (ppabase) ppabase->triggerEndEvent(ppabase->getEventId(e))
-#define PPAScopeEvent(e) _PPAScope __scope_(e)
-
-#include "ppaApi.h"
-
-void initializePPA();
-extern ppa::Base *ppabase;
-
-class _PPAScope
-{
-protected:
-
- ppa::EventID m_id;
-
-public:
-
- _PPAScope(int e) { if (ppabase) { m_id = ppabase->getEventId(e); ppabase->triggerStartEvent(m_id); } else m_id = 0; }
-
- ~_PPAScope() { if (ppabase) ppabase->triggerEndEvent(m_id); }
-};
-
-#endif // if !defined(ENABLE_PPA)
-
-#endif /* _PPA_H_ */
diff --git a/source/PPA/ppaCPUEvents.h b/source/PPA/ppaCPUEvents.h
deleted file mode 100644
index 1a47b39..0000000
--- a/source/PPA/ppaCPUEvents.h
+++ /dev/null
@@ -1,25 +0,0 @@
-PPA_REGISTER_CPU_EVENT(encode_block)
-PPA_REGISTER_CPU_EVENT(bitstream_write)
-PPA_REGISTER_CPU_EVENT(DPB_prepareEncode)
-PPA_REGISTER_CPU_EVENT(FrameEncoder_compressFrame)
-PPA_REGISTER_CPU_EVENT(FrameEncoder_compressRows)
-PPA_REGISTER_CPU_EVENT(CompressCU)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth1)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth2)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth3)
-PPA_REGISTER_CPU_EVENT(CompressCU_Depth4)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth1)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth2)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth3)
-PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth4)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth1)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth2)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth3)
-PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth4)
-PPA_REGISTER_CPU_EVENT(CalcRDCostIntra)
-PPA_REGISTER_CPU_EVENT(Thread_ProcessRow)
-PPA_REGISTER_CPU_EVENT(Thread_compressCU)
-PPA_REGISTER_CPU_EVENT(Thread_encodeCU)
-PPA_REGISTER_CPU_EVENT(Thread_filterCU)
diff --git a/source/cmake/CMakeASM_YASMInformation.cmake b/source/cmake/CMakeASM_YASMInformation.cmake
index 0af7c24..fb953ee 100644
--- a/source/cmake/CMakeASM_YASMInformation.cmake
+++ b/source/cmake/CMakeASM_YASMInformation.cmake
@@ -2,7 +2,10 @@ set(ASM_DIALECT "_YASM")
set(CMAKE_ASM${ASM_DIALECT}_SOURCE_FILE_EXTENSIONS asm)
if(X64)
- list(APPEND ASM_FLAGS -DARCH_X86_64=1 -DPIC)
+ list(APPEND ASM_FLAGS -DARCH_X86_64=1)
+ if(ENABLE_PIC)
+ list(APPEND ASM_FLAGS -DPIC)
+ endif()
if(APPLE)
set(ARGS -f macho64 -m amd64 -DPREFIX)
elseif(UNIX AND NOT CYGWIN)
@@ -32,6 +35,19 @@ if(HIGH_BIT_DEPTH)
else()
list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8)
endif()
+
+list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS}")
+
+if(CMAKE_BUILD_TYPE MATCHES Release)
+ list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_RELEASE}")
+elseif(CMAKE_BUILD_TYPE MATCHES Debug)
+ list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_DEBUG}")
+elseif(CMAKE_BUILD_TYPE MATCHES MinSizeRel)
+ list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_MINSIZEREL}")
+elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo)
+ list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_RELWITHDEBINFO}")
+endif()
+
set(YASM_FLAGS ${ARGS} ${ASM_FLAGS} PARENT_SCOPE)
string(REPLACE ";" " " CMAKE_ASM_YASM_COMPILER_ARG1 "${ARGS}")
diff --git a/source/cmake/FindVLD.cmake b/source/cmake/FindVLD.cmake
index 716625c..ece8bae 100644
--- a/source/cmake/FindVLD.cmake
+++ b/source/cmake/FindVLD.cmake
@@ -54,11 +54,14 @@ ELSEIF (CMAKE_SIZEOF_VOID_P EQUAL 8)
LIST (APPEND _VLD_POSSIBLE_LIB_SUFFIXES lib/Win64)
ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4)
+SET (PFILES "ProgramFiles")
+SET (PFILES_X86 "ProgramFiles(x86)") # hack to avoid escaping issues in cmake 3.1
+
FIND_PATH (VLD_ROOT_DIR
NAMES include/vld.h
PATHS ENV VLDROOT
- "$ENV{PROGRAMFILES}/Visual Leak Detector"
- "$ENV{PROGRAMFILES(X86)}/Visual Leak Detector"
+ "$ENV{PFILES}/Visual Leak Detector"
+ "$ENV{PFILES_X86}/Visual Leak Detector"
"[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]"
"[HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]"
DOC "VLD root directory")
diff --git a/source/cmake/version.cmake b/source/cmake/version.cmake
index b6adfb9..e89e42c 100644
--- a/source/cmake/version.cmake
+++ b/source/cmake/version.cmake
@@ -22,7 +22,7 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
set(hg_${key} ${value})
endforeach()
if(DEFINED hg_tag)
- set(X265_VERSION ${hg_tag} CACHE STRING "x265 version string.")
+ set(X265_VERSION ${hg_tag})
set(X265_LATEST_TAG ${hg_tag})
set(X265_TAG_DISTANCE "0")
elseif(DEFINED hg_node)
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 46929ca..4ead346 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -1,44 +1,46 @@
# vim: syntax=cmake
-set(SSE3 vec/dct-sse3.cpp)
-set(SSSE3 vec/dct-ssse3.cpp)
-set(SSE41 vec/dct-sse41.cpp)
-if(MSVC AND X86)
- set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
- set(WARNDISABLE "/wd4100") # unreferenced formal parameter
- if(INTEL_CXX)
- add_definitions(/Qwd111) # statement is unreachable
- add_definitions(/Qwd128) # loop is unreachable
- add_definitions(/Qwd177) # declared function is unused
- add_definitions(/Qwd185) # dynamic initialization in unreachable code
- add_definitions(/Qwd280) # conditional expression is constant
- endif()
- if(X64)
- set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}")
- else()
- # x64 implies SSE4, so only add /arch:SSE2 if building for Win32
- set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
- endif()
-endif()
-if(GCC AND X86)
- if(CLANG)
- # llvm intrinsic headers cause shadow warnings
- set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
- else()
- set(WARNDISABLE "-Wno-unused-parameter")
- endif()
- if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3))
+if(ENABLE_ASSEMBLY)
+ set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+
+ set(SSE3 vec/dct-sse3.cpp)
+ set(SSSE3 vec/dct-ssse3.cpp)
+ set(SSE41 vec/dct-sse41.cpp)
+
+ if(MSVC AND X86)
set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
- set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3")
- set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3")
- set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1")
+ set(WARNDISABLE "/wd4100") # unreferenced formal parameter
+ if(INTEL_CXX)
+ add_definitions(/Qwd111) # statement is unreachable
+ add_definitions(/Qwd128) # loop is unreachable
+ add_definitions(/Qwd177) # declared function is unused
+ add_definitions(/Qwd185) # dynamic initialization in unreachable code
+ add_definitions(/Qwd280) # conditional expression is constant
+ endif()
+ if(X64)
+ set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}")
+ else()
+ # x64 implies SSE4, so only add /arch:SSE2 if building for Win32
+ set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
+ endif()
endif()
-endif()
-set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
-source_group(Intrinsics FILES ${VEC_PRIMITIVES})
+ if(GCC AND X86)
+ if(CLANG)
+ # llvm intrinsic headers cause shadow warnings
+ set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
+ else()
+ set(WARNDISABLE "-Wno-unused-parameter")
+ endif()
+ if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3))
+ set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
+ set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3")
+ set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3")
+ set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1")
+ endif()
+ endif()
+ set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
+ source_group(Intrinsics FILES ${VEC_PRIMITIVES})
-if(ENABLE_ASSEMBLY)
- set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
mc-a2.asm pixel-util8.asm blockcopy8.asm
diff --git a/source/common/common.h b/source/common/common.h
index b447bb3..2cfb4df 100644
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -41,6 +41,31 @@
#include "x265.h"
+#if ENABLE_PPA && ENABLE_VTUNE
+#error "PPA and VTUNE cannot both be enabled. Disable one of them."
+#endif
+#if ENABLE_PPA
+#include "profile/PPA/ppa.h"
+#define ProfileScopeEvent(x) PPAScopeEvent(x)
+#define THREAD_NAME(n,i)
+#define PROFILE_INIT() PPA_INIT()
+#define PROFILE_PAUSE()
+#define PROFILE_RESUME()
+#elif ENABLE_VTUNE
+#include "profile/vtune/vtune.h"
+#define ProfileScopeEvent(x) VTuneScopeEvent _vtuneTask(x)
+#define THREAD_NAME(n,i) vtuneSetThreadName(n, i)
+#define PROFILE_INIT() vtuneInit()
+#define PROFILE_PAUSE() __itt_pause()
+#define PROFILE_RESUME() __itt_resume()
+#else
+#define ProfileScopeEvent(x)
+#define THREAD_NAME(n,i)
+#define PROFILE_INIT()
+#define PROFILE_PAUSE()
+#define PROFILE_RESUME()
+#endif
+
#define FENC_STRIDE 64
#define NUM_INTRA_MODE 35
@@ -56,6 +81,10 @@ extern "C" intptr_t x265_stack_align(void (*func)(), ...);
#define x265_stack_align(func, ...) func(__VA_ARGS__)
#endif
+#if defined(__MINGW32__)
+#define fseeko fseeko64
+#endif
+
#elif defined(_MSC_VER)
#define ALIGN_VAR_8(T, var) __declspec(align(8)) T var
@@ -133,22 +162,16 @@ typedef int32_t ssum2_t; //Signed sum
#define BITS_FOR_POC 8
template<typename T>
-inline pixel Clip(T x)
-{
- return (pixel)std::min<T>(T((1 << X265_DEPTH) - 1), std::max<T>(T(0), x));
-}
+inline T x265_min(T a, T b) { return a < b ? a : b; }
template<typename T>
-inline T Clip3(T minVal, T maxVal, T a)
-{
- return std::min<T>(std::max<T>(minVal, a), maxVal);
-}
+inline T x265_max(T a, T b) { return a > b ? a : b; }
template<typename T>
-inline T x265_min(T a, T b) { return a < b ? a : b; }
+inline T x265_clip3(T minVal, T maxVal, T a) { return x265_min(x265_max(minVal, a), maxVal); }
-template<typename T>
-inline T x265_max(T a, T b) { return a > b ? a : b; }
+template<typename T> /* clip to pixel range, 0..255 or 0..1023 */
+inline pixel x265_clip(T x) { return (pixel)x265_min<T>(T((1 << X265_DEPTH) - 1), x265_max<T>(T(0), x)); }
typedef int16_t coeff_t; // transform coefficient
@@ -245,9 +268,6 @@ typedef int16_t coeff_t; // transform coefficient
#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
-#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
-#define MAX_NUM_TR_CATEGORIES 8 /* 32, 16, 8, 4 transform categories each for luma and chroma */
-
#define COEF_REMAIN_BIN_REDUCTION 3 // indicates the level at which the VLC
// transitions from Golomb-Rice to TU+EG(k)
@@ -261,6 +281,7 @@ typedef int16_t coeff_t; // transform coefficient
#define MLS_GRP_NUM 64 // Max number of coefficient groups, max(16, 64)
#define MLS_CG_SIZE 4 // Coefficient group size of 4x4
+#define MLS_CG_BLK_SIZE (MLS_CG_SIZE * MLS_CG_SIZE)
#define MLS_CG_LOG2_SIZE 2
#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20
@@ -297,21 +318,12 @@ typedef int16_t coeff_t; // transform coefficient
#define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422)
#define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
+#define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
namespace x265 {
enum { SAO_NUM_OFFSET = 4 };
-// NOTE: MUST be alignment to 16 or 32 bytes for asm code
-struct NoiseReduction
-{
- /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
- * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
- uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
- uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
- uint32_t count[MAX_NUM_TR_CATEGORIES];
-};
-
enum SaoMergeMode
{
SAO_MERGE_NONE,
@@ -358,6 +370,22 @@ struct SAOParam
}
};
+/* Stores inter analysis data for a single frame */
+struct analysis_inter_data
+{
+ int32_t* ref;
+ uint8_t* depth;
+ uint8_t* modes;
+};
+
+/* Stores intra analysis data for a single frame. This struct needs better packing */
+struct analysis_intra_data
+{
+ uint8_t* depth;
+ uint8_t* modes;
+ char* partSizes;
+};
+
enum TextType
{
TEXT_LUMA = 0, // luma
@@ -382,6 +410,10 @@ enum SignificanceMapContextType
CONTEXT_TYPE_NxN = 2,
CONTEXT_NUMBER_OF_TYPES = 3
};
+
+/* located in pixel.cpp */
+void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
+
}
/* outside x265 namespace, but prefixed. defined in common.cpp */
diff --git a/source/common/constants.cpp b/source/common/constants.cpp
index 4252cb4..3ac725a 100644
--- a/source/common/constants.cpp
+++ b/source/common/constants.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
-* Copyright (C) 2014 x265 project
+* Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -27,21 +27,46 @@
namespace x265 {
-static int initialized /* = 0 */;
-
-// initialize ROM variables
-void initROM()
+#if HIGH_BIT_DEPTH
+// lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8));
+double x265_lambda_tab[QP_MAX_MAX + 1] =
{
- if (ATOMIC_CAS32(&initialized, 0, 1) == 1)
- return;
-}
+ 1.0000, 1.1225, 1.2599, 1.4142, 1.5874,
+ 1.7818, 2.0000, 2.2449, 2.5198, 2.8284,
+ 3.1748, 3.5636, 4.0000, 4.4898, 5.0397,
+ 5.6569, 6.3496, 7.1272, 8.0000, 8.9797,
+ 10.0794, 11.3137, 12.6992, 14.2544, 16.0000,
+ 17.9594, 20.1587, 22.6274, 25.3984, 28.5088,
+ 32.0000, 35.9188, 40.3175, 45.2548, 50.7968,
+ 57.0175, 64.0000, 71.8376, 80.6349, 90.5097,
+ 101.5937, 114.0350, 128.0000, 143.6751, 161.2699,
+ 181.0193, 203.1873, 228.0701, 256.0000, 287.3503,
+ 322.5398, 362.0387, 406.3747, 456.1401, 512.0000,
+ 574.7006, 645.0796, 724.0773, 812.7493, 912.2803,
+ 1024.0000, 1149.4011, 1290.1592, 1448.1547, 1625.4987,
+ 1824.5606, 2048.0000, 2298.8023, 2580.3183, 2896.3094,
+};
-void destroyROM()
+// lambda2 = pow(lambda, 2) * scale (0.85);
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
{
- if (ATOMIC_CAS32(&initialized, 1, 0) == 0)
- return;
-}
+ 0.8500, 1.0709, 1.3493, 1.7000, 2.1419,
+ 2.6986, 3.4000, 4.2837, 5.3972, 6.8000,
+ 8.5675, 10.7943, 13.6000, 17.1349, 21.5887,
+ 27.2000, 34.2699, 43.1773, 54.4000, 68.5397,
+ 86.3546, 108.8000, 137.0794, 172.7092, 217.6000,
+ 274.1588, 345.4185, 435.2000, 548.3176, 690.8369,
+ 870.4000, 1096.6353, 1381.6739, 1740.8000, 2193.2706,
+ 2763.3478, 3481.6000, 4386.5411, 5526.6955, 6963.2000,
+ 8773.0823, 11053.3910, 13926.4000, 17546.1645, 22106.7820,
+ 27852.8000, 35092.3290, 44213.5640, 55705.6000, 70184.6580,
+ 88427.1280, 111411.2000, 140369.3161, 176854.2561, 222822.4000,
+ 280738.6321, 353708.5122, 445644.8000, 561477.2643, 707417.0243,
+ 891289.6000, 1122954.5286, 1414834.0486, 1782579.2000, 2245909.0572,
+ 2829668.0973, 3565158.4000, 4491818.1144, 5659336.1946, 7130316.8000,
+};
+#else /* !HIGH_BIT_DEPTH */
// lambda = pow(2, (double)q / 6 - 2);
double x265_lambda_tab[QP_MAX_MAX + 1] =
@@ -81,6 +106,8 @@ double x265_lambda2_tab[QP_MAX_MAX + 1] =
176854.2222, 222822.4000, 280738.6627, 353708.5368, 445644.7459
};
+#endif
+
const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
16, 20, 25, 32, 40, 50,
@@ -385,7 +412,16 @@ const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE] =
{ g_scan4x4[2], g_scan2x2[0], g_scan4x4[0], g_scan8x8diag }
};
-const uint8_t g_minInGroup[10] = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24 };
+// Table used for encoding the last coefficient position. The index is the position.
+// The low 4 bits are the number of "1" in the prefix and the high 4 bits are the number
+// of bits in the suffix.
+const uint8_t g_lastCoeffTable[32] =
+{
+ 0x00, 0x01, 0x02, 0x03, 0x14, 0x14, 0x15, 0x15,
+ 0x26, 0x26, 0x26, 0x26, 0x27, 0x27, 0x27, 0x27,
+ 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38,
+ 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
+};
// Rice parameters for absolute transform levels
const uint8_t g_goRiceRange[5] = { 7, 14, 26, 46, 78 };
diff --git a/source/common/constants.h b/source/common/constants.h
index 9db47db..d3777fe 100644
--- a/source/common/constants.h
+++ b/source/common/constants.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -29,9 +29,6 @@
namespace x265 {
// private namespace
-void initROM();
-void destroyROM();
-
void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx);
void initRasterToZscan(uint32_t maxFullDepth);
@@ -86,7 +83,7 @@ extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
extern const uint16_t g_scan8x8diag[8 * 8];
extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4];
-extern const uint8_t g_minInGroup[10];
+extern const uint8_t g_lastCoeffTable[32];
extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes
// CABAC tables
diff --git a/source/common/contexts.h b/source/common/contexts.h
index b692806..567a9e9 100644
--- a/source/common/contexts.h
+++ b/source/common/contexts.h
@@ -1,5 +1,5 @@
/*****************************************************************************
-* Copyright (C) 2014 x265 project
+* Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp
index d28e005..a112ed5 100644
--- a/source/common/cudata.cpp
+++ b/source/common/cudata.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -57,57 +57,57 @@ void copy256(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 256); }
void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
/* Check whether 2 addresses point to the same column */
-inline bool isEqualCol(int addrA, int addrB, int numUnitsPerRow)
+inline bool isEqualCol(int addrA, int addrB, int numUnits)
{
- // addrA % numUnitsPerRow == addrB % numUnitsPerRow
- return ((addrA ^ addrB) & (numUnitsPerRow - 1)) == 0;
+ // addrA % numUnits == addrB % numUnits
+ return ((addrA ^ addrB) & (numUnits - 1)) == 0;
}
/* Check whether 2 addresses point to the same row */
-inline bool isEqualRow(int addrA, int addrB, int numUnitsPerRow)
+inline bool isEqualRow(int addrA, int addrB, int numUnits)
{
- // addrA / numUnitsPerRow == addrB / numUnitsPerRow
- return ((addrA ^ addrB) & ~(numUnitsPerRow - 1)) == 0;
+ // addrA / numUnits == addrB / numUnits
+ return ((addrA ^ addrB) & ~(numUnits - 1)) == 0;
}
/* Check whether 2 addresses point to the same row or column */
-inline bool isEqualRowOrCol(int addrA, int addrB, int numUnitsPerRow)
+inline bool isEqualRowOrCol(int addrA, int addrB, int numUnits)
{
- return isEqualCol(addrA, addrB, numUnitsPerRow) | isEqualRow(addrA, addrB, numUnitsPerRow);
+ return isEqualCol(addrA, addrB, numUnits) | isEqualRow(addrA, addrB, numUnits);
}
/* Check whether one address points to the first column */
-inline bool isZeroCol(int addr, int numUnitsPerRow)
+inline bool isZeroCol(int addr, int numUnits)
{
- // addr % numUnitsPerRow == 0
- return (addr & (numUnitsPerRow - 1)) == 0;
+ // addr % numUnits == 0
+ return (addr & (numUnits - 1)) == 0;
}
/* Check whether one address points to the first row */
-inline bool isZeroRow(int addr, int numUnitsPerRow)
+inline bool isZeroRow(int addr, int numUnits)
{
- // addr / numUnitsPerRow == 0
- return (addr & ~(numUnitsPerRow - 1)) == 0;
+ // addr / numUnits == 0
+ return (addr & ~(numUnits - 1)) == 0;
}
/* Check whether one address points to a column whose index is smaller than a given value */
-inline bool lessThanCol(int addr, int val, int numUnitsPerRow)
+inline bool lessThanCol(int addr, int val, int numUnits)
{
- // addr % numUnitsPerRow < val
- return (addr & (numUnitsPerRow - 1)) < val;
+ // addr % numUnits < val
+ return (addr & (numUnits - 1)) < val;
}
/* Check whether one address points to a row whose index is smaller than a given value */
-inline bool lessThanRow(int addr, int val, int numUnitsPerRow)
+inline bool lessThanRow(int addr, int val, int numUnits)
{
- // addr / numUnitsPerRow < val
- return addr < val * numUnitsPerRow;
+ // addr / numUnits < val
+ return addr < val * numUnits;
}
inline MV scaleMv(MV mv, int scale)
{
- int mvx = Clip3(-32768, 32767, (scale * mv.x + 127 + (scale * mv.x < 0)) >> 8);
- int mvy = Clip3(-32768, 32767, (scale * mv.y + 127 + (scale * mv.y < 0)) >> 8);
+ int mvx = x265_clip3(-32768, 32767, (scale * mv.x + 127 + (scale * mv.x < 0)) >> 8);
+ int mvy = x265_clip3(-32768, 32767, (scale * mv.y + 127 + (scale * mv.y < 0)) >> 8);
return MV((int16_t)mvx, (int16_t)mvy);
}
@@ -227,16 +227,15 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp,
/* Each CU's data is layed out sequentially within the charMemBlock */
uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
- m_qp = (char*)charBuf; charBuf += m_numPartitions;
+ m_qp = (int8_t*)charBuf; charBuf += m_numPartitions;
m_log2CUSize = charBuf; charBuf += m_numPartitions;
- m_partSize = charBuf; charBuf += m_numPartitions;
- m_predMode = charBuf; charBuf += m_numPartitions;
m_lumaIntraDir = charBuf; charBuf += m_numPartitions;
m_tqBypass = charBuf; charBuf += m_numPartitions;
- m_refIdx[0] = (char*)charBuf; charBuf += m_numPartitions;
- m_refIdx[1] = (char*)charBuf; charBuf += m_numPartitions;
+ m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+ m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
m_cuDepth = charBuf; charBuf += m_numPartitions;
- m_skipFlag = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+ m_predMode = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+ m_partSize = charBuf; charBuf += m_numPartitions;
m_mergeFlag = charBuf; charBuf += m_numPartitions;
m_interDir = charBuf; charBuf += m_numPartitions;
m_mvpIdx[0] = charBuf; charBuf += m_numPartitions;
@@ -278,8 +277,6 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
/* sequential memsets */
m_partSet((uint8_t*)m_qp, (uint8_t)qp);
m_partSet(m_log2CUSize, (uint8_t)g_maxLog2CUSize);
- m_partSet(m_partSize, (uint8_t)SIZE_NONE);
- m_partSet(m_predMode, (uint8_t)MODE_NONE);
m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
m_partSet(m_tqBypass, (uint8_t)frame.m_encData->m_param->bLossless);
if (m_slice->m_sliceType != I_SLICE)
@@ -291,7 +288,7 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
/* initialize the remaining CU data in one memset */
- memset(m_cuDepth, 0, (BytesPerPartition - 8) * m_numPartitions);
+ memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
@@ -318,8 +315,6 @@ void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom)
/* sequential memsets */
m_partSet((uint8_t*)m_qp, (uint8_t)ctu.m_qp[0]);
m_partSet(m_log2CUSize, (uint8_t)cuGeom.log2CUSize);
- m_partSet(m_partSize, (uint8_t)SIZE_NONE);
- m_partSet(m_predMode, (uint8_t)MODE_NONE);
m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
m_partSet(m_tqBypass, (uint8_t)m_encData->m_param->bLossless);
m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
@@ -327,7 +322,7 @@ void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom)
m_partSet(m_cuDepth, (uint8_t)cuGeom.depth);
/* initialize the remaining CU data in one memset */
- memset(m_skipFlag, 0, (BytesPerPartition - 9) * m_numPartitions);
+ memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
}
/* Copy the results of a sub-part (split) CU to the parent CU */
@@ -339,14 +334,13 @@ void CUData::copyPartFrom(const CUData& subCU, const CUGeom& childGeom, uint32_t
m_subPartCopy((uint8_t*)m_qp + offset, (uint8_t*)subCU.m_qp);
m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize);
- m_subPartCopy(m_partSize + offset, subCU.m_partSize);
- m_subPartCopy(m_predMode + offset, subCU.m_predMode);
m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir);
m_subPartCopy(m_tqBypass + offset, subCU.m_tqBypass);
m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]);
m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]);
m_subPartCopy(m_cuDepth + offset, subCU.m_cuDepth);
- m_subPartCopy(m_skipFlag + offset, subCU.m_skipFlag);
+ m_subPartCopy(m_predMode + offset, subCU.m_predMode);
+ m_subPartCopy(m_partSize + offset, subCU.m_partSize);
m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag);
m_subPartCopy(m_interDir + offset, subCU.m_interDir);
m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
@@ -410,7 +404,7 @@ void CUData::initLosslessCU(const CUData& cu, const CUGeom& cuGeom)
m_partSet(m_tqBypass, true);
/* clear residual coding flags */
- m_partSet(m_skipFlag, 0);
+ m_partSet(m_predMode, cu.m_predMode[0] & (MODE_INTRA | MODE_INTER));
m_partSet(m_tuDepth, 0);
m_partSet(m_transformSkip[0], 0);
m_partSet(m_transformSkip[1], 0);
@@ -427,14 +421,13 @@ void CUData::copyToPic(uint32_t depth) const
m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp);
m_partCopy(ctu.m_log2CUSize + m_absIdxInCTU, m_log2CUSize);
- m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
- m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
m_partCopy(ctu.m_lumaIntraDir + m_absIdxInCTU, m_lumaIntraDir);
m_partCopy(ctu.m_tqBypass + m_absIdxInCTU, m_tqBypass);
m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]);
m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]);
m_partCopy(ctu.m_cuDepth + m_absIdxInCTU, m_cuDepth);
- m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag);
+ m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
+ m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag);
m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir);
m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]);
@@ -477,13 +470,13 @@ void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
/* copy out all prediction info for this part */
m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
m_partCopy(m_log2CUSize, ctu.m_log2CUSize + m_absIdxInCTU);
- m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU);
- m_partCopy(m_predMode, ctu.m_predMode + m_absIdxInCTU);
m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
m_partCopy(m_tqBypass, ctu.m_tqBypass + m_absIdxInCTU);
m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU);
m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU);
m_partCopy(m_cuDepth, ctu.m_cuDepth + m_absIdxInCTU);
+ m_partSet(m_predMode, ctu.m_predMode[m_absIdxInCTU] & (MODE_INTRA | MODE_INTER)); /* clear skip flag */
+ m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU);
m_partCopy(m_mergeFlag, ctu.m_mergeFlag + m_absIdxInCTU);
m_partCopy(m_interDir, ctu.m_interDir + m_absIdxInCTU);
m_partCopy(m_mvpIdx[0], ctu.m_mvpIdx[0] + m_absIdxInCTU);
@@ -496,7 +489,6 @@ void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
/* clear residual coding flags */
- m_partSet(m_skipFlag, 0);
m_partSet(m_tuDepth, 0);
m_partSet(m_transformSkip[0], 0);
m_partSet(m_transformSkip[1], 0);
@@ -515,7 +507,7 @@ void CUData::updatePic(uint32_t depth) const
m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]);
m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
- m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag);
+ m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth);
m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]);
m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
@@ -552,7 +544,7 @@ const CUData* CUData::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx)
return m_cuLeft;
}
-const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary) const
+const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const
{
uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
@@ -563,15 +555,10 @@ const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx
if (isEqualRow(absPartIdx, absZorderCUIdx, s_numPartInCUSize))
return m_encData->getPicCTU(m_cuAddr);
else
- {
aPartUnitIdx -= m_absIdxInCTU;
- return this;
- }
+ return this;
}
- if (planarAtCTUBoundary)
- return NULL;
-
aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize];
return m_cuAbove;
}
@@ -621,7 +608,7 @@ const CUData* CUData::getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartU
{
if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1])
{
- uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
+ uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
arPartUnitIdx = g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1];
if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, s_numPartInCUSize))
return m_encData->getPicCTU(m_cuAddr);
@@ -702,8 +689,6 @@ const CUData* CUData::getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPa
return NULL;
}
blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (1 + partUnitOffset) * s_numPartInCUSize - 1];
- if (!m_cuLeft || !m_cuLeft->m_slice)
- return NULL;
return m_cuLeft;
}
@@ -736,8 +721,6 @@ const CUData* CUData::getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPa
return NULL;
}
arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset];
- if (!m_cuAbove || !m_cuAbove->m_slice)
- return NULL;
return m_cuAbove;
}
@@ -745,8 +728,6 @@ const CUData* CUData::getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPa
return NULL;
arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
- if ((m_cuAboveRight == NULL || m_cuAboveRight->m_slice == NULL || (m_cuAboveRight->m_cuAddr) > m_cuAddr))
- return NULL;
return m_cuAboveRight;
}
@@ -785,7 +766,7 @@ const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdx
}
/* Get reference QP from left QpMinCu or latest coded QP */
-char CUData::getRefQP(uint32_t curAbsIdxInCTU) const
+int8_t CUData::getRefQP(uint32_t curAbsIdxInCTU) const
{
uint32_t lPartIdx = 0, aPartIdx = 0;
const CUData* cULeft = getQpMinCuLeft(lPartIdx, m_absIdxInCTU + curAbsIdxInCTU);
@@ -807,7 +788,7 @@ int CUData::getLastValidPartIdx(int absPartIdx) const
return lastValidPartIdx;
}
-char CUData::getLastCodedQP(uint32_t absPartIdx) const
+int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const
{
uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask);
@@ -821,7 +802,7 @@ char CUData::getLastCodedQP(uint32_t absPartIdx) const
else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth)))
return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS);
else
- return (char)m_slice->m_sliceQp;
+ return (int8_t)m_slice->m_sliceQp;
}
}
@@ -859,7 +840,7 @@ int CUData::getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred
leftIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX;
// Get intra direction of above PU
- tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx, true);
+ tempCU = g_zscanToPelY[m_absIdxInCTU + absPartIdx] > 0 ? getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx) : NULL;
aboveIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX;
@@ -912,12 +893,12 @@ uint32_t CUData::getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const
void CUData::getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const
{
uint32_t log2CUSize = m_log2CUSize[absPartIdx];
- uint32_t splitFlag = m_partSize[absPartIdx] == SIZE_NxN;
+ uint32_t splitFlag = m_partSize[absPartIdx] != SIZE_2Nx2N;
tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
- tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag), tuDepthRange[1]));
+ tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1], log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag));
}
void CUData::getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const
@@ -929,7 +910,7 @@ void CUData::getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartId
tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
- tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag), tuDepthRange[1]));
+ tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1], log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag));
}
uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const
@@ -949,7 +930,7 @@ uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const
return ctx;
}
-bool CUData::setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth)
+bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth)
{
uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1);
uint32_t curPartNumQ = curPartNumb >> 2;
@@ -1224,7 +1205,7 @@ void CUData::setPUMv(int list, const MV& mv, int absPartIdx, int puIdx)
setAllPU(m_mv[list], mv, absPartIdx, puIdx);
}
-void CUData::setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx)
+void CUData::setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx)
{
setAllPU(m_refIdx[list], refIdx, absPartIdx, puIdx);
}
@@ -1250,7 +1231,7 @@ void CUData::getMvField(const CUData* cu, uint32_t absPartIdx, int picList, MVFi
else
{
// OUT OF BOUNDARY
- outMvField.mv.word = 0;
+ outMvField.mv = 0;
outMvField.refIdx = REF_NOT_VALID;
}
}
@@ -1376,14 +1357,6 @@ uint32_t CUData::deriveRightBottomIdx(uint32_t puIdx) const
return outPartIdxRB;
}
-void CUData::deriveLeftRightTopIdxAdi(uint32_t& outPartIdxLT, uint32_t& outPartIdxRT, uint32_t partOffset, uint32_t partDepth) const
-{
- uint32_t numPartInWidth = 1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE - partDepth);
-
- outPartIdxLT = m_absIdxInCTU + partOffset;
- outPartIdxRT = g_rasterToZscan[g_zscanToRaster[outPartIdxLT] + numPartInWidth - 1];
-}
-
bool CUData::hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const
{
if (m_interDir[absPartIdx] != candCU.m_interDir[candAbsPartIdx])
@@ -1412,6 +1385,8 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
for (uint32_t i = 0; i < maxNumMergeCand; ++i)
{
+ mvFieldNeighbours[i][0].mv = 0;
+ mvFieldNeighbours[i][1].mv = 0;
mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID;
}
@@ -1441,7 +1416,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
bool isAvailableA1 = cuLeft &&
cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) &&
!(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == SIZE_nRx2N)) &&
- !cuLeft->isIntra(leftPartIdx);
+ cuLeft->isInter(leftPartIdx);
if (isAvailableA1)
{
// get Inter Dir
@@ -1465,7 +1440,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
bool isAvailableB1 = cuAbove &&
cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) &&
!(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) &&
- !cuAbove->isIntra(abovePartIdx);
+ cuAbove->isInter(abovePartIdx);
if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx)))
{
// get Inter Dir
@@ -1486,7 +1461,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT);
bool isAvailableB0 = cuAboveRight &&
cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) &&
- !cuAboveRight->isIntra(aboveRightPartIdx);
+ cuAboveRight->isInter(aboveRightPartIdx);
if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx)))
{
// get Inter Dir
@@ -1507,7 +1482,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB);
bool isAvailableA0 = cuLeftBottom &&
cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) &&
- !cuLeftBottom->isIntra(leftBottomPartIdx);
+ cuLeftBottom->isInter(leftBottomPartIdx);
if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx)))
{
// get Inter Dir
@@ -1530,7 +1505,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr);
bool isAvailableB2 = cuAboveLeft &&
cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) &&
- !cuAboveLeft->isIntra(aboveLeftPartIdx);
+ cuAboveLeft->isInter(aboveLeftPartIdx);
if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAboveLeft, aboveLeftPartIdx))
&& (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx)))
{
@@ -1558,17 +1533,17 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
{
uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
- uint32_t numPartInCUSize = s_numPartInCUSize;
- bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU
- bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row of CTU
+ uint32_t numUnits = s_numPartInCUSize;
+ bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1, numUnits); // is not at the last column of CTU
+ bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1, numUnits); // is not at the last row of CTU
if (bNotLastCol && bNotLastRow)
{
- absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1];
+ absPartAddr = g_rasterToZscan[absPartIdxRB + numUnits + 1];
ctuIdx = m_cuAddr;
}
else if (bNotLastCol)
- absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)];
+ absPartAddr = g_rasterToZscan[(absPartIdxRB + numUnits + 1) & (numUnits - 1)];
else if (bNotLastRow)
{
absPartAddr = g_rasterToZscan[absPartIdxRB + 1];
@@ -1659,7 +1634,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV
while (count < maxNumMergeCand)
{
interDirNeighbours[count] = 1;
- mvFieldNeighbours[count][0].mv.word = 0;
+ mvFieldNeighbours[count][0].mv = 0;
mvFieldNeighbours[count][0].refIdx = r;
if (isInterB)
@@ -1785,17 +1760,17 @@ int CUData::fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int re
m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
{
uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
- uint32_t numPartInCUSize = s_numPartInCUSize;
- bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU
- bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row of CTU
+ uint32_t numUnits = s_numPartInCUSize;
+ bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1, numUnits); // is not at the last column of CTU
+ bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1, numUnits); // is not at the last row of CTU
if (bNotLastCol && bNotLastRow)
{
- absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1];
+ absPartAddr = g_rasterToZscan[absPartIdxRB + numUnits + 1];
ctuIdx = m_cuAddr;
}
else if (bNotLastCol)
- absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)];
+ absPartAddr = g_rasterToZscan[(absPartIdxRB + numUnits + 1) & (numUnits - 1)];
else if (bNotLastRow)
{
absPartAddr = g_rasterToZscan[absPartIdxRB + 1];
@@ -1966,26 +1941,18 @@ bool CUData::addMVPCandOrder(MV& outMV, int picList, int refIdx, uint32_t partUn
bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const
{
- uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
-
- int colRefPicList;
- int colPOC, colRefPOC, curPOC, curRefPOC;
- MV colmv;
+ const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+ const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
- // use coldir.
- Frame *colPic = m_slice->m_refPicList[m_slice->isInterB() ? 1 - m_slice->m_colFromL0Flag : 0][m_slice->m_colRefIdx];
- CUData *colCU = colPic->m_encData->getPicCTU(cuAddr);
-
- if (colCU->m_partSize[partUnitIdx] == SIZE_NONE)
+ if (colCU->m_predMode[partUnitIdx] == MODE_NONE)
return false;
- curPOC = m_slice->m_poc;
- colPOC = colCU->m_slice->m_poc;
+ uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
if (colCU->isIntra(absPartAddr))
return false;
- colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag;
+ int colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag;
int colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr];
@@ -1999,9 +1966,12 @@ bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int p
}
// Scale the vector
- colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx];
- colmv = colCU->m_mv[colRefPicList][absPartAddr];
- curRefPOC = m_slice->m_refPOCList[picList][outRefIdx];
+ int colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx];
+ int colPOC = colCU->m_slice->m_poc;
+ MV colmv = colCU->m_mv[colRefPicList][absPartAddr];
+
+ int curRefPOC = m_slice->m_refPOCList[picList][outRefIdx];
+ int curPOC = m_slice->m_poc;
scaleMvByPOCDist(outMV, colmv, curPOC, curRefPOC, colPOC, colRefPOC);
return true;
@@ -2016,10 +1986,10 @@ void CUData::scaleMvByPOCDist(MV& outMV, const MV& inMV, int curPOC, int curRefP
outMV = inMV;
else
{
- int tdb = Clip3(-128, 127, diffPocB);
- int tdd = Clip3(-128, 127, diffPocD);
+ int tdb = x265_clip3(-128, 127, diffPocB);
+ int tdd = x265_clip3(-128, 127, diffPocD);
int x = (0x4000 + abs(tdd / 2)) / tdd;
- int scale = Clip3(-4096, 4095, (tdb * x + 32) >> 6);
+ int scale = x265_clip3(-4096, 4095, (tdb * x + 32) >> 6);
outMV = scaleMv(inMV, scale);
}
}
@@ -2096,7 +2066,7 @@ void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uin
#define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag))
-void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const
+void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS])
{
// Initialize the coding blocks inside the CTB
for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--)
@@ -2111,10 +2081,10 @@ void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUS
uint32_t depthIdx = g_depthScanIdx[sbY][sbX];
uint32_t cuIdx = rangeCUIdx + depthIdx;
uint32_t childIdx = rangeCUIdx + sbWidth * sbWidth + (depthIdx << 2);
- uint32_t px = m_cuPelX + sbX * blockSize;
- uint32_t py = m_cuPelY + sbY * blockSize;
- int32_t presentFlag = px < picWidth && py < picHeight;
- int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > picWidth || py + blockSize > picHeight);
+ uint32_t px = sbX * blockSize;
+ uint32_t py = sbY * blockSize;
+ int32_t presentFlag = px < ctuWidth && py < ctuHeight;
+ int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > ctuWidth || py + blockSize > ctuHeight);
/* Offset of the luma CU in the X, Y direction in terms of pixels from the CTU origin */
uint32_t xOffset = (sbX * blockSize) >> 3;
diff --git a/source/common/cudata.h b/source/common/cudata.h
index 7f735d6..1f6e48b 100644
--- a/source/common/cudata.h
+++ b/source/common/cudata.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -46,14 +46,15 @@ enum PartSize
SIZE_2NxnD, // asymmetric motion partition, 2Nx(3N/2) + 2Nx( N/2)
SIZE_nLx2N, // asymmetric motion partition, ( N/2)x2N + (3N/2)x2N
SIZE_nRx2N, // asymmetric motion partition, (3N/2)x2N + ( N/2)x2N
- SIZE_NONE = 15
+ NUM_SIZES
};
enum PredMode
{
- MODE_INTER,
- MODE_INTRA,
- MODE_NONE = 15
+ MODE_NONE = 0,
+ MODE_INTER = (1 << 0),
+ MODE_INTRA = (1 << 1),
+ MODE_SKIP = (1 << 2) | MODE_INTER
};
// motion vector predictor direction used in AMVP
@@ -126,15 +127,14 @@ public:
int m_vChromaShift;
/* Per-part data, stored contiguously */
- char* m_qp; // array of QP values
+ int8_t* m_qp; // array of QP values
uint8_t* m_log2CUSize; // array of cu log2Size TODO: seems redundant to depth
- uint8_t* m_partSize; // array of partition sizes
- uint8_t* m_predMode; // array of prediction modes
uint8_t* m_lumaIntraDir; // array of intra directions (luma)
uint8_t* m_tqBypass; // array of CU lossless flags
- char* m_refIdx[2]; // array of motion reference indices per list
+ int8_t* m_refIdx[2]; // array of motion reference indices per list
uint8_t* m_cuDepth; // array of depths
- uint8_t* m_skipFlag; // array of skip flags
+ uint8_t* m_predMode; // array of prediction modes
+ uint8_t* m_partSize; // array of partition sizes
uint8_t* m_mergeFlag; // array of merge flags
uint8_t* m_interDir; // array of inter directions
uint8_t* m_mvpIdx[2]; // array of motion vector predictor candidates or merge candidate indices [0]
@@ -142,7 +142,7 @@ public:
uint8_t* m_transformSkip[3]; // array of transform skipping flags per plane
uint8_t* m_cbf[3]; // array of coded block flags (CBF) per plane
uint8_t* m_chromaIntraDir; // array of intra directions (chroma)
- enum { BytesPerPartition = 22 }; // combined sizeof() of all per-part data
+ enum { BytesPerPartition = 21 }; // combined sizeof() of all per-part data
coeff_t* m_trCoeff[3]; // transformed coefficient buffer per plane
@@ -158,7 +158,7 @@ public:
CUData();
void initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
- void calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const;
+ static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
void initCTU(const Frame& frame, uint32_t cuAddr, int qp);
void initSubCU(const CUData& ctu, const CUGeom& cuGeom);
@@ -173,12 +173,11 @@ public:
void updatePic(uint32_t depth) const;
void setPartSizeSubParts(PartSize size) { m_partSet(m_partSize, (uint8_t)size); }
- void setSkipFlagSubParts(uint8_t skipFlag) { m_partSet(m_skipFlag, skipFlag); }
void setPredModeSubParts(PredMode mode) { m_partSet(m_predMode, (uint8_t)mode); }
void clearCbf() { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
/* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
- void setQPSubParts(char qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
+ void setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
void setTUDepthSubParts(uint8_t tuDepth, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_tuDepth + absPartIdx, tuDepth); }
void setLumaIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_lumaIntraDir + absPartIdx, dir); }
void setChromIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_chromaIntraDir + absPartIdx, dir); }
@@ -187,15 +186,15 @@ public:
void setTransformSkipSubParts(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_transformSkip[ttype] + absPartIdx, tskip); }
void setTransformSkipPartRange(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t coveredPartIdxes) { memset(m_transformSkip[ttype] + absPartIdx, tskip, coveredPartIdxes); }
- bool setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth);
+ bool setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth);
void setPUInterDir(uint8_t dir, uint32_t absPartIdx, uint32_t puIdx);
void setPUMv(int list, const MV& mv, int absPartIdx, int puIdx);
- void setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx);
+ void setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
- uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t trDepth) const { return (m_cbf[ttype][absPartIdx] >> trDepth) & 0x1; }
+ uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
uint8_t getQtRootCbf(uint32_t absPartIdx) const { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
- char getRefQP(uint32_t currAbsIdxInCTU) const;
+ int8_t getRefQP(uint32_t currAbsIdxInCTU) const;
uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const;
void clipMv(MV& outMV) const;
int fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const;
@@ -204,7 +203,8 @@ public:
uint32_t getNumPartInter() const { return nbPartsTable[(int)m_partSize[0]]; }
bool isIntra(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_INTRA; }
- bool isSkipped(uint32_t absPartIdx) const { return !!m_skipFlag[absPartIdx]; }
+ bool isInter(uint32_t absPartIdx) const { return !!(m_predMode[absPartIdx] & MODE_INTER); }
+ bool isSkipped(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_SKIP; }
bool isBipredRestriction() const { return m_log2CUSize[0] == 3 && m_partSize[0] != SIZE_2Nx2N; }
void getPartIndexAndSize(uint32_t puIdx, uint32_t& absPartIdx, int& puWidth, int& puHeight) const;
@@ -212,7 +212,6 @@ public:
void getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const;
int getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const;
- void deriveLeftRightTopIdxAdi(uint32_t& partIdxLT, uint32_t& partIdxRT, uint32_t partOffset, uint32_t partDepth) const;
uint32_t getSCUAddr() const { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; }
uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
@@ -221,7 +220,7 @@ public:
void getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const;
const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const;
- const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary = false) const;
+ const CUData* getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx) const;
const CUData* getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx) const;
const CUData* getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx) const;
const CUData* getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx) const;
@@ -237,7 +236,7 @@ protected:
template<typename T>
void setAllPU(T *p, const T& val, int absPartIdx, int puIdx);
- char getLastCodedQP(uint32_t absPartIdx) const;
+ int8_t getLastCodedQP(uint32_t absPartIdx) const;
int getLastValidPartIdx(int absPartIdx) const;
bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const;
diff --git a/source/common/dct.cpp b/source/common/dct.cpp
index 714006e..f24a68e 100644
--- a/source/common/dct.cpp
+++ b/source/common/dct.cpp
@@ -41,7 +41,7 @@ namespace {
// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
// give identical results
-void fastForwardDst(int16_t *block, int16_t *coeff, int shift) // input block, output coeff
+void fastForwardDst(const int16_t* block, int16_t* coeff, int shift) // input block, output coeff
{
int c[4];
int rnd_factor = 1 << (shift - 1);
@@ -61,7 +61,7 @@ void fastForwardDst(int16_t *block, int16_t *coeff, int shift) // input block,
}
}
-void inversedst(int16_t *tmp, int16_t *block, int shift) // input tmp, output block
+void inversedst(const int16_t* tmp, int16_t* block, int shift) // input tmp, output block
{
int i, c[4];
int rnd_factor = 1 << (shift - 1);
@@ -74,14 +74,14 @@ void inversedst(int16_t *tmp, int16_t *block, int shift) // input tmp, output b
c[2] = tmp[i] - tmp[12 + i];
c[3] = 74 * tmp[4 + i];
- block[4 * i + 0] = (int16_t)Clip3(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift);
- block[4 * i + 1] = (int16_t)Clip3(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift);
- block[4 * i + 2] = (int16_t)Clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift);
- block[4 * i + 3] = (int16_t)Clip3(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift);
+ block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift);
+ block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift);
+ block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift);
+ block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift);
}
}
-void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[8], O[8];
@@ -134,7 +134,7 @@ void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[16], O[16];
@@ -203,7 +203,7 @@ void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[4], O[4];
@@ -240,7 +240,7 @@ void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
{
int j;
int E[2], O[2];
@@ -255,17 +255,17 @@ void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
- dst[0] = (int16_t)(Clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
- dst[1] = (int16_t)(Clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
- dst[2] = (int16_t)(Clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
- dst[3] = (int16_t)(Clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
+ dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
+ dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
+ dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
+ dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
src++;
dst += 4;
}
}
-void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[4], O[4];
@@ -292,8 +292,8 @@ void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
E[2] = EE[1] - EO[1];
for (k = 0; k < 4; k++)
{
- dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
- dst[k + 4] = (int16_t)Clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
+ dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
+ dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
}
src++;
@@ -301,7 +301,7 @@ void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[8], O[8];
@@ -343,8 +343,8 @@ void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
for (k = 0; k < 8; k++)
{
- dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
- dst[k + 8] = (int16_t)Clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
+ dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
+ dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
}
src++;
@@ -352,7 +352,7 @@ void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
{
int j, k;
int E[16], O[16];
@@ -407,8 +407,8 @@ void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
for (k = 0; k < 16; k++)
{
- dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
- dst[k + 16] = (int16_t)Clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
+ dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
+ dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
}
src++;
@@ -416,7 +416,7 @@ void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
+void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
{
int j;
int E[2], O[2];
@@ -440,7 +440,7 @@ void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
}
}
-void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 1 + X265_DEPTH - 8;
const int shift_2nd = 8;
@@ -450,25 +450,14 @@ void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
for (int i = 0; i < 4; i++)
{
- memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+ memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
}
fastForwardDst(block, coef, shift_1st);
- fastForwardDst(coef, block, shift_2nd);
-
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ fastForwardDst(coef, dst, shift_2nd);
}
-void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 1 + X265_DEPTH - 8;
const int shift_2nd = 8;
@@ -478,24 +467,14 @@ void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
for (int i = 0; i < 4; i++)
{
- memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+ memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
}
partialButterfly4(block, coef, shift_1st, 4);
- partialButterfly4(coef, block, shift_2nd, 4);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly4(coef, dst, shift_2nd, 4);
}
-void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 2 + X265_DEPTH - 8;
const int shift_2nd = 9;
@@ -505,25 +484,14 @@ void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
for (int i = 0; i < 8; i++)
{
- memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
+ memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
}
partialButterfly8(block, coef, shift_1st, 8);
- partialButterfly8(coef, block, shift_2nd, 8);
-
-#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly8(coef, dst, shift_2nd, 8);
}
-void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 3 + X265_DEPTH - 8;
const int shift_2nd = 10;
@@ -533,25 +501,14 @@ void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
for (int i = 0; i < 16; i++)
{
- memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
+ memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
}
partialButterfly16(block, coef, shift_1st, 16);
- partialButterfly16(coef, block, shift_2nd, 16);
-
-#define N (16)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly16(coef, dst, shift_2nd, 16);
}
-void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 4 + X265_DEPTH - 8;
const int shift_2nd = 11;
@@ -561,25 +518,14 @@ void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
for (int i = 0; i < 32; i++)
{
- memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
+ memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
}
partialButterfly32(block, coef, shift_1st, 32);
- partialButterfly32(coef, block, shift_2nd, 32);
-
-#define N (32)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly32(coef, dst, shift_2nd, 32);
}
-void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -587,27 +533,16 @@ void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
ALIGN_VAR_32(int16_t, coef[4 * 4]);
ALIGN_VAR_32(int16_t, block[4 * 4]);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
+ inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
for (int i = 0; i < 4; i++)
{
- memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
}
}
-void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -615,27 +550,16 @@ void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
ALIGN_VAR_32(int16_t, coef[4 * 4]);
ALIGN_VAR_32(int16_t, block[4 * 4]);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
+ partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
for (int i = 0; i < 4; i++)
{
- memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
}
}
-void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -643,26 +567,16 @@ void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
ALIGN_VAR_32(int16_t, coef[8 * 8]);
ALIGN_VAR_32(int16_t, block[8 * 8]);
-#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse8(block, coef, shift_1st, 8);
+ partialButterflyInverse8(src, coef, shift_1st, 8);
partialButterflyInverse8(coef, block, shift_2nd, 8);
+
for (int i = 0; i < 8; i++)
{
- memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
}
}
-void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -670,26 +584,16 @@ void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
ALIGN_VAR_32(int16_t, coef[16 * 16]);
ALIGN_VAR_32(int16_t, block[16 * 16]);
-#define N (16)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse16(block, coef, shift_1st, 16);
+ partialButterflyInverse16(src, coef, shift_1st, 16);
partialButterflyInverse16(coef, block, shift_2nd, 16);
+
for (int i = 0; i < 16; i++)
{
- memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
}
}
-void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -697,27 +601,16 @@ void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
ALIGN_VAR_32(int16_t, coef[32 * 32]);
ALIGN_VAR_32(int16_t, block[32 * 32]);
-#define N (32)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse32(block, coef, shift_1st, 32);
+ partialButterflyInverse32(src, coef, shift_1st, 32);
partialButterflyInverse32(coef, block, shift_2nd, 32);
for (int i = 0; i < 32; i++)
{
- memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
}
}
-void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
@@ -737,11 +630,11 @@ void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scal
for (int n = 0; n < num; n++)
{
coeffQ = (quantCoef[n] * scale + add) >> shift;
- coef[n] = Clip3(-32768, 32767, coeffQ);
+ coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
}
}
-void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
@@ -756,20 +649,20 @@ void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int
for (int n = 0; n < num; n++)
{
coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
- coef[n] = Clip3(-32768, 32767, coeffQ);
+ coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
}
}
else
{
for (int n = 0; n < num; n++)
{
- coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
- coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
+ coeffQ = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
+ coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
}
}
}
-uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK(qBits >= 8, "qBits less than 8\n");
X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
@@ -787,13 +680,13 @@ uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* q
if (level)
++numSig;
level *= sign;
- qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
+ qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
}
return numSig;
}
-uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
@@ -811,13 +704,13 @@ uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits,
if (level)
++numSig;
level *= sign;
- qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
+ qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
}
return numSig;
}
-int count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
+int count_nonzero_c(const int16_t* quantCoeff, int numCoeff)
{
X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
@@ -833,22 +726,22 @@ int count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
}
template<int trSize>
-uint32_t copy_count(int16_t* coeff, int16_t* residual, intptr_t stride)
+uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
uint32_t numSig = 0;
for (int k = 0; k < trSize; k++)
{
for (int j = 0; j < trSize; j++)
{
- coeff[k * trSize + j] = residual[k * stride + j];
- numSig += (residual[k * stride + j] != 0);
+ coeff[k * trSize + j] = residual[k * resiStride + j];
+ numSig += (residual[k * resiStride + j] != 0);
}
}
return numSig;
}
-void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
+void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
for (int i = 0; i < numCoeff; i++)
{
@@ -857,7 +750,7 @@ void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numC
level = (level + sign) ^ sign;
resSum[i] += level;
level -= offset[i];
- dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
+ dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign);
}
}
@@ -866,28 +759,28 @@ void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numC
namespace x265 {
// x265 private namespace
-void Setup_C_DCTPrimitives(EncoderPrimitives& p)
+void setupDCTPrimitives_c(EncoderPrimitives& p)
{
p.dequant_scaling = dequant_scaling_c;
p.dequant_normal = dequant_normal_c;
p.quant = quant_c;
p.nquant = nquant_c;
- p.dct[DST_4x4] = dst4_c;
- p.dct[DCT_4x4] = dct4_c;
- p.dct[DCT_8x8] = dct8_c;
- p.dct[DCT_16x16] = dct16_c;
- p.dct[DCT_32x32] = dct32_c;
- p.idct[IDST_4x4] = idst4_c;
- p.idct[IDCT_4x4] = idct4_c;
- p.idct[IDCT_8x8] = idct8_c;
- p.idct[IDCT_16x16] = idct16_c;
- p.idct[IDCT_32x32] = idct32_c;
+ p.dst4x4 = dst4_c;
+ p.cu[BLOCK_4x4].dct = dct4_c;
+ p.cu[BLOCK_8x8].dct = dct8_c;
+ p.cu[BLOCK_16x16].dct = dct16_c;
+ p.cu[BLOCK_32x32].dct = dct32_c;
+ p.idst4x4 = idst4_c;
+ p.cu[BLOCK_4x4].idct = idct4_c;
+ p.cu[BLOCK_8x8].idct = idct8_c;
+ p.cu[BLOCK_16x16].idct = idct16_c;
+ p.cu[BLOCK_32x32].idct = idct32_c;
p.count_nonzero = count_nonzero_c;
p.denoiseDct = denoiseDct_c;
- p.copy_cnt[BLOCK_4x4] = copy_count<4>;
- p.copy_cnt[BLOCK_8x8] = copy_count<8>;
- p.copy_cnt[BLOCK_16x16] = copy_count<16>;
- p.copy_cnt[BLOCK_32x32] = copy_count<32>;
+ p.cu[BLOCK_4x4].copy_cnt = copy_count<4>;
+ p.cu[BLOCK_8x8].copy_cnt = copy_count<8>;
+ p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
+ p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
}
}
diff --git a/source/common/deblock.cpp b/source/common/deblock.cpp
index c9a2731..52c255e 100644
--- a/source/common/deblock.cpp
+++ b/source/common/deblock.cpp
@@ -33,136 +33,154 @@ using namespace x265;
#define DEBLOCK_SMALLEST_BLOCK 8
#define DEFAULT_INTRA_TC_OFFSET 2
-void Deblock::deblockCTU(CUData* cu, int32_t dir)
+void Deblock::deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir)
{
- uint8_t blockingStrength[MAX_NUM_PARTITIONS];
+ uint8_t blockStrength[MAX_NUM_PARTITIONS];
- memset(blockingStrength, 0, sizeof(uint8_t) * m_numPartitions);
+ memset(blockStrength, 0, sizeof(uint8_t) * cuGeom.numPartitions);
- deblockCU(cu, 0, 0, dir, blockingStrength);
+ deblockCU(ctu, cuGeom, dir, blockStrength);
+}
+
+static inline uint8_t bsCuEdge(const CUData* cu, uint32_t absPartIdx, int32_t dir)
+{
+ if (dir == Deblock::EDGE_VER)
+ {
+ if (cu->m_cuPelX + g_zscanToPelX[absPartIdx] > 0)
+ {
+ uint32_t tempPartIdx;
+ const CUData* tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
+ return tempCU ? 2 : 0;
+ }
+ }
+ else
+ {
+ if (cu->m_cuPelY + g_zscanToPelY[absPartIdx] > 0)
+ {
+ uint32_t tempPartIdx;
+ const CUData* tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
+ return tempCU ? 2 : 0;
+ }
+ }
+
+ return 0;
}
/* Deblocking filter process in CU-based (the same function as conventional's)
* param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */
-void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockingStrength[])
+void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[])
{
- if (cu->m_partSize[absPartIdx] == SIZE_NONE)
+ uint32_t absPartIdx = cuGeom.encodeIdx;
+ uint32_t depth = cuGeom.depth;
+ if (cu->m_predMode[absPartIdx] == MODE_NONE)
return;
- uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
-
- const SPS& sps = *cu->m_slice->m_sps;
-
if (cu->m_cuDepth[absPartIdx] > depth)
{
- uint32_t qNumParts = curNumParts >> 2;
- uint32_t xmax = sps.picWidthInLumaSamples - cu->m_cuPelX;
- uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY;
- for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
- if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
- deblockCU(cu, absPartIdx, depth + 1, dir, blockingStrength);
+ for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+ {
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
+ deblockCU(cu, childGeom, dir, blockStrength);
+ }
return;
}
- const uint32_t widthInBaseUnits = sps.numPartInCUSize >> depth;
- Param params;
- setLoopfilterParam(cu, absPartIdx, ¶ms);
- setEdgefilterPU(cu, absPartIdx, dir, blockingStrength, widthInBaseUnits);
- setEdgefilterTU(cu, absPartIdx, depth, dir, blockingStrength);
- setEdgefilterMultiple(cu, absPartIdx, dir, 0, (dir == EDGE_VER ? params.leftEdge : params.topEdge), blockingStrength, widthInBaseUnits);
+ uint32_t numUnits = 1 << (cuGeom.log2CUSize - LOG2_UNIT_SIZE);
+ setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits);
+ setEdgefilterTU(cu, absPartIdx, 0, dir, blockStrength);
+ setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
- for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++)
+ uint32_t numParts = cuGeom.numPartitions;
+ for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + numParts; partIdx++)
{
uint32_t bsCheck = !(partIdx & (1 << dir));
- if (bsCheck && blockingStrength[partIdx])
- getBoundaryStrengthSingle(cu, dir, partIdx, blockingStrength);
+ if (bsCheck && blockStrength[partIdx])
+ blockStrength[partIdx] = getBoundaryStrength(cu, dir, partIdx, blockStrength);
}
const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE;
- uint32_t sizeInPU = sps.numPartInCUSize >> depth;
uint32_t shiftFactor = (dir == EDGE_VER) ? cu->m_hChromaShift : cu->m_vChromaShift;
uint32_t chromaMask = ((DEBLOCK_SMALLEST_BLOCK << shiftFactor) >> LOG2_UNIT_SIZE) - 1;
uint32_t e0 = (dir == EDGE_VER ? g_zscanToPelX[absPartIdx] : g_zscanToPelY[absPartIdx]) >> LOG2_UNIT_SIZE;
- for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
+ for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
{
- edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockingStrength);
+ edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
if (!((e0 + e) & chromaMask))
- edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockingStrength);
+ edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
}
}
-static inline uint32_t calcBsIdx(CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
+static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
{
- uint32_t ctuWidthInBaseUnits = cu->m_slice->m_sps->numPartInCUSize;
+ uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize;
if (dir)
- return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * ctuWidthInBaseUnits + baseUnitIdx];
+ return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numUnits + baseUnitIdx];
else
- return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * ctuWidthInBaseUnits + edgeIdx];
+ return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numUnits + edgeIdx];
}
-void Deblock::setEdgefilterMultiple(CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
+void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
{
- const uint32_t numElem = widthInBaseUnits;
- X265_CHECK(numElem > 0, "numElem edge filter check\n");
- for (uint32_t i = 0; i < numElem; i++)
+ X265_CHECK(numUnits > 0, "numUnits edge filter check\n");
+ for (uint32_t i = 0; i < numUnits; i++)
{
const uint32_t bsidx = calcBsIdx(cu, scanIdx, dir, edgeIdx, i);
- blockingStrength[bsidx] = value;
+ blockStrength[bsidx] = value;
}
}
-void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[])
+void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[])
{
- if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth)
+ uint32_t log2TrSize = cu->m_log2CUSize[absPartIdx] - tuDepth;
+ if (cu->m_tuDepth[absPartIdx] > tuDepth)
{
- const uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
- const uint32_t qNumParts = curNumParts >> 2;
-
- for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
- setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockingStrength);
+ uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE - 1) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ setEdgefilterTU(cu, absPartIdx, tuDepth + 1, dir, blockStrength);
return;
}
- uint32_t widthInBaseUnits = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
- setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockingStrength, widthInBaseUnits);
+ uint32_t numUnits = 1 << (log2TrSize - LOG2_UNIT_SIZE);
+ setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits);
}
-void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
+void Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits)
{
- const uint32_t hWidthInBaseUnits = widthInBaseUnits >> 1;
- const uint32_t qWidthInBaseUnits = widthInBaseUnits >> 2;
+ const uint32_t hNumUnits = numUnits >> 1;
+ const uint32_t qNumUnits = numUnits >> 2;
switch (cu->m_partSize[absPartIdx])
{
case SIZE_2NxN:
if (EDGE_HOR == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_Nx2N:
if (EDGE_VER == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_NxN:
- setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_2NxnU:
if (EDGE_HOR == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_nLx2N:
if (EDGE_VER == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_2NxnD:
if (EDGE_HOR == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_nRx2N:
if (EDGE_VER == dir)
- setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
+ setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
break;
case SIZE_2Nx2N:
@@ -171,151 +189,65 @@ void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint
}
}
-void Deblock::setLoopfilterParam(CUData* cu, uint32_t absPartIdx, Param *params)
+uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[])
{
- uint32_t x = cu->m_cuPelX + g_zscanToPelX[absPartIdx];
- uint32_t y = cu->m_cuPelY + g_zscanToPelY[absPartIdx];
-
- const CUData* tempCU;
- uint32_t tempPartIdx;
+ // Calculate block index
+ uint32_t partP;
+ const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
- if (!x)
- params->leftEdge = 0;
- else
- {
- tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
- if (tempCU)
- params->leftEdge = 2;
- else
- params->leftEdge = 0;
- }
+ // Set BS for Intra MB : BS = 2
+ if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
+ return 2;
- if (!y)
- params->topEdge = 0;
- else
- {
- tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
- if (tempCU)
- params->topEdge = 2;
- else
- params->topEdge = 0;
- }
-}
+ // Set BS for not Intra MB : BS = 1 or 0
+ if (blockStrength[partQ] > 1 &&
+ (cuQ->getCbf(partQ, TEXT_LUMA, cuQ->m_tuDepth[partQ]) ||
+ cuP->getCbf(partP, TEXT_LUMA, cuP->m_tuDepth[partP])))
+ return 1;
-void Deblock::getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t absPartIdx, uint8_t blockingStrength[])
-{
- const Slice* const slice = cu->m_slice;
- const uint32_t partQ = absPartIdx;
- CUData* const cuQ = cu;
+ static const MV zeroMv(0, 0);
+ const Slice* const sliceQ = cuQ->m_slice;
+ const Slice* const sliceP = cuP->m_slice;
- uint32_t partP;
- const CUData* cuP;
- uint8_t bs = 0;
+ const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
+ const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
+ const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
+ const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
- // Calculate block index
- if (dir == EDGE_VER)
- cuP = cuQ->getPULeft(partP, partQ);
- else // (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
+ if (sliceQ->isInterP() && sliceP->isInterP())
+ {
+ return ((refP0 != refQ0) ||
+ (abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4)) ? 1 : 0;
+ }
- // Set BS for Intra MB : BS = 4 or 3
- if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
- bs = 2;
+ // (sliceQ->isInterB() || sliceP->isInterB())
+ const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
+ const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
+ const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
+ const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
- // Set BS for not Intra MB : BS = 2 or 1 or 0
- if (!cuP->isIntra(partP) && !cuQ->isIntra(partQ))
+ if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
{
- uint32_t nsPartQ = partQ;
- uint32_t nsPartP = partP;
-
- if (blockingStrength[absPartIdx] > 1 &&
- (cuQ->getCbf(nsPartQ, TEXT_LUMA, cuQ->m_tuDepth[nsPartQ]) ||
- cuP->getCbf(nsPartP, TEXT_LUMA, cuP->m_tuDepth[nsPartP])))
- bs = 1;
- else
+ if (refP0 != refP1) // Different L0 & L1
{
- if (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
-
- if (slice->isInterB() || cuP->m_slice->isInterB())
- {
- int32_t refIdx;
- Frame *refP0, *refP1, *refQ0, *refQ1;
- refIdx = cuP->m_refIdx[0][partP];
- refP0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
- refIdx = cuP->m_refIdx[1][partP];
- refP1 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[1][refIdx];
- refIdx = cuQ->m_refIdx[0][partQ];
- refQ0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
- refIdx = cuQ->m_refIdx[1][partQ];
- refQ1 = (refIdx < 0) ? NULL : slice->m_refPicList[1][refIdx];
-
- MV mvp0 = cuP->m_mv[0][partP];
- MV mvp1 = cuP->m_mv[1][partP];
- MV mvq0 = cuQ->m_mv[0][partQ];
- MV mvq1 = cuQ->m_mv[1][partQ];
-
- if (!refP0) mvp0 = 0;
- if (!refP1) mvp1 = 0;
- if (!refQ0) mvq0 = 0;
- if (!refQ1) mvq1 = 0;
-
- if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
- {
- if (refP0 != refP1) // Different L0 & L1
- {
- if (refP0 == refQ0)
- {
- bs = ((abs(mvq0.x - mvp0.x) >= 4) ||
- (abs(mvq0.y - mvp0.y) >= 4) ||
- (abs(mvq1.x - mvp1.x) >= 4) ||
- (abs(mvq1.y - mvp1.y) >= 4)) ? 1 : 0;
- }
- else
- {
- bs = ((abs(mvq1.x - mvp0.x) >= 4) ||
- (abs(mvq1.y - mvp0.y) >= 4) ||
- (abs(mvq0.x - mvp1.x) >= 4) ||
- (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
- }
- }
- else // Same L0 & L1
- {
- bs = ((abs(mvq0.x - mvp0.x) >= 4) ||
- (abs(mvq0.y - mvp0.y) >= 4) ||
- (abs(mvq1.x - mvp1.x) >= 4) ||
- (abs(mvq1.y - mvp1.y) >= 4)) &&
- ((abs(mvq1.x - mvp0.x) >= 4) ||
- (abs(mvq1.y - mvp0.y) >= 4) ||
- (abs(mvq0.x - mvp1.x) >= 4) ||
- (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
- }
- }
- else // for all different Ref_Idx
- bs = 1;
- }
- else // slice->isInterP()
- {
- int32_t refIdx;
- Frame *refp0, *refq0;
- refIdx = cuP->m_refIdx[0][partP];
- refp0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
- refIdx = cuQ->m_refIdx[0][partQ];
- refq0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
- MV mvp0 = cuP->m_mv[0][partP];
- MV mvq0 = cuQ->m_mv[0][partQ];
-
- if (!refp0) mvp0 = 0;
- if (!refq0) mvq0 = 0;
-
- bs = ((refp0 != refq0) ||
- (abs(mvq0.x - mvp0.x) >= 4) ||
- (abs(mvq0.y - mvp0.y) >= 4)) ? 1 : 0;
- }
+ if (refP0 == refQ0)
+ return ((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) ||
+ (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) ? 1 : 0;
+ else
+ return ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) ||
+ (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4)) ? 1 : 0;
+ }
+ else // Same L0 & L1
+ {
+ return (((abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4) ||
+ (abs(mvQ1.x - mvP1.x) >= 4) || (abs(mvQ1.y - mvP1.y) >= 4)) &&
+ ((abs(mvQ1.x - mvP0.x) >= 4) || (abs(mvQ1.y - mvP0.y) >= 4) ||
+ (abs(mvQ0.x - mvP1.x) >= 4) || (abs(mvQ0.y - mvP1.y) >= 4))) ? 1 : 0;
}
}
-
- blockingStrength[absPartIdx] = bs;
+
+ // for all different Ref_Idx
+ return 1;
}
static inline int32_t calcDP(pixel* src, intptr_t offset)
@@ -340,46 +272,45 @@ static inline bool useStrongFiltering(intptr_t offset, int32_t beta, int32_t tc,
}
/* Deblocking for the luminance component with strong or weak filter
- * \param src pointer to picture data
- * \param offset offset value for picture data
- * \param tc tc value
- * \param partPNoFilter indicator to disable filtering on partP
- * \param partQNoFilter indicator to disable filtering on partQ
- * \param filterSecondP decision weak filter/no filter for partP
- * \param filterSecondQ decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
+ * \param src pointer to picture data
+ * \param offset offset value for picture data
+ * \param tc tc value
+ * \param maskP indicator to enable filtering on partP
+ * \param maskQ indicator to enable filtering on partQ
+ * \param maskP1 decision weak filter/no filter for partP
+ * \param maskQ1 decision weak filter/no filter for partQ */
+static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
{
+ int32_t tc2 = 2 * tc;
+ int32_t tcP = (tc2 & maskP);
+ int32_t tcQ = (tc2 & maskQ);
for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
{
int16_t m4 = (int16_t)src[0];
int16_t m3 = (int16_t)src[-offset];
int16_t m5 = (int16_t)src[offset];
int16_t m2 = (int16_t)src[-offset * 2];
- int32_t tc2 = 2 * tc;
- if (!partPNoFilter)
- {
- int16_t m1 = (int16_t)src[-offset * 3];
- int16_t m0 = (int16_t)src[-offset * 4];
- src[-offset * 3] = (pixel)(Clip3(-tc2, tc2, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
- src[-offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
- src[-offset] = (pixel)(Clip3(-tc2, tc2, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
- }
- if (!partQNoFilter)
- {
- int16_t m6 = (int16_t)src[offset * 2];
- int16_t m7 = (int16_t)src[offset * 3];
- src[0] = (pixel)(Clip3(-tc2, tc2, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
- src[offset] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
- src[offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
- }
+ int16_t m6 = (int16_t)src[offset * 2];
+ int16_t m1 = (int16_t)src[-offset * 3];
+ int16_t m7 = (int16_t)src[offset * 3];
+ int16_t m0 = (int16_t)src[-offset * 4];
+ src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+ src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
+ src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+ src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+ src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+ src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
}
}
/* Weak filter */
-static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter,
- bool filterSecondP, bool filterSecondQ)
+static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
+ int32_t maskP1, int32_t maskQ1)
{
int32_t thrCut = tc * 10;
+ int32_t tc2 = tc >> 1;
+ maskP1 &= maskP;
+ maskQ1 &= maskQ;
for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
{
@@ -392,40 +323,33 @@ static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset,
if (abs(delta) < thrCut)
{
- delta = Clip3(-tc, tc, delta);
+ delta = x265_clip3(-tc, tc, delta);
- int32_t tc2 = tc >> 1;
- if (!partPNoFilter)
+ src[-offset] = x265_clip(m3 + (delta & maskP));
+ src[0] = x265_clip(m4 - (delta & maskQ));
+ if (maskP1)
{
- src[-offset] = Clip(m3 + delta);
- if (filterSecondP)
- {
- int16_t m1 = (int16_t)src[-offset * 3];
- int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
- src[-offset * 2] = Clip(m2 + delta1);
- }
+ int16_t m1 = (int16_t)src[-offset * 3];
+ int32_t delta1 = x265_clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
+ src[-offset * 2] = x265_clip(m2 + delta1);
}
- if (!partQNoFilter)
+ if (maskQ1)
{
- src[0] = Clip(m4 - delta);
- if (filterSecondQ)
- {
- int16_t m6 = (int16_t)src[offset * 2];
- int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
- src[offset] = Clip(m5 + delta2);
- }
+ int16_t m6 = (int16_t)src[offset * 2];
+ int32_t delta2 = x265_clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
+ src[offset] = x265_clip(m5 + delta2);
}
}
}
}
/* Deblocking of one line/column for the chrominance component
- * \param src pointer to picture data
- * \param offset offset value for picture data
- * \param tc tc value
- * \param partPNoFilter indicator to disable filtering on partP
- * \param partQNoFilter indicator to disable filtering on partQ */
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
+ * \param src pointer to picture data
+ * \param offset offset value for picture data
+ * \param tc tc value
+ * \param maskP indicator to disable filtering on partP
+ * \param maskQ indicator to disable filtering on partQ */
+static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
{
for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
{
@@ -434,32 +358,26 @@ static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset
int16_t m5 = (int16_t)src[offset];
int16_t m2 = (int16_t)src[-offset * 2];
- int32_t delta = Clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
- if (!partPNoFilter)
- src[-offset] = Clip(m3 + delta);
- if (!partQNoFilter)
- src[0] = Clip(m4 - delta);
+ int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
+ src[-offset] = x265_clip(m3 + (delta & maskP));
+ src[0] = x265_clip(m4 - (delta & maskQ));
}
}
-void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
+void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
{
- PicYuv* reconYuv = cu->m_encData->m_reconPicYuv;
- pixel* src = reconYuv->getLumaAddr(cu->m_cuAddr, absPartIdx);
-
- intptr_t stride = reconYuv->m_stride;
- uint32_t numParts = cu->m_slice->m_sps->numPartInCUSize >> depth;
+ PicYuv* reconPic = cuQ->m_encData->m_reconPic;
+ pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
+ intptr_t stride = reconPic->m_stride;
+ const PPS* pps = cuQ->m_slice->m_pps;
intptr_t offset, srcStep;
- bool partPNoFilter = false;
- bool partQNoFilter = false;
- uint32_t partP = 0;
- uint32_t partQ = 0;
- const CUData* cuP = cu;
- const CUData* cuQ = cu;
- int32_t betaOffset = cuQ->m_slice->m_pps->deblockingFilterBetaOffsetDiv2 << 1;
- int32_t tcOffset = cuQ->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;
+ int32_t maskP = -1;
+ int32_t maskQ = -1;
+ int32_t betaOffset = pps->deblockingFilterBetaOffsetDiv2 << 1;
+ int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1;
+ bool bCheckNoFilter = pps->bTransquantBypassEnabled;
if (dir == EDGE_VER)
{
@@ -474,106 +392,104 @@ void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, in
src += (edge << LOG2_UNIT_SIZE) * stride;
}
- for (uint32_t idx = 0; idx < numParts; idx++)
+ uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> depth;
+ for (uint32_t idx = 0; idx < numUnits; idx++)
{
- uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
- uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx);
- uint32_t bs = blockingStrength[bsAbsIdx];
- if (bs)
+ uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx);
+ uint32_t bs = blockStrength[partQ];
+
+ if (!bs)
+ continue;
+
+ // Derive neighboring PU index
+ uint32_t partP;
+ const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
+
+ if (bCheckNoFilter)
{
- int32_t qpQ = cu->m_qp[bsAbsIdx];
- partQ = bsAbsIdx;
+ // check if each of PUs is lossless coded
+ maskP = cuP->m_tqBypass[partP] - 1;
+ maskQ = cuQ->m_tqBypass[partQ] - 1;
+ if (!(maskP | maskQ))
+ continue;
+ }
- // Derive neighboring PU index
- if (dir == EDGE_VER)
- cuP = cuQ->getPULeft(partP, partQ);
- else // (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
+ int32_t qpQ = cuQ->m_qp[partQ];
+ int32_t qpP = cuP->m_qp[partP];
+ int32_t qp = (qpP + qpQ + 1) >> 1;
- int32_t qpP = cuP->m_qp[partP];
- int32_t qp = (qpP + qpQ + 1) >> 1;
+ int32_t indexB = x265_clip3(0, QP_MAX_SPEC, qp + betaOffset);
- int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
+ const int32_t bitdepthShift = X265_DEPTH - 8;
+ int32_t beta = s_betaTable[indexB] << bitdepthShift;
- const int32_t bitdepthShift = X265_DEPTH - 8;
- int32_t beta = s_betaTable[indexB] << bitdepthShift;
+ intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE;
+ int32_t dp0 = calcDP(src + unitOffset , offset);
+ int32_t dq0 = calcDQ(src + unitOffset , offset);
+ int32_t dp3 = calcDP(src + unitOffset + srcStep * 3, offset);
+ int32_t dq3 = calcDQ(src + unitOffset + srcStep * 3, offset);
+ int32_t d0 = dp0 + dq0;
+ int32_t d3 = dp3 + dq3;
- int32_t dp0 = calcDP(src + srcStep * (unitOffset + 0), offset);
- int32_t dq0 = calcDQ(src + srcStep * (unitOffset + 0), offset);
- int32_t dp3 = calcDP(src + srcStep * (unitOffset + 3), offset);
- int32_t dq3 = calcDQ(src + srcStep * (unitOffset + 3), offset);
- int32_t d0 = dp0 + dq0;
- int32_t d3 = dp3 + dq3;
+ int32_t d = d0 + d3;
- int32_t d = d0 + d3;
+ if (d >= beta)
+ continue;
- if (d < beta)
- {
- if (cu->m_slice->m_pps->bTransquantBypassEnabled)
- {
- // check if each of PUs is lossless coded
- partPNoFilter = !!cuP->m_tqBypass[partP];
- partQNoFilter = !!cuQ->m_tqBypass[partQ];
- }
-
- int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
- int32_t tc = s_tcTable[indexTC] << bitdepthShift;
-
- bool sw = (2 * d0 < (beta >> 2) &&
- 2 * d3 < (beta >> 2) &&
- useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 0)) &&
- useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 3)));
-
- if (sw)
- pelFilterLumaStrong(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
- else
- {
- int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
- int32_t dp = dp0 + dp3;
- int32_t dq = dq0 + dq3;
- bool filterP = (dp < sideThreshold);
- bool filterQ = (dq < sideThreshold);
-
- pelFilterLuma(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter, filterP, filterQ);
- }
- }
+ int32_t indexTC = x265_clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
+ int32_t tc = s_tcTable[indexTC] << bitdepthShift;
+
+ bool sw = (2 * d0 < (beta >> 2) &&
+ 2 * d3 < (beta >> 2) &&
+ useStrongFiltering(offset, beta, tc, src + unitOffset ) &&
+ useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
+
+ if (sw)
+ pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+ else
+ {
+ int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
+ int32_t dp = dp0 + dp3;
+ int32_t dq = dq0 + dq3;
+ int32_t maskP1 = (dp < sideThreshold ? -1 : 0);
+ int32_t maskQ1 = (dq < sideThreshold ? -1 : 0);
+
+ pelFilterLuma(src + unitOffset, srcStep, offset, tc, maskP, maskQ, maskP1, maskQ1);
}
}
}
-void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
+void Deblock::edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
{
- int32_t chFmt = cu->m_chromaFormat, chromaShift;
+ int32_t chFmt = cuQ->m_chromaFormat, chromaShift;
intptr_t offset, srcStep;
+ const PPS* pps = cuQ->m_slice->m_pps;
- bool partPNoFilter = false;
- bool partQNoFilter = false;
- uint32_t partP;
- uint32_t partQ;
- const CUData* cuP;
- const CUData* cuQ = cu;
- int32_t tcOffset = cu->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;
+ int32_t maskP = -1;
+ int32_t maskQ = -1;
+ int32_t tcOffset = pps->deblockingFilterTcOffsetDiv2 << 1;
X265_CHECK(((dir == EDGE_VER)
- ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cu->m_hChromaShift)
- : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cu->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
+ ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_hChromaShift)
+ : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
"invalid edge\n");
- PicYuv* reconPic = cu->m_encData->m_reconPicYuv;
+ PicYuv* reconPic = cuQ->m_encData->m_reconPic;
intptr_t stride = reconPic->m_strideC;
- intptr_t srcOffset = reconPic->getChromaAddrOffset(cu->m_cuAddr, absPartIdx);
+ intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
+ bool bCheckNoFilter = pps->bTransquantBypassEnabled;
if (dir == EDGE_VER)
{
- chromaShift = cu->m_vChromaShift;
- srcOffset += (edge << (LOG2_UNIT_SIZE - cu->m_hChromaShift));
+ chromaShift = cuQ->m_vChromaShift;
+ srcOffset += (edge << (LOG2_UNIT_SIZE - cuQ->m_hChromaShift));
offset = 1;
srcStep = stride;
}
else // (dir == EDGE_HOR)
{
- chromaShift = cu->m_hChromaShift;
- srcOffset += edge * stride << (LOG2_UNIT_SIZE - cu->m_vChromaShift);
+ chromaShift = cuQ->m_hChromaShift;
+ srcOffset += edge * stride << (LOG2_UNIT_SIZE - cuQ->m_vChromaShift);
offset = stride;
srcStep = 1;
}
@@ -582,53 +498,45 @@ void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth,
srcChroma[0] = reconPic->m_picOrg[1] + srcOffset;
srcChroma[1] = reconPic->m_picOrg[2] + srcOffset;
- uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
-
+ uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
for (uint32_t idx = 0; idx < numUnits; idx++)
{
- uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
- uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx << chromaShift);
- uint32_t bs = blockingStrength[bsAbsIdx];
+ uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift);
+ uint32_t bs = blockStrength[partQ];
- if (bs > 1)
+ if (bs <= 1)
+ continue;
+
+ // Derive neighboring PU index
+ uint32_t partP;
+ const CUData* cuP = (dir == EDGE_VER ? cuQ->getPULeft(partP, partQ) : cuQ->getPUAbove(partP, partQ));
+
+ if (bCheckNoFilter)
{
- int32_t qpQ = cu->m_qp[bsAbsIdx];
- partQ = bsAbsIdx;
+ // check if each of PUs is lossless coded
+ maskP = (cuP->m_tqBypass[partP] ? 0 : -1);
+ maskQ = (cuQ->m_tqBypass[partQ] ? 0 : -1);
+ if (!(maskP | maskQ))
+ continue;
+ }
- // Derive neighboring PU index
- if (dir == EDGE_VER)
- cuP = cuQ->getPULeft(partP, partQ);
- else // (dir == EDGE_HOR)
- cuP = cuQ->getPUAbove(partP, partQ);
+ int32_t qpQ = cuQ->m_qp[partQ];
+ int32_t qpP = cuP->m_qp[partP];
+ int32_t qpA = (qpP + qpQ + 1) >> 1;
- int32_t qpP = cuP->m_qp[partP];
+ intptr_t unitOffset = idx * srcStep << LOG2_UNIT_SIZE;
+ for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
+ {
+ int32_t qp = qpA + pps->chromaQpOffset[chromaIdx];
+ if (qp >= 30)
+ qp = chFmt == X265_CSP_I420 ? g_chromaScale[qp] : X265_MIN(qp, QP_MAX_SPEC);
- if (cu->m_slice->m_pps->bTransquantBypassEnabled)
- {
- // check if each of PUs is lossless coded
- partPNoFilter = !!cuP->m_tqBypass[partP];
- partQNoFilter = !!cuQ->m_tqBypass[partQ];
- }
+ int32_t indexTC = x265_clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
+ const int32_t bitdepthShift = X265_DEPTH - 8;
+ int32_t tc = s_tcTable[indexTC] << bitdepthShift;
+ pixel* srcC = srcChroma[chromaIdx];
- for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
- {
- int32_t chromaQPOffset = !chromaIdx ? cu->m_slice->m_pps->chromaCbQpOffset : cu->m_slice->m_pps->chromaCrQpOffset;
- int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset;
- if (qp >= 30)
- {
- if (chFmt == X265_CSP_I420)
- qp = g_chromaScale[qp];
- else
- qp = X265_MIN(qp, 51);
- }
-
- int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
- const int32_t bitdepthShift = X265_DEPTH - 8;
- int32_t tc = s_tcTable[indexTC] << bitdepthShift;
- pixel* srcC = srcChroma[chromaIdx];
-
- pelFilterChroma(srcC + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
- }
+ pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
}
}
}
diff --git a/source/common/deblock.h b/source/common/deblock.h
index 4bdfeff..dacebfd 100644
--- a/source/common/deblock.h
+++ b/source/common/deblock.h
@@ -30,43 +30,31 @@ namespace x265 {
// private namespace
class CUData;
+struct CUGeom;
class Deblock
{
public:
enum { EDGE_VER, EDGE_HOR };
- uint32_t m_numPartitions;
-
- Deblock() : m_numPartitions(0) {}
-
- void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); }
-
- void deblockCTU(CUData* cu, int32_t dir);
+ void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
protected:
// CU-level deblocking function
- void deblockCU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, const int32_t Edge, uint8_t blockingStrength[]);
-
- struct Param
- {
- uint8_t leftEdge;
- uint8_t topEdge;
- };
+ void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
// set filtering functions
- void setLoopfilterParam(CUData* cu, uint32_t absZOrderIdx, Param *params);
- void setEdgefilterTU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[]);
- void setEdgefilterPU(CUData* cu, uint32_t absZOrderIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits);
- void setEdgefilterMultiple(CUData* cu, uint32_t absZOrderIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits);
+ void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
+ void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
+ void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
// get filtering functions
- void getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t partIdx, uint8_t blockingStrength[]);
+ uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
// filter luma/chroma functions
- void edgeFilterLuma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]);
- void edgeFilterChroma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]);
+ void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+ void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
static const uint8_t s_tcTable[54];
static const uint8_t s_betaTable[52];
diff --git a/source/common/frame.cpp b/source/common/frame.cpp
index 8ae912f..ca62e31 100644
--- a/source/common/frame.cpp
+++ b/source/common/frame.cpp
@@ -34,7 +34,7 @@ Frame::Frame()
m_reconRowCount.set(0);
m_countRefEncoders = 0;
m_encData = NULL;
- m_reconPicYuv = NULL;
+ m_reconPic = NULL;
m_next = NULL;
m_prev = NULL;
memset(&m_lowres, 0, sizeof(m_lowres));
@@ -42,26 +42,26 @@ Frame::Frame()
bool Frame::create(x265_param *param)
{
- m_origPicYuv = new PicYuv;
+ m_fencPic = new PicYuv;
- return m_origPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
- m_lowres.create(m_origPicYuv, param->bframes, !!param->rc.aqMode);
+ return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
+ m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
}
bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
{
m_encData = new FrameData;
- m_reconPicYuv = new PicYuv;
- m_encData->m_reconPicYuv = m_reconPicYuv;
- bool ok = m_encData->create(param, sps) && m_reconPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+ m_reconPic = new PicYuv;
+ m_encData->m_reconPic = m_reconPic;
+ bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
if (ok)
{
/* initialize right border of m_reconpicYuv as SAO may read beyond the
* end of the picture accessing uninitialized pixels */
int maxHeight = sps.numCuInHeight * g_maxCUSize;
- memset(m_reconPicYuv->m_picOrg[0], 0, m_reconPicYuv->m_stride * maxHeight);
- memset(m_reconPicYuv->m_picOrg[1], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift));
- memset(m_reconPicYuv->m_picOrg[2], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift));
+ memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel) * m_reconPic->m_stride * maxHeight);
+ memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+ memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
}
return ok;
}
@@ -70,7 +70,7 @@ bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
void Frame::reinit(const SPS& sps)
{
m_bChromaExtended = false;
- m_reconPicYuv = m_encData->m_reconPicYuv;
+ m_reconPic = m_encData->m_reconPic;
m_encData->reinit(sps);
}
@@ -83,18 +83,18 @@ void Frame::destroy()
m_encData = NULL;
}
- if (m_origPicYuv)
+ if (m_fencPic)
{
- m_origPicYuv->destroy();
- delete m_origPicYuv;
- m_origPicYuv = NULL;
+ m_fencPic->destroy();
+ delete m_fencPic;
+ m_fencPic = NULL;
}
- if (m_reconPicYuv)
+ if (m_reconPic)
{
- m_reconPicYuv->destroy();
- delete m_reconPicYuv;
- m_reconPicYuv = NULL;
+ m_reconPic->destroy();
+ delete m_reconPic;
+ m_reconPic = NULL;
}
m_lowres.destroy();
diff --git a/source/common/frame.h b/source/common/frame.h
index 0fae62a..d023946 100644
--- a/source/common/frame.h
+++ b/source/common/frame.h
@@ -43,30 +43,29 @@ public:
/* These two items will be NULL until the Frame begins to be encoded, at which point
* it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
- FrameData* m_encData;
- PicYuv* m_reconPicYuv;
+ FrameData* m_encData;
+ PicYuv* m_reconPic;
/* Data associated with x265_picture */
- PicYuv* m_origPicYuv;
- int m_poc;
- int64_t m_pts; // user provided presentation time stamp
- int64_t m_reorderedPts;
- int64_t m_dts;
- int32_t m_forceqp; // Force to use the qp specified in qp file
- x265_intra_data* m_intraData;
- x265_inter_data* m_interData;
- void* m_userData; // user provided pointer passed in with this picture
+ PicYuv* m_fencPic;
+ int m_poc;
+ int64_t m_pts; // user provided presentation time stamp
+ int64_t m_reorderedPts;
+ int64_t m_dts;
+ int32_t m_forceqp; // Force to use the qp specified in qp file
+ void* m_userData; // user provided pointer passed in with this picture
- Lowres m_lowres;
- bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis
+ Lowres m_lowres;
+ bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis
/* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
- ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference
- volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount
+ ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference
+ volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount
- Frame* m_next; // PicList doubly linked list pointers
- Frame* m_prev;
+ Frame* m_next; // PicList doubly linked list pointers
+ Frame* m_prev;
+ x265_analysis_data m_analysisData;
Frame();
bool create(x265_param *param);
diff --git a/source/common/framedata.h b/source/common/framedata.h
index f6ea9d4..92754ce 100644
--- a/source/common/framedata.h
+++ b/source/common/framedata.h
@@ -49,7 +49,7 @@ public:
x265_param* m_param;
FrameData* m_freeListNext;
- PicYuv* m_reconPicYuv;
+ PicYuv* m_reconPic;
bool m_bHasReferences; /* used during DPB/RPS updates */
int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */
diff --git a/source/common/intrapred.cpp b/source/common/intrapred.cpp
index f43ec19..346bd13 100644
--- a/source/common/intrapred.cpp
+++ b/source/common/intrapred.cpp
@@ -27,35 +27,13 @@
using namespace x265;
namespace {
-pixel dcPredValue(pixel* above, pixel* left, intptr_t width)
-{
- int w, sum = 0;
- pixel pDcVal;
-
- for (w = 0; w < width; w++)
- {
- sum += above[w];
- }
-
- for (w = 0; w < width; w++)
- {
- sum += left[w];
- }
-
- pDcVal = (pixel)((sum + width) / (width + width));
-
- return pDcVal;
-}
-
-void dcPredFilter(pixel* above, pixel* left, pixel* dst, intptr_t dststride, int size)
+void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
{
// boundary pixels processing
dst[0] = (pixel)((above[0] + left[0] + 2 * dst[0] + 2) >> 2);
for (int x = 1; x < size; x++)
- {
dst[x] = (pixel)((above[x] + 3 * dst[x] + 2) >> 2);
- }
dst += dststride;
for (int y = 1; y < size; y++)
@@ -66,195 +44,152 @@ void dcPredFilter(pixel* above, pixel* left, pixel* dst, intptr_t dststride, int
}
template<int width>
-void intra_pred_dc_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int bFilter)
+void intra_pred_dc_c(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int bFilter)
{
int k, l;
- pixel dcval = dcPredValue(above + 1, left + 1, width);
+ int dcVal = width;
+ for (int i = 0; i < width; i++)
+ dcVal += srcPix[1 + i] + srcPix[2 * width + 1 + i];
+ dcVal = dcVal / (width + width);
for (k = 0; k < width; k++)
- {
for (l = 0; l < width; l++)
- {
- dst[k * dstStride + l] = dcval;
- }
- }
+ dst[k * dstStride + l] = (pixel)dcVal;
if (bFilter)
- {
- dcPredFilter(above + 1, left + 1, dst, dstStride, width);
- }
+ dcPredFilter(srcPix + 1, srcPix + (2 * width + 1), dst, dstStride, width);
}
template<int log2Size>
-void planar_pred_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int /*bFilter*/)
+void planar_pred_c(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
{
- above += 1;
- left += 1;
- int k, l;
- pixel bottomLeft, topRight;
- int horPred;
- int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
- // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)
- int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
const int blkSize = 1 << log2Size;
- const int offset2D = blkSize;
- const int shift1D = log2Size;
- const int shift2D = shift1D + 1;
- // Get left and above reference column and row
- for (k = 0; k < blkSize + 1; k++)
- {
- topRow[k] = above[k];
- leftColumn[k] = left[k];
- }
+ const pixel* above = srcPix + 1;
+ const pixel* left = srcPix + (2 * blkSize + 1);
- // Prepare intermediate variables used in interpolation
- bottomLeft = (pixel)leftColumn[blkSize];
- topRight = (pixel)topRow[blkSize];
- for (k = 0; k < blkSize; k++)
- {
- bottomRow[k] = (int16_t)(bottomLeft - topRow[k]);
- rightColumn[k] = (int16_t)(topRight - leftColumn[k]);
- topRow[k] <<= shift1D;
- leftColumn[k] <<= shift1D;
- }
+ pixel topRight = above[blkSize];
+ pixel bottomLeft = left[blkSize];
+ for (int y = 0; y < blkSize; y++)
+ for (int x = 0; x < blkSize; x++)
+ dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
+}
+
+template<int width>
+void intra_pred_ang_c(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
+{
+ int width2 = width << 1;
+ // Flip the neighbours in the horizontal case.
+ int horMode = dirMode < 18;
+ pixel neighbourBuf[129];
+ const pixel *srcPix = srcPix0;
- // Generate prediction signal
- for (k = 0; k < blkSize; k++)
+ if (horMode)
{
- horPred = leftColumn[k] + offset2D;
- for (l = 0; l < blkSize; l++)
+ neighbourBuf[0] = srcPix[0];
+ for (int i = 0; i < width << 1; i++)
{
- horPred += rightColumn[k];
- topRow[l] += bottomRow[l];
- dst[k * dstStride + l] = (pixel)((horPred + topRow[l]) >> shift2D);
+ neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
+ neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
}
+ srcPix = neighbourBuf;
}
-}
-
-template<int width>
-void intra_pred_ang_c(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-{
- // Map the mode index to main prediction direction and angle
- int k, l;
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
- // Set bitshifts and scale the angle parameter to block size
- static const int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- static const int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
+ // Intra prediction angle and inverse angle tables.
+ const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
+ // Get the prediction angle.
+ int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
+ int angle = angleTable[8 + angleOffset];
- // Do angular predictions
+ // Vertical Prediction.
+ if (!angle)
{
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (width - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (width - 1);
+ for (int y = 0; y < width; y++)
+ for (int x = 0; x < width; x++)
+ dst[y * dstStride + x] = srcPix[1 + x];
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (k = -1; k > width * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
+ if (bFilter)
{
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
+ int topLeft = srcPix[0], top = srcPix[1];
+ for (int y = 0; y < width; y++)
+ dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
}
+ }
+ else // Angular prediction.
+ {
+ // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
+ pixel refBuf[64];
+ const pixel *ref;
- if (intraPredAngle == 0)
+ // Use the projected left neighbours and the top neighbours.
+ if (angle < 0)
{
- for (k = 0; k < width; k++)
+ // Number of neighbours projected.
+ int nbProjected = -((width * angle) >> 5) - 1;
+ pixel *ref_pix = refBuf + nbProjected + 1;
+
+ // Project the neighbours.
+ int invAngle = invAngleTable[- angleOffset - 1];
+ int invAngleSum = 128;
+ for (int i = 0; i < nbProjected; i++)
{
- for (l = 0; l < width; l++)
- {
- dst[k * dstStride + l] = refMain[l + 1];
- }
+ invAngleSum += invAngle;
+ ref_pix[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)];
}
- if (bFilter)
- {
- for (k = 0; k < width; k++)
- {
- dst[k * dstStride] = (pixel)Clip3((int16_t)0, (int16_t)((1 << X265_DEPTH) - 1), static_cast<int16_t>((dst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
- }
- }
+ // Copy the top-left and top pixels.
+ for (int i = 0; i < width + 1; i++)
+ ref_pix[-1 + i] = srcPix[i];
+ ref = ref_pix;
}
- else
- {
- int deltaPos = 0;
- int deltaInt;
- int deltaFract;
- int refMainIndex;
-
- for (k = 0; k < width; k++)
- {
- deltaPos += intraPredAngle;
- deltaInt = deltaPos >> 5;
- deltaFract = deltaPos & (32 - 1);
+ else // Use the top and top-right neighbours.
+ ref = srcPix + 1;
- if (deltaFract)
- {
- // Do linear filtering
- for (l = 0; l < width; l++)
- {
- refMainIndex = l + deltaInt + 1;
- dst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
- }
- }
- else
- {
- // Just copy the integer samples
- for (l = 0; l < width; l++)
- {
- dst[k * dstStride + l] = refMain[l + deltaInt + 1];
- }
- }
- }
+ // Pass every row.
+ int angleSum = 0;
+ for (int y = 0; y < width; y++)
+ {
+ angleSum += angle;
+ int offset = angleSum >> 5;
+ int fraction = angleSum & 31;
+
+ if (fraction) // Interpolate
+ for (int x = 0; x < width; x++)
+ dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
+ else // Copy.
+ for (int x = 0; x < width; x++)
+ dst[y * dstStride + x] = ref[offset + x];
}
+ }
- // Flip the block if this is the horizontal mode
- if (modeHor)
+ // Flip for horizontal.
+ if (horMode)
+ {
+ for (int y = 0; y < width - 1; y++)
{
- for (k = 0; k < width - 1; k++)
+ for (int x = y + 1; x < width; x++)
{
- for (l = k + 1; l < width; l++)
- {
- pixel tmp = dst[k * dstStride + l];
- dst[k * dstStride + l] = dst[l * dstStride + k];
- dst[l * dstStride + k] = tmp;
- }
+ pixel tmp = dst[y * dstStride + x];
+ dst[y * dstStride + x] = dst[x * dstStride + y];
+ dst[x * dstStride + y] = tmp;
}
}
}
}
template<int log2Size>
-void all_angs_pred_c(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
+void all_angs_pred_c(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
{
const int size = 1 << log2Size;
for (int mode = 2; mode <= 34; mode++)
{
- pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0);
- pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0);
+ pixel *srcPix = (g_intraFilterFlags[mode] & size ? filtPix : refPix);
pixel *out = dest + ((mode - 2) << (log2Size * 2));
- intra_pred_ang_c<size>(out, size, left, above, mode, bLuma);
+ intra_pred_ang_c<size>(out, size, srcPix, mode, bLuma);
// Optimize code don't flip buffer
bool modeHor = (mode < 18);
@@ -279,29 +214,29 @@ void all_angs_pred_c(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pi
namespace x265 {
// x265 private namespace
-void Setup_C_IPredPrimitives(EncoderPrimitives& p)
+void setupIntraPrimitives_c(EncoderPrimitives& p)
{
- p.intra_pred[0][BLOCK_4x4] = planar_pred_c<2>;
- p.intra_pred[0][BLOCK_8x8] = planar_pred_c<3>;
- p.intra_pred[0][BLOCK_16x16] = planar_pred_c<4>;
- p.intra_pred[0][BLOCK_32x32] = planar_pred_c<5>;
-
- // Intra Prediction DC
- p.intra_pred[1][BLOCK_4x4] = intra_pred_dc_c<4>;
- p.intra_pred[1][BLOCK_8x8] = intra_pred_dc_c<8>;
- p.intra_pred[1][BLOCK_16x16] = intra_pred_dc_c<16>;
- p.intra_pred[1][BLOCK_32x32] = intra_pred_dc_c<32>;
+ p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_c<2>;
+ p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_c<3>;
+ p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_c<4>;
+ p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = planar_pred_c<5>;
+
+ p.cu[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_c<4>;
+ p.cu[BLOCK_8x8].intra_pred[DC_IDX] = intra_pred_dc_c<8>;
+ p.cu[BLOCK_16x16].intra_pred[DC_IDX] = intra_pred_dc_c<16>;
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = intra_pred_dc_c<32>;
+
for (int i = 2; i < NUM_INTRA_MODE; i++)
{
- p.intra_pred[i][BLOCK_4x4] = intra_pred_ang_c<4>;
- p.intra_pred[i][BLOCK_8x8] = intra_pred_ang_c<8>;
- p.intra_pred[i][BLOCK_16x16] = intra_pred_ang_c<16>;
- p.intra_pred[i][BLOCK_32x32] = intra_pred_ang_c<32>;
+ p.cu[BLOCK_4x4].intra_pred[i] = intra_pred_ang_c<4>;
+ p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_c<8>;
+ p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_c<16>;
+ p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_c<32>;
}
- p.intra_pred_allangs[BLOCK_4x4] = all_angs_pred_c<2>;
- p.intra_pred_allangs[BLOCK_8x8] = all_angs_pred_c<3>;
- p.intra_pred_allangs[BLOCK_16x16] = all_angs_pred_c<4>;
- p.intra_pred_allangs[BLOCK_32x32] = all_angs_pred_c<5>;
+ p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_c<2>;
+ p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_c<3>;
+ p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_c<4>;
+ p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_c<5>;
}
}
diff --git a/source/common/ipfilter.cpp b/source/common/ipfilter.cpp
index 4982cba..b95be9d 100644
--- a/source/common/ipfilter.cpp
+++ b/source/common/ipfilter.cpp
@@ -35,7 +35,7 @@ using namespace x265;
namespace {
template<int dstStride>
-void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
{
int shift = IF_INTERNAL_PREC - X265_DEPTH;
int row, col;
@@ -74,9 +74,9 @@ void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, in
}
template<int N, int width, int height>
-void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
- int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_FILTER_PREC;
int offset = (1 << (headRoom - 1));
uint16_t maxVal = (1 << X265_DEPTH) - 1;
@@ -115,9 +115,9 @@ void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstS
}
template<int N, int width, int height>
-void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
{
- int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC - headRoom;
int offset = -IF_INTERNAL_OFFS << shift;
@@ -160,9 +160,9 @@ void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t ds
}
template<int N, int width, int height>
-void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
- int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int shift = IF_FILTER_PREC;
int offset = 1 << (shift - 1);
uint16_t maxVal = (1 << X265_DEPTH) - 1;
@@ -201,9 +201,9 @@ void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstSt
}
template<int N, int width, int height>
-void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
{
- int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+ const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC - headRoom;
int offset = -IF_INTERNAL_OFFS << shift;
@@ -239,13 +239,13 @@ void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dst
}
template<int N, int width, int height>
-void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC + headRoom;
int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
uint16_t maxVal = (1 << X265_DEPTH) - 1;
- const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+ const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
src -= (N / 2 - 1) * srcStride;
@@ -282,9 +282,9 @@ void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dst
}
template<int N, int width, int height>
-void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
{
- const int16_t *const c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+ const int16_t* c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
int shift = IF_FILTER_PREC;
int row, col;
@@ -317,13 +317,13 @@ void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t d
}
template<int N>
-void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
+void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int coeffIdx)
{
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC + headRoom;
int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
uint16_t maxVal = (1 << X265_DEPTH) - 1;
- const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+ const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
src -= (N / 2 - 1) * srcStride;
@@ -360,7 +360,7 @@ void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t
}
template<int N, int width, int height>
-void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
{
short immedVals[(64 + 8) * (64 + 8)];
@@ -373,39 +373,39 @@ namespace x265 {
// x265 private namespace
#define CHROMA_420(W, H) \
- p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
- p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
- p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>; \
- p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>; \
- p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>; \
- p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
#define CHROMA_422(W, H) \
- p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
- p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
- p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>; \
- p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>; \
- p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>; \
- p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
#define CHROMA_444(W, H) \
- p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
- p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
- p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>; \
- p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>; \
- p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>; \
- p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
#define LUMA(W, H) \
- p.luma_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<8, W, H>; \
- p.luma_hps[LUMA_ ## W ## x ## H] = interp_horiz_ps_c<8, W, H>; \
- p.luma_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<8, W, H>; \
- p.luma_vps[LUMA_ ## W ## x ## H] = interp_vert_ps_c<8, W, H>; \
- p.luma_vsp[LUMA_ ## W ## x ## H] = interp_vert_sp_c<8, W, H>; \
- p.luma_vss[LUMA_ ## W ## x ## H] = interp_vert_ss_c<8, W, H>; \
- p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_hv_pp_c<8, W, H>;
-
-void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
+ p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_hps = interp_horiz_ps_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vpp = interp_vert_pp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vps = interp_vert_ps_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>;
+
+void setupFilterPrimitives_c(EncoderPrimitives& p)
{
LUMA(4, 4);
LUMA(8, 8);
@@ -509,9 +509,9 @@ void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
CHROMA_444(16, 64);
p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
- p.chroma_p2s[X265_CSP_I444] = filterConvertPelToShort_c<MAX_CU_SIZE>;
- p.chroma_p2s[X265_CSP_I420] = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
- p.chroma_p2s[X265_CSP_I422] = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+ p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
+ p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+ p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
p.extendRowBorder = extendCURowColBorder;
}
diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp
index 58a28c7..66b615c 100644
--- a/source/common/loopfilter.cpp
+++ b/source/common/loopfilter.cpp
@@ -28,6 +28,20 @@
#define PIXEL_MIN 0
#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
+namespace {
+
+/* get the sign of input variable (TODO: this is a dup, make common) */
+inline int8_t signOf(int x)
+{
+ return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
+}
+
+void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
+{
+ for (int x = 0; x < endX; x++)
+ dst[x] = signOf(src1[x] - src2[x]);
+}
+
void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
{
int x;
@@ -39,15 +53,75 @@ void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
edgeType = signRight + signLeft + 2;
signLeft = -signRight;
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+ }
+}
+
+void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
+{
+ int x;
+ int8_t signDown;
+ int edgeType;
+
+ for (x = 0; x < width; x++)
+ {
+ signDown = signOf(rec[x] - rec[x + stride]);
+ edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x] = -signDown;
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+ }
+}
- short v = rec[x] + offsetEo[edgeType];
- rec[x] = (pixel)(v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v);
+void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
+{
+ int x;
+ for (x = 0; x < width; x++)
+ {
+ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+ int edgeType = signDown + buff1[x] + 2;
+ bufft[x + 1] = -signDown;
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
}
}
+void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
+{
+ int8_t signDown;
+ int8_t edgeType;
+
+ for (int x = startX + 1; x < endX; x++)
+ {
+ signDown = signOf(rec[x] - rec[x + stride]);
+ edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x - 1] = -signDown;
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+ }
+}
+
+void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
+{
+ #define SAO_BO_BITS 5
+ const int boShift = X265_DEPTH - SAO_BO_BITS;
+ int x, y;
+ for (y = 0; y < ctuHeight; y++)
+ {
+ for (x = 0; x < ctuWidth; x++)
+ {
+ rec[x] = x265_clip(rec[x] + offset[rec[x] >> boShift]);
+ }
+ rec += stride;
+ }
+}
+}
+
namespace x265 {
-void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
+void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
{
p.saoCuOrgE0 = processSaoCUE0;
+ p.saoCuOrgE1 = processSaoCUE1;
+ p.saoCuOrgE2 = processSaoCUE2;
+ p.saoCuOrgE3 = processSaoCUE3;
+ p.saoCuOrgB0 = processSaoCUB0;
+ p.sign = calSign;
}
}
diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
index fe4f7b9..50bbc89 100644
--- a/source/common/lowres.cpp
+++ b/source/common/lowres.cpp
@@ -69,6 +69,7 @@ bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled)
lowresPlane[3] = buffer[3] + padoffset;
CHECKED_MALLOC(intraCost, int32_t, cuCount);
+ CHECKED_MALLOC(intraMode, uint8_t, cuCount);
for (int i = 0; i < bframes + 2; i++)
{
@@ -99,6 +100,7 @@ void Lowres::destroy()
X265_FREE(buffer[i]);
X265_FREE(intraCost);
+ X265_FREE(intraMode);
for (int i = 0; i < bframes + 2; i++)
{
@@ -155,7 +157,7 @@ void Lowres::init(PicYuv *origPic, int poc, int type)
intraMbs[i] = 0;
/* downscale and generate 4 hpel planes for lookahead */
- primitives.frame_init_lowres_core(origPic->m_picOrg[0],
+ primitives.frameInitLowres(origPic->m_picOrg[0],
lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
origPic->m_stride, lumaStride, width, lines);
@@ -164,5 +166,5 @@ void Lowres::init(PicYuv *origPic, int poc, int type)
extendPicBorder(lowresPlane[1], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
extendPicBorder(lowresPlane[2], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
extendPicBorder(lowresPlane[3], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
- fpelPlane = lowresPlane[0];
+ fpelPlane[0] = lowresPlane[0];
}
diff --git a/source/common/lowres.h b/source/common/lowres.h
index b88ad3e..43e3485 100644
--- a/source/common/lowres.h
+++ b/source/common/lowres.h
@@ -26,27 +26,36 @@
#include "primitives.h"
#include "common.h"
+#include "picyuv.h"
#include "mv.h"
namespace x265 {
// private namespace
-class PicYuv;
-
struct ReferencePlanes
{
ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
- pixel* fpelPlane;
+ pixel* fpelPlane[3];
pixel* lowresPlane[4];
+ PicYuv* reconPic;
bool isWeighted;
bool isLowres;
+
intptr_t lumaStride;
- int weight;
- int offset;
- int shift;
- int round;
+ intptr_t chromaStride;
+
+ struct {
+ int weight;
+ int offset;
+ int shift;
+ int round;
+ } w[3];
+
+ pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[0] + reconPic->m_cuOffsetY[ctuAddr] + reconPic->m_buOffsetY[absPartIdx]; }
+ pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[1] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; }
+ pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return fpelPlane[2] + reconPic->m_cuOffsetC[ctuAddr] + reconPic->m_buOffsetC[absPartIdx]; }
/* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels
* in case QPEL is required. Else it returns a pointer to the HPEL pixels */
@@ -56,12 +65,11 @@ struct ReferencePlanes
{
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
-
- MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
- int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
-
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
- primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
+ int qmvx = qmv.x + (qmv.x & 1);
+ int qmvy = qmv.y + (qmv.y & 1);
+ int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
+ pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
+ primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
return buf;
}
else
@@ -79,10 +87,11 @@ struct ReferencePlanes
ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
- MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
- int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
- primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
+ int qmvx = qmv.x + (qmv.x & 1);
+ int qmvy = qmv.y + (qmv.y & 1);
+ int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
+ pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
+ primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
return comp(fenc, FENC_STRIDE, subpelbuf, 8);
}
else
@@ -116,6 +125,7 @@ struct Lowres : public ReferencePlanes
int32_t* rowSatds[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2];
int intraMbs[X265_BFRAME_MAX + 2];
int32_t* intraCost;
+ uint8_t* intraMode;
int64_t satdCost;
uint16_t* lowresCostForRc;
uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]);
diff --git a/source/common/mv.h b/source/common/mv.h
index 22a7073..dad3729 100644
--- a/source/common/mv.h
+++ b/source/common/mv.h
@@ -44,19 +44,19 @@ public:
int32_t word;
};
- MV() : word(0) {}
-
+ MV() {}
+ MV(int32_t w) : word(w) {}
MV(int16_t _x, int16_t _y) : x(_x), y(_y) {}
- const MV& operator =(uint32_t w) { word = w; return *this; }
+ MV& operator =(uint32_t w) { word = w; return *this; }
- const MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; }
+ MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; }
- const MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; }
+ MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; }
- const MV& operator >>=(int i) { x >>= i; y >>= i; return *this; }
+ MV& operator >>=(int i) { x >>= i; y >>= i; return *this; }
- const MV& operator <<=(int i) { x <<= i; y <<= i; return *this; }
+ MV& operator <<=(int i) { x <<= i; y <<= i; return *this; }
MV operator >>(int i) const { return MV(x >> i, y >> i); }
@@ -64,16 +64,18 @@ public:
MV operator *(int16_t i) const { return MV(x * i, y * i); }
- const MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); }
+ MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); }
- const MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); }
+ MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); }
bool operator ==(const MV& other) const { return word == other.word; }
bool operator !=(const MV& other) const { return word != other.word; }
+ bool operator !() const { return !word; }
+
// Scale down a QPEL mv to FPEL mv, rounding up by one HPEL offset
- MV roundToFPel() const { return MV(x + 2, y + 2) >> 2; }
+ MV roundToFPel() const { return MV((x + 2) >> 2, (y + 2) >> 2); }
// Scale up an FPEL mv to QPEL by shifting up two bits
MV toQPel() const { return *this << 2; }
diff --git a/source/common/param.cpp b/source/common/param.cpp
index af70058..4c758fa 100644
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -174,8 +174,10 @@ void x265_param_default(x265_param *param)
param->cbQpOffset = 0;
param->crQpOffset = 0;
param->rdPenalty = 0;
- param->psyRd = 0.0;
- param->psyRdoq = 0.0;
+ param->psyRd = 0.3;
+ param->psyRdoq = 1.0;
+ param->analysisMode = 0;
+ param->analysisFileName = NULL;
param->bIntraInBFrames = 0;
param->bLossless = 0;
param->bCULossless = 0;
@@ -186,14 +188,13 @@ void x265_param_default(x265_param *param)
param->rc.vbvBufferInit = 0.9;
param->rc.rfConstant = 28;
param->rc.bitrate = 0;
- param->rc.rateTolerance = 1.0;
param->rc.qCompress = 0.6;
param->rc.ipFactor = 1.4f;
param->rc.pbFactor = 1.3f;
param->rc.qpStep = 4;
param->rc.rateControlMode = X265_RC_CRF;
param->rc.qp = 32;
- param->rc.aqMode = X265_AQ_AUTO_VARIANCE;
+ param->rc.aqMode = X265_AQ_VARIANCE;
param->rc.aqStrength = 1.0;
param->rc.cuTree = 1;
param->rc.rfConstantMax = 0;
@@ -203,7 +204,10 @@ void x265_param_default(x265_param *param)
param->rc.statFileName = NULL;
param->rc.complexityBlur = 20;
param->rc.qblur = 0.5;
+ param->rc.zoneCount = 0;
+ param->rc.zones = NULL;
param->rc.bEnableSlowFirstPass = 0;
+ param->rc.bStrictCbr = 0;
/* Video Usability Information (VUI) */
param->vui.aspectRatioIdc = 0;
@@ -254,7 +258,6 @@ int x265_param_default_preset(x265_param *param, const char *preset, const char
param->bEnableWeightedPred = 0;
param->rdLevel = 2;
param->maxNumReferences = 1;
- param->bEnableLoopFilter = 0;
param->rc.aqStrength = 0.0;
param->rc.aqMode = X265_AQ_NONE;
param->rc.cuTree = 0;
@@ -405,6 +408,20 @@ int x265_param_default_preset(x265_param *param, const char *preset, const char
param->lookaheadDepth = 0;
param->scenecutThreshold = 0;
param->rc.cuTree = 0;
+ param->frameNumThreads = 1;
+ }
+ else if (!strcmp(tune, "grain"))
+ {
+ param->deblockingFilterBetaOffset = -2;
+ param->deblockingFilterTCOffset = -2;
+ param->bIntraInBFrames = 0;
+ param->psyRdoq = 30;
+ param->psyRd = 0.5;
+ param->rc.ipFactor = 1.1;
+ param->rc.pbFactor = 1.1;
+ param->rc.aqMode = X265_AQ_VARIANCE;
+ param->rc.aqStrength = 0.3;
+ param->rc.qCompress = 0.8;
}
else
return -1;
@@ -440,12 +457,8 @@ static double x265_atof(const char *str, bool& bError)
static int parseName(const char *arg, const char * const * names, bool& bError)
{
for (int i = 0; names[i]; i++)
- {
if (!strcmp(arg, names[i]))
- {
return i;
- }
- }
return x265_atoi(arg, bError);
}
@@ -478,9 +491,7 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
char *c;
strcpy(nameBuf, name);
while ((c = strchr(nameBuf, '_')) != 0)
- {
*c = '-';
- }
name = nameBuf;
}
@@ -532,9 +543,6 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
}
}
}
- OPT("csv") p->csvfn = value;
- OPT("scaling-list") p->scalingLists = value;
- OPT("lambda-file") p->rc.lambdaFileName = value;
OPT("threads") p->poolNumThreads = atoi(value);
OPT("frame-threads") p->frameNumThreads = atoi(value);
OPT("pmode") p->bDistributeModeAnalysis = atobool(value);
@@ -619,11 +627,46 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
OPT("cbqpoffs") p->cbQpOffset = atoi(value);
OPT("crqpoffs") p->crQpOffset = atoi(value);
OPT("rd") p->rdLevel = atoi(value);
- OPT("psy-rd") p->psyRd = atof(value);
- OPT("psy-rdoq") p->psyRdoq = atof(value);
+ OPT("psy-rd")
+ {
+ int bval = atobool(value);
+ if (bError || bval)
+ {
+ bError = false;
+ p->psyRd = atof(value);
+ }
+ else
+ p->psyRd = 0.0;
+ }
+ OPT("psy-rdoq")
+ {
+ int bval = atobool(value);
+ if (bError || bval)
+ {
+ bError = false;
+ p->psyRdoq = atof(value);
+ }
+ else
+ p->psyRdoq = 0.0;
+ }
OPT("signhide") p->bEnableSignHiding = atobool(value);
OPT("b-intra") p->bIntraInBFrames = atobool(value);
- OPT("lft") p->bEnableLoopFilter = atobool(value);
+ OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */
+ OPT("deblock")
+ {
+ if (2 == sscanf(value, "%d:%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset) ||
+ 2 == sscanf(value, "%d,%d", &p->deblockingFilterTCOffset, &p->deblockingFilterBetaOffset))
+ {
+ p->bEnableLoopFilter = true;
+ }
+ else if (sscanf(value, "%d", &p->deblockingFilterTCOffset))
+ {
+ p->bEnableLoopFilter = 1;
+ p->deblockingFilterBetaOffset = p->deblockingFilterTCOffset;
+ }
+ else
+ p->bEnableLoopFilter = atobool(value);
+ }
OPT("sao") p->bEnableSAO = atobool(value);
OPT("sao-non-deblock") p->bSaoNonDeblocked = atobool(value);
OPT("ssim") p->bEnableSsim = atobool(value);
@@ -635,6 +678,10 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
OPT("hrd") p->bEmitHRDSEI = atobool(value);
OPT2("ipratio", "ip-factor") p->rc.ipFactor = atof(value);
OPT2("pbratio", "pb-factor") p->rc.pbFactor = atof(value);
+ OPT("qcomp") p->rc.qCompress = atof(value);
+ OPT("qpstep") p->rc.qpStep = atoi(value);
+ OPT("cplxblur") p->rc.complexityBlur = atof(value);
+ OPT("qblur") p->rc.qblur = atof(value);
OPT("aq-mode") p->rc.aqMode = atoi(value);
OPT("aq-strength") p->rc.aqStrength = atof(value);
OPT("vbv-maxrate") p->rc.vbvMaxBitrate = atoi(value);
@@ -657,11 +704,41 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
p->rc.qp = atoi(value);
p->rc.rateControlMode = X265_RC_CQP;
}
+ OPT("zones")
+ {
+ p->rc.zoneCount = 1;
+ const char* c;
+
+ for (c = value; *c; c++)
+ p->rc.zoneCount += (*c == '/');
+
+ p->rc.zones = X265_MALLOC(x265_zone, p->rc.zoneCount);
+ c = value;
+ for (int i = 0; i < p->rc.zoneCount; i++ )
+ {
+ int len;
+ if (3 == sscanf(c, "%d,%d,q=%d%n", &p->rc.zones[i].startFrame, &p->rc.zones[i].endFrame, &p->rc.zones[i].qp, &len))
+ p->rc.zones[i].bForceQp = 1;
+ else if (3 == sscanf(c, "%d,%d,b=%f%n", &p->rc.zones[i].startFrame, &p->rc.zones[i].endFrame, &p->rc.zones[i].bitrateFactor, &len))
+ p->rc.zones[i].bForceQp = 0;
+ else
+ {
+ bError = true;
+ break;
+ }
+ c += len + 1;
+ }
+ }
OPT("input-res") bError |= sscanf(value, "%dx%d", &p->sourceWidth, &p->sourceHeight) != 2;
OPT("input-csp") p->internalCsp = parseName(value, x265_source_csp_names, bError);
OPT("me") p->searchMethod = parseName(value, x265_motion_est_names, bError);
OPT("cutree") p->rc.cuTree = atobool(value);
OPT("slow-firstpass") p->rc.bEnableSlowFirstPass = atobool(value);
+ OPT("strict-cbr")
+ {
+ p->rc.bStrictCbr = atobool(value);
+ p->rc.pbFactor = 1.0;
+ }
OPT("analysis-mode") p->analysisMode = parseName(value, x265_analysis_names, bError);
OPT("sar")
{
@@ -729,14 +806,19 @@ int x265_param_parse(x265_param *p, const char *name, const char *value)
&p->vui.defDispWinRightOffset,
&p->vui.defDispWinBottomOffset) != 4;
}
- OPT("nr") p->noiseReduction = atoi(value);
+ OPT("nr-intra") p->noiseReductionIntra = atoi(value);
+ OPT("nr-inter") p->noiseReductionInter = atoi(value);
OPT("pass")
{
- int pass = Clip3(0, 3, atoi(value));
+ int pass = x265_clip3(0, 3, atoi(value));
p->rc.bStatWrite = pass & 1;
p->rc.bStatRead = pass & 2;
}
OPT("stats") p->rc.statFileName = strdup(value);
+ OPT("csv") p->csvfn = strdup(value);
+ OPT("scaling-list") p->scalingLists = strdup(value);
+ OPT("lambda-file") p->rc.lambdaFileName = strdup(value);
+ OPT("analysis-file") p->analysisFileName = strdup(value);
else
return X265_PARAM_BAD_NAME;
#undef OPT
@@ -894,7 +976,7 @@ int x265_check_params(x265_param *param)
"x265 was compiled for 8bit encodes, only 8bit internal depth supported");
#endif
- CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > 51,
+ CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > QP_MAX_SPEC,
"QP exceeds supported range (-QpBDOffsety to 51)");
CHECK(param->fpsNum == 0 || param->fpsDenom == 0,
"Frame rate numerator and denominator must be specified");
@@ -960,6 +1042,10 @@ int x265_check_params(x265_param *param)
"Aq-Mode is out of range");
CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
"Aq-Strength is out of range");
+ CHECK(param->deblockingFilterTCOffset < -6 || param->deblockingFilterTCOffset > 6,
+ "deblocking filter tC offset must be in the range of -6 to +6");
+ CHECK(param->deblockingFilterBetaOffset < -6 || param->deblockingFilterBetaOffset > 6,
+ "deblocking filter Beta offset must be in the range of -6 to +6");
CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0");
CHECK(param->psyRdoq < 0 || 50.0 < param->psyRdoq, "Psy-rdoq strength must be between 0 and 50.0");
CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative");
@@ -1031,12 +1117,18 @@ int x265_check_params(x265_param *param)
"Valid initial VBV buffer occupancy must be a fraction 0 - 1, or size in kbits");
CHECK(param->rc.bitrate < 0,
"Target bitrate can not be less than zero");
- if (param->noiseReduction)
- CHECK(100 > param->noiseReduction || param->noiseReduction > 1000, "Valid noise reduction range 100 - 1000");
+ CHECK(param->rc.qCompress < 0.5 || param->rc.qCompress > 1.0,
+ "qCompress must be between 0.5 and 1.0");
+ if (param->noiseReductionIntra)
+ CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
+ if (param->noiseReductionInter)
+ CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead,
"Constant rate-factor is incompatible with 2pass");
CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
"Constant QP is incompatible with 2pass");
+ CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0),
+ "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
return check_failed;
}
@@ -1061,7 +1153,7 @@ int x265_set_globals(x265_param *param)
{
static int once /* = 0 */;
- if (ATOMIC_CAS32(&once, 0, 1) == 1)
+ if (ATOMIC_INC(&once) > 1)
{
if (param->maxCUSize != g_maxCUSize)
{
@@ -1152,11 +1244,19 @@ void x265_print_params(x265_param *param)
fprintf(stderr, "psy-rd=%.2lf ", param->psyRd);
if (param->psyRdoq > 0.)
fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq);
- TOOLOPT(param->bEnableEarlySkip, "esd");
- TOOLOPT(param->bEnableCbfFastMode, "cfm");
- if (param->noiseReduction)
- fprintf(stderr, "nr=%d ", param->noiseReduction);
- TOOLOPT(param->bEnableLoopFilter, "lft");
+ TOOLOPT(param->bEnableEarlySkip, "early-skip");
+ TOOLOPT(param->bEnableCbfFastMode, "fast-cbf");
+ if (param->noiseReductionIntra)
+ fprintf(stderr, "nr-intra=%d ", param->noiseReductionIntra);
+ if (param->noiseReductionInter)
+ fprintf(stderr, "nr-inter=%d ", param->noiseReductionInter);
+ if (param->bEnableLoopFilter)
+ {
+ if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset)
+ fprintf(stderr, "deblock(tC=%d:B=%d) ", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset);
+ else
+ TOOLOPT(param->bEnableLoopFilter, "deblock");
+ }
if (param->bEnableSAO)
fprintf(stderr, "sao%s ", param->bSaoNonDeblocked ? "-non-deblock" : "");
TOOLOPT(param->bEnableSignHiding, "signhide");
@@ -1241,8 +1341,7 @@ char *x265_param2string(x265_param *p)
if (p->rc.rateControlMode == X265_RC_CRF)
s += sprintf(s, " crf=%.1f", p->rc.rfConstant);
else
- s += sprintf(s, " bitrate=%d ratetol=%.1f",
- p->rc.bitrate, p->rc.rateTolerance);
+ s += sprintf(s, " bitrate=%d", p->rc.bitrate);
s += sprintf(s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
p->rc.qCompress, QP_MIN, QP_MAX_SPEC, p->rc.qpStep);
if (p->rc.bStatRead)
diff --git a/source/common/param.h b/source/common/param.h
index fa42006..6c2a1fe 100644
--- a/source/common/param.h
+++ b/source/common/param.h
@@ -37,7 +37,7 @@ void getParamAspectRatio(x265_param *p, int& width, int& height);
bool parseLambdaFile(x265_param *param);
/* this table is kept internal to avoid confusion, since log level indices start at -1 */
-static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
+static const char * const logLevelNames[] = { "none", "error", "warning", "info", "frame", "debug", "full", 0 };
#define MAXPARAMSIZE 2000
}
diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp
index 7f4fd06..5a77e07 100644
--- a/source/common/picyuv.cpp
+++ b/source/common/picyuv.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
diff --git a/source/common/picyuv.h b/source/common/picyuv.h
index 1e18d8c..9856f41 100644
--- a/source/common/picyuv.h
+++ b/source/common/picyuv.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -76,12 +76,21 @@ public:
pixel* getCrAddr(uint32_t ctuAddr) { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; }
pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; }
pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); }
+ const pixel* getLumaAddr(uint32_t ctuAddr) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr]; }
+ const pixel* getCbAddr(uint32_t ctuAddr) const { return m_picOrg[1] + m_cuOffsetC[ctuAddr]; }
+ const pixel* getCrAddr(uint32_t ctuAddr) const { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; }
+ const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; }
+ const pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) const { return m_picOrg[plane] + (plane ? m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); }
/* get pointer to CU start address */
pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; }
pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+ const pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; }
+ const pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+ const pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
+ const pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) const { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
};
void updateChecksum(const pixel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, intptr_t stride, int row, uint32_t cuHeight);
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index 3e0530d..be9269d 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -32,74 +32,18 @@
using namespace x265;
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
- p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
-
-#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
- p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
- p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
- p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
- p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
- p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
- p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
- p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
- p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
- p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
- p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
- p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
- p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
- p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
- p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
- p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
- p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
- p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
- p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
- p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
- p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
- p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
- p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
- p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
- p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
- p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
-
namespace {
// place functions in anonymous namespace (file static)
template<int lx, int ly>
-int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
sum += abs(pix1[x] - pix2[x]);
- }
pix1 += stride_pix1;
pix2 += stride_pix2;
@@ -109,16 +53,14 @@ int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
}
template<int lx, int ly>
-int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
sum += abs(pix1[x] - pix2[x]);
- }
pix1 += stride_pix1;
pix2 += stride_pix2;
@@ -128,7 +70,7 @@ int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2
}
template<int lx, int ly>
-void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
+void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
@@ -150,7 +92,7 @@ void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstr
}
template<int lx, int ly>
-void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
+void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
@@ -175,17 +117,17 @@ void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, int
}
template<int lx, int ly, class T1, class T2>
-int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
+int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
int sum = 0;
- int iTemp;
+ int tmp;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
{
- iTemp = pix1[x] - pix2[x];
- sum += (iTemp * iTemp);
+ tmp = pix1[x] - pix2[x];
+ sum += (tmp * tmp);
}
pix1 += stride_pix1;
@@ -217,7 +159,7 @@ inline sum2_t abs2(sum2_t a)
return (a + s) ^ s;
}
-int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][2];
sum2_t a0, a1, a2, a3, b0, b1;
@@ -245,36 +187,39 @@ int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix
return (int)(sum >> 1);
}
-int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
{
- ssum2_t tmp[4][2];
- ssum2_t a0, a1, a2, a3, b0, b1;
- ssum2_t sum = 0;
+ int32_t tmp[4][4];
+ int32_t s01, s23, d01, d23;
+ int32_t satd = 0;
+ int d;
- for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
+ for (d = 0; d < 4; d++, pix1 += stride_pix1)
{
- a0 = pix1[0] - pix2[0];
- a1 = pix1[1] - pix2[1];
- b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
- a2 = pix1[2] - pix2[2];
- a3 = pix1[3] - pix2[3];
- b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
- tmp[i][0] = b0 + b1;
- tmp[i][1] = b0 - b1;
+ s01 = pix1[0] + pix1[1];
+ s23 = pix1[2] + pix1[3];
+ d01 = pix1[0] - pix1[1];
+ d23 = pix1[2] - pix1[3];
+
+ tmp[d][0] = s01 + s23;
+ tmp[d][1] = s01 - s23;
+ tmp[d][2] = d01 - d23;
+ tmp[d][3] = d01 + d23;
}
- for (int i = 0; i < 2; i++)
+ for (d = 0; d < 4; d++)
{
- HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
- a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
- sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
+ s01 = tmp[0][d] + tmp[1][d];
+ s23 = tmp[2][d] + tmp[3][d];
+ d01 = tmp[0][d] - tmp[1][d];
+ d23 = tmp[2][d] - tmp[3][d];
+ satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) + abs(d01 + d23);
}
-
- return (int)(sum >> 1);
+ return (int)(satd / 2);
}
// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
-int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][4];
sum2_t a0, a1, a2, a3;
@@ -300,41 +245,33 @@ int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix
template<int w, int h>
// calculate satd in blocks of 4x4
-int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
- {
for (int col = 0; col < w; col += 4)
- {
satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
- }
- }
return satd;
}
template<int w, int h>
// calculate satd in blocks of 8x4
-int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
- {
for (int col = 0; col < w; col += 8)
- {
satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
- }
- }
return satd;
}
-inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
sum2_t tmp[8][4];
sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
@@ -371,54 +308,63 @@ inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
return (int)sum;
}
-int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
-inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
{
- ssum2_t tmp[8][4];
- ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
- ssum2_t sum = 0;
+ int32_t tmp[8][8];
+ int32_t a0, a1, a2, a3, a4, a5, a6, a7;
+ int32_t sum = 0;
- for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
+ for (int i = 0; i < 8; i++, pix1 += i_pix1)
{
- a0 = pix1[0] - pix2[0];
- a1 = pix1[1] - pix2[1];
- b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
- a2 = pix1[2] - pix2[2];
- a3 = pix1[3] - pix2[3];
- b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
- a4 = pix1[4] - pix2[4];
- a5 = pix1[5] - pix2[5];
- b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
- a6 = pix1[6] - pix2[6];
- a7 = pix1[7] - pix2[7];
- b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
- HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
+ a0 = pix1[0] + pix1[1];
+ a1 = pix1[2] + pix1[3];
+ a2 = pix1[4] + pix1[5];
+ a3 = pix1[6] + pix1[7];
+ a4 = pix1[0] - pix1[1];
+ a5 = pix1[2] - pix1[3];
+ a6 = pix1[4] - pix1[5];
+ a7 = pix1[6] - pix1[7];
+ tmp[i][0] = (a0 + a1) + (a2 + a3);
+ tmp[i][1] = (a0 + a1) - (a2 + a3);
+ tmp[i][2] = (a0 - a1) + (a2 - a3);
+ tmp[i][3] = (a0 - a1) - (a2 - a3);
+ tmp[i][4] = (a4 + a5) + (a6 + a7);
+ tmp[i][5] = (a4 + a5) - (a6 + a7);
+ tmp[i][6] = (a4 - a5) + (a6 - a7);
+ tmp[i][7] = (a4 - a5) - (a6 - a7);
}
- for (int i = 0; i < 4; i++)
+ for (int i = 0; i < 8; i++)
{
- HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
- HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
- b0 = abs2(a0 + a4) + abs2(a0 - a4);
- b0 += abs2(a1 + a5) + abs2(a1 - a5);
- b0 += abs2(a2 + a6) + abs2(a2 - a6);
- b0 += abs2(a3 + a7) + abs2(a3 - a7);
- sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
+ a0 = (tmp[0][i] + tmp[1][i]) + (tmp[2][i] + tmp[3][i]);
+ a2 = (tmp[0][i] + tmp[1][i]) - (tmp[2][i] + tmp[3][i]);
+ a1 = (tmp[0][i] - tmp[1][i]) + (tmp[2][i] - tmp[3][i]);
+ a3 = (tmp[0][i] - tmp[1][i]) - (tmp[2][i] - tmp[3][i]);
+ a4 = (tmp[4][i] + tmp[5][i]) + (tmp[6][i] + tmp[7][i]);
+ a6 = (tmp[4][i] + tmp[5][i]) - (tmp[6][i] + tmp[7][i]);
+ a5 = (tmp[4][i] - tmp[5][i]) + (tmp[6][i] - tmp[7][i]);
+ a7 = (tmp[4][i] - tmp[5][i]) - (tmp[6][i] - tmp[7][i]);
+ a0 = abs(a0 + a4) + abs(a0 - a4);
+ a0 += abs(a1 + a5) + abs(a1 - a5);
+ a0 += abs(a2 + a6) + abs(a2 - a6);
+ a0 += abs(a3 + a7) + abs(a3 - a7);
+ sum += a0;
}
return (int)sum;
}
-int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
{
- return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
+ return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
}
-int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
+ _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
@@ -432,159 +378,129 @@ int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
template<int w, int h>
// Calculate sa8d in blocks of 8x8
-int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 8)
- {
for (int x = 0; x < w; x += 8)
- {
cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
- }
- }
return cost;
}
template<int w, int h>
// Calculate sa8d in blocks of 16x16
-int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 16)
- {
for (int x = 0; x < w; x += 16)
- {
cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
- }
- }
return cost;
}
template<int size>
-int pixel_ssd_s_c(short *a, intptr_t dstride)
+int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
int sum = 0;
for (int y = 0; y < size; y++)
{
for (int x = 0; x < size; x++)
- {
sum += a[x] * a[x];
- }
+
a += dstride;
}
return sum;
}
template<int size>
-void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
+void blockfill_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
for (int y = 0; y < size; y++)
- {
for (int x = 0; x < size; x++)
- {
dst[y * dstride + x] = val;
- }
- }
-}
-
-void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
-{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j]) << shift;
- }
- }
}
template<int size>
-void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
-{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
- }
- }
-}
-
-void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- int round = 1 << (shift - 1);
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = src[j] << shift;
- src += size;
- dst += stride;
+ src += srcStride;
+ dst += size;
}
}
-void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- int round = 1 << (shift - 1);
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = (src[j] + round) >> shift;
- src += size;
- dst += stride;
+ src += srcStride;
+ dst += size;
}
}
template<int size>
-void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
+
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = ((int16_t)src[j] << shift);
- }
+ dst[j] = src[j] << shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
template<int size>
-void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (src[j] << shift);
- }
+ dst[j] = (src[j] + round) >> shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
template<int blockSize>
-void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
+void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
for (int y = 0; y < blockSize; y++)
{
for (int x = 0; x < blockSize; x++)
- {
residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
- }
fenc += stride;
residual += stride;
@@ -593,27 +509,32 @@ void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
}
template<int blockSize>
-void transpose(pixel* dst, pixel* src, intptr_t stride)
+void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
for (int k = 0; k < blockSize; k++)
- {
for (int l = 0; l < blockSize; l++)
- {
dst[k * blockSize + l] = src[l * stride + k];
- }
- }
}
-void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
int x, y;
+#if CHECKED_BUILD || _DEBUG
+ const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+#endif
+
+ X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");
+ X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n");
+
for (y = 0; y <= height - 1; y++)
{
for (x = 0; x <= width - 1; )
{
// note: width can be odd
- dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
+ dst[x] = x265_clip(((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
x++;
}
@@ -622,21 +543,25 @@ void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStrid
}
}
-void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
+void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
int x, y;
+ const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+
X265_CHECK(!(width & 15), "weightp alignment error\n");
X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");
+ X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n");
for (y = 0; y <= height - 1; y++)
{
for (x = 0; x <= width - 1; )
{
// simulating pixel to short conversion
- int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
- dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
+ int16_t val = src[x] << correction;
+ dst[x] = x265_clip(((w0 * (val) + round) >> shift) + offset);
x++;
}
@@ -646,14 +571,12 @@ void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height,
}
template<int lx, int ly>
-void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
+void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
dst[x] = (src0[x] + src1[x] + 1) >> 1;
- }
src0 += sstride0;
src1 += sstride1;
@@ -661,23 +584,35 @@ void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, p
}
}
-void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel *dst, const pixel *src, intptr_t /*stride*/)
{
int x;
+ const pixel* src1 = src;
+ const pixel* src2 = src + 128;
+
+ pixel* dst1 = dst;
+ pixel* dst2 = dst + 64/*128*/;
for (x = 0; x < 128; x += 2)
{
- pixel pix0 = src[(x + 0)];
- pixel pix1 = src[(x + 1)];
- int sum = pix0 + pix1;
-
- dst[x >> 1] = (pixel)((sum + 1) >> 1);
+ // Top pixel
+ pixel pix0 = src1[(x + 0)];
+ pixel pix1 = src1[(x + 1)];
+
+ // Left pixel
+ pixel pix2 = src2[(x + 0)];
+ pixel pix3 = src2[(x + 1)];
+ int sum1 = pix0 + pix1;
+ int sum2 = pix2 + pix3;
+
+ dst1[x >> 1] = (pixel)((sum1 + 1) >> 1);
+ dst2[x >> 1] = (pixel)((sum2 + 1) >> 1);
}
}
-void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
- int x, y;
+ uint32_t x, y;
for (y = 0; y < 64; y += 2)
{
@@ -694,13 +629,13 @@ void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
}
}
-void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
for (int y = 0; y < height; y++)
{
- pixel *src1 = src0 + src_stride;
- pixel *src2 = src1 + src_stride;
+ const pixel* src1 = src0 + src_stride;
+ const pixel* src2 = src1 + src_stride;
for (int x = 0; x < width; x++)
{
// slower than naive bilinear, but matches asm
@@ -720,7 +655,7 @@ void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv,
}
/* structural similarity metric */
-void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
+void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
for (int z = 0; z < 2; z++)
{
@@ -794,7 +729,7 @@ float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
}
template<int size>
-uint64_t pixel_var(pixel *pix, intptr_t i_stride)
+uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
uint32_t sum = 0, sqr = 0;
@@ -817,7 +752,7 @@ uint64_t pixel_var(pixel *pix, intptr_t i_stride)
#endif
template<int size>
-int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
+int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
static pixel zeroBuf[8] /* = { 0 } */;
@@ -850,7 +785,7 @@ int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
}
template<int size>
-int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
+int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
static int16_t zeroBuf[8] /* = { 0 } */;
@@ -863,9 +798,9 @@ int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstri
for (int j = 0; j < dim; j+= 8)
{
/* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
- int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
+ int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride) -
(sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
- int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
+ int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride) -
(sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
totEnergy += abs(sourceEnergy - reconEnergy);
@@ -876,34 +811,19 @@ int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstri
else
{
/* 4x4 is too small for sa8d */
- int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
- int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
+ int sourceEnergy = satd_4x4(source, sstride) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
+ int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
return abs(sourceEnergy - reconEnergy);
}
}
-void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
- pixel *src, intptr_t srcStride, int w, int h)
-{
- for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
- {
- for (int x = 0; x < w; x++)
- {
- dstu[x] = src[2 * x];
- dstv[x] = src[2 * x + 1];
- }
- }
-}
-
template<int bx, int by>
-void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = b[x];
- }
a += stridea;
b += strideb;
@@ -911,14 +831,12 @@ void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
}
template<int bx, int by>
-void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = b[x];
- }
a += stridea;
b += strideb;
@@ -926,7 +844,7 @@ void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
}
template<int bx, int by>
-void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
@@ -942,14 +860,12 @@ void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
}
template<int bx, int by>
-void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = (int16_t)b[x];
- }
a += stridea;
b += strideb;
@@ -957,14 +873,12 @@ void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
}
template<int bx, int by>
-void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = (int16_t)(b0[x] - b1[x]);
- }
b0 += sstride0;
b1 += sstride1;
@@ -973,14 +887,12 @@ void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t
}
template<int bx, int by>
-void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
- a[x] = Clip(b0[x] + b1[x]);
- }
+ a[x] = x265_clip(b0[x] + b1[x]);
b0 += sstride0;
b1 += sstride1;
@@ -989,7 +901,7 @@ void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t
}
template<int bx, int by>
-void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
int shiftNum, offset;
@@ -1000,8 +912,8 @@ void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intpt
{
for (int x = 0; x < bx; x += 2)
{
- dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
- dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
+ dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
+ dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
}
src0 += src0Stride;
@@ -1010,28 +922,24 @@ void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intpt
}
}
-void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
- {
dst[c] = ((pixel)src[c]) << shift;
- }
dst += dstStride;
src += srcStride;
}
}
-void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
- {
dst[c] = (pixel)((src[c] >> shift) & mask);
- }
dst += dstStride;
src += srcStride;
@@ -1040,8 +948,8 @@ void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstS
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given CU. */
-void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
- int32_t *invQscales, double *fpsFactor, int len)
+void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
+ const int32_t* invQscales, const double* fpsFactor, int len)
{
double fps = *fpsFactor / 256;
@@ -1068,318 +976,274 @@ void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int mar
primitives.extendRowBorder(pic, stride, width, height, marginX);
/* copy top row to create above margin */
- pixel *top = pic - marginX;
+ pixel* top = pic - marginX;
for (int y = 0; y < marginY; y++)
memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
/* copy bottom row to create below margin */
- pixel *bot = pic - marginX + (height - 1) * stride;
+ pixel* bot = pic - marginX + (height - 1) * stride;
for (int y = 0; y < marginY; y++)
memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}
/* Initialize entries for pixel functions defined in this file */
-void Setup_C_PixelPrimitives(EncoderPrimitives &p)
+void setupPixelPrimitives_c(EncoderPrimitives &p)
{
- SET_FUNC_PRIMITIVE_TABLE_C2(sad)
- SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
- SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
- SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
-
- // satd
- p.satd[LUMA_4x4] = satd_4x4;
- p.satd[LUMA_8x8] = satd8<8, 8>;
- p.satd[LUMA_8x4] = satd_8x4;
- p.satd[LUMA_4x8] = satd4<4, 8>;
- p.satd[LUMA_16x16] = satd8<16, 16>;
- p.satd[LUMA_16x8] = satd8<16, 8>;
- p.satd[LUMA_8x16] = satd8<8, 16>;
- p.satd[LUMA_16x12] = satd8<16, 12>;
- p.satd[LUMA_12x16] = satd4<12, 16>;
- p.satd[LUMA_16x4] = satd8<16, 4>;
- p.satd[LUMA_4x16] = satd4<4, 16>;
- p.satd[LUMA_32x32] = satd8<32, 32>;
- p.satd[LUMA_32x16] = satd8<32, 16>;
- p.satd[LUMA_16x32] = satd8<16, 32>;
- p.satd[LUMA_32x24] = satd8<32, 24>;
- p.satd[LUMA_24x32] = satd8<24, 32>;
- p.satd[LUMA_32x8] = satd8<32, 8>;
- p.satd[LUMA_8x32] = satd8<8, 32>;
- p.satd[LUMA_64x64] = satd8<64, 64>;
- p.satd[LUMA_64x32] = satd8<64, 32>;
- p.satd[LUMA_32x64] = satd8<32, 64>;
- p.satd[LUMA_64x48] = satd8<64, 48>;
- p.satd[LUMA_48x64] = satd8<48, 64>;
- p.satd[LUMA_64x16] = satd8<64, 16>;
- p.satd[LUMA_16x64] = satd8<16, 64>;
-
-#define CHROMA_420(W, H) \
- p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
- p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
-
-#define CHROMA_422(W, H) \
- p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
- p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
-
-#define CHROMA_444(W, H) \
- p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
- p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
- p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
-
-#define LUMA(W, H) \
- p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
- p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
- p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
-
-#define LUMA_PIXELSUB(W, H) \
- p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-
-#define CHROMA_PIXELSUB_420(W, H) \
- p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-
-#define CHROMA_PIXELSUB_422(W, H) \
- p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-
-#define CHROMA_PIXELSUB_444(W, H) \
- p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-
-
-
- LUMA(4, 4);
- LUMA(8, 8);
- CHROMA_420(4, 4);
- LUMA(4, 8);
- CHROMA_420(2, 4);
- LUMA(8, 4);
- CHROMA_420(4, 2);
- LUMA(16, 16);
- CHROMA_420(8, 8);
- LUMA(16, 8);
- CHROMA_420(8, 4);
- LUMA(8, 16);
- CHROMA_420(4, 8);
- LUMA(16, 12);
- CHROMA_420(8, 6);
- LUMA(12, 16);
- CHROMA_420(6, 8);
- LUMA(16, 4);
- CHROMA_420(8, 2);
- LUMA(4, 16);
- CHROMA_420(2, 8);
- LUMA(32, 32);
- CHROMA_420(16, 16);
- LUMA(32, 16);
- CHROMA_420(16, 8);
- LUMA(16, 32);
- CHROMA_420(8, 16);
- LUMA(32, 24);
- CHROMA_420(16, 12);
- LUMA(24, 32);
- CHROMA_420(12, 16);
- LUMA(32, 8);
- CHROMA_420(16, 4);
- LUMA(8, 32);
- CHROMA_420(4, 16);
- LUMA(64, 64);
- CHROMA_420(32, 32);
- LUMA(64, 32);
- CHROMA_420(32, 16);
- LUMA(32, 64);
- CHROMA_420(16, 32);
- LUMA(64, 48);
- CHROMA_420(32, 24);
- LUMA(48, 64);
- CHROMA_420(24, 32);
- LUMA(64, 16);
- CHROMA_420(32, 8);
- LUMA(16, 64);
- CHROMA_420(8, 32);
-
- LUMA_PIXELSUB(4, 4);
- LUMA_PIXELSUB(8, 8);
- LUMA_PIXELSUB(16, 16);
- LUMA_PIXELSUB(32, 32);
- LUMA_PIXELSUB(64, 64);
- CHROMA_PIXELSUB_420(4, 4)
- CHROMA_PIXELSUB_420(8, 8)
- CHROMA_PIXELSUB_420(16, 16)
- CHROMA_PIXELSUB_420(32, 32)
- CHROMA_PIXELSUB_422(4, 8)
- CHROMA_PIXELSUB_422(8, 16)
- CHROMA_PIXELSUB_422(16, 32)
- CHROMA_PIXELSUB_422(32, 64)
- CHROMA_PIXELSUB_444(8, 8)
- CHROMA_PIXELSUB_444(16, 16)
- CHROMA_PIXELSUB_444(32, 32)
- CHROMA_PIXELSUB_444(64, 64)
-
- CHROMA_422(4, 8);
- CHROMA_422(4, 4);
- CHROMA_422(2, 8);
- CHROMA_422(8, 16);
- CHROMA_422(8, 8);
- CHROMA_422(4, 16);
- CHROMA_422(8, 12);
- CHROMA_422(6, 16);
- CHROMA_422(8, 4);
- CHROMA_422(2, 16);
- CHROMA_422(16, 32);
- CHROMA_422(16, 16);
- CHROMA_422(8, 32);
- CHROMA_422(16, 24);
- CHROMA_422(12, 32);
- CHROMA_422(16, 8);
- CHROMA_422(4, 32);
- CHROMA_422(32, 64);
- CHROMA_422(32, 32);
- CHROMA_422(16, 64);
- CHROMA_422(32, 48);
- CHROMA_422(24, 64);
- CHROMA_422(32, 16);
- CHROMA_422(8, 64);
-
- CHROMA_444(4, 4);
- CHROMA_444(8, 8);
- CHROMA_444(4, 8);
- CHROMA_444(8, 4);
- CHROMA_444(16, 16);
- CHROMA_444(16, 8);
- CHROMA_444(8, 16);
- CHROMA_444(16, 12);
- CHROMA_444(12, 16);
- CHROMA_444(16, 4);
- CHROMA_444(4, 16);
- CHROMA_444(32, 32);
- CHROMA_444(32, 16);
- CHROMA_444(16, 32);
- CHROMA_444(32, 24);
- CHROMA_444(24, 32);
- CHROMA_444(32, 8);
- CHROMA_444(8, 32);
- CHROMA_444(64, 64);
- CHROMA_444(64, 32);
- CHROMA_444(32, 64);
- CHROMA_444(64, 48);
- CHROMA_444(48, 64);
- CHROMA_444(64, 16);
- CHROMA_444(16, 64);
-
- SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
-
- p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
- p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
- p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
- p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
- p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
-
- p.cvt16to32_shl = convert16to32_shl;
- p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
- p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
- p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
- p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
- p.cvt32to16_shr = convert32to16_shr;
- p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
- p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
- p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
- p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
- p.copy_shr = copy_shr;
- p.copy_shl[BLOCK_4x4] = copy_shl<4>;
- p.copy_shl[BLOCK_8x8] = copy_shl<8>;
- p.copy_shl[BLOCK_16x16] = copy_shl<16>;
- p.copy_shl[BLOCK_32x32] = copy_shl<32>;
-
- p.sa8d[BLOCK_4x4] = satd_4x4;
- p.sa8d[BLOCK_8x8] = sa8d_8x8;
- p.sa8d[BLOCK_16x16] = sa8d_16x16;
- p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
- p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
-
- p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
- p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
- p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
- p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
- p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;
-
- p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
- p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
- p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
- p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
- p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
-
- p.sa8d_inter[LUMA_4x4] = satd_4x4;
- p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
- p.sa8d_inter[LUMA_8x4] = satd_8x4;
- p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
- p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
- p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
- p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
- p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
- p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
- p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
- p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
- p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
- p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
- p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
- p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
- p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
- p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
- p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
- p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
- p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
- p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
- p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
- p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
- p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
- p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
-
- p.calcresidual[BLOCK_4x4] = getResidual<4>;
- p.calcresidual[BLOCK_8x8] = getResidual<8>;
- p.calcresidual[BLOCK_16x16] = getResidual<16>;
- p.calcresidual[BLOCK_32x32] = getResidual<32>;
- p.calcresidual[BLOCK_64x64] = NULL;
-
- p.transpose[BLOCK_4x4] = transpose<4>;
- p.transpose[BLOCK_8x8] = transpose<8>;
- p.transpose[BLOCK_16x16] = transpose<16>;
- p.transpose[BLOCK_32x32] = transpose<32>;
- p.transpose[BLOCK_64x64] = transpose<64>;
-
- p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
- p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
- p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
- p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
+#define LUMA_PU(W, H) \
+ p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp = pixelavg_pp<W, H>;
+
+#define LUMA_CU(W, H) \
+ p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \
+ p.cu[BLOCK_ ## W ## x ## H].psy_cost_ss = psyCost_ss<BLOCK_ ## W ## x ## H>; \
+ p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s = pixel_ssd_s_c<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].var = pixel_var<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].calcresidual = getResidual<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
+ p.cu[BLOCK_ ## W ## x ## H].sse_ss = sse<W, H, int16_t, int16_t>;
+
+ LUMA_PU(4, 4);
+ LUMA_PU(8, 8);
+ LUMA_PU(16, 16);
+ LUMA_PU(32, 32);
+ LUMA_PU(64, 64);
+ LUMA_PU(4, 8);
+ LUMA_PU(8, 4);
+ LUMA_PU(16, 8);
+ LUMA_PU(8, 16);
+ LUMA_PU(16, 12);
+ LUMA_PU(12, 16);
+ LUMA_PU(16, 4);
+ LUMA_PU(4, 16);
+ LUMA_PU(32, 16);
+ LUMA_PU(16, 32);
+ LUMA_PU(32, 24);
+ LUMA_PU(24, 32);
+ LUMA_PU(32, 8);
+ LUMA_PU(8, 32);
+ LUMA_PU(64, 32);
+ LUMA_PU(32, 64);
+ LUMA_PU(64, 48);
+ LUMA_PU(48, 64);
+ LUMA_PU(64, 16);
+ LUMA_PU(16, 64);
+
+ p.pu[LUMA_4x4].satd = satd_4x4;
+ p.pu[LUMA_8x8].satd = satd8<8, 8>;
+ p.pu[LUMA_8x4].satd = satd_8x4;
+ p.pu[LUMA_4x8].satd = satd4<4, 8>;
+ p.pu[LUMA_16x16].satd = satd8<16, 16>;
+ p.pu[LUMA_16x8].satd = satd8<16, 8>;
+ p.pu[LUMA_8x16].satd = satd8<8, 16>;
+ p.pu[LUMA_16x12].satd = satd8<16, 12>;
+ p.pu[LUMA_12x16].satd = satd4<12, 16>;
+ p.pu[LUMA_16x4].satd = satd8<16, 4>;
+ p.pu[LUMA_4x16].satd = satd4<4, 16>;
+ p.pu[LUMA_32x32].satd = satd8<32, 32>;
+ p.pu[LUMA_32x16].satd = satd8<32, 16>;
+ p.pu[LUMA_16x32].satd = satd8<16, 32>;
+ p.pu[LUMA_32x24].satd = satd8<32, 24>;
+ p.pu[LUMA_24x32].satd = satd8<24, 32>;
+ p.pu[LUMA_32x8].satd = satd8<32, 8>;
+ p.pu[LUMA_8x32].satd = satd8<8, 32>;
+ p.pu[LUMA_64x64].satd = satd8<64, 64>;
+ p.pu[LUMA_64x32].satd = satd8<64, 32>;
+ p.pu[LUMA_32x64].satd = satd8<32, 64>;
+ p.pu[LUMA_64x48].satd = satd8<64, 48>;
+ p.pu[LUMA_48x64].satd = satd8<48, 64>;
+ p.pu[LUMA_64x16].satd = satd8<64, 16>;
+ p.pu[LUMA_16x64].satd = satd8<16, 64>;
+
+ LUMA_CU(4, 4);
+ LUMA_CU(8, 8);
+ LUMA_CU(16, 16);
+ LUMA_CU(32, 32);
+ LUMA_CU(64, 64);
+
+ p.cu[BLOCK_4x4].sa8d = satd_4x4;
+ p.cu[BLOCK_8x8].sa8d = sa8d_8x8;
+ p.cu[BLOCK_16x16].sa8d = sa8d_16x16;
+ p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
+ p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
+
+#define CHROMA_PU_420(W, H) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
+
+ CHROMA_PU_420(2, 2);
+ CHROMA_PU_420(2, 4);
+ CHROMA_PU_420(4, 4);
+ CHROMA_PU_420(8, 8);
+ CHROMA_PU_420(16, 16);
+ CHROMA_PU_420(32, 32);
+ CHROMA_PU_420(4, 2);
+ CHROMA_PU_420(8, 4);
+ CHROMA_PU_420(4, 8);
+ CHROMA_PU_420(8, 6);
+ CHROMA_PU_420(6, 8);
+ CHROMA_PU_420(8, 2);
+ CHROMA_PU_420(2, 8);
+ CHROMA_PU_420(16, 8);
+ CHROMA_PU_420(8, 16);
+ CHROMA_PU_420(16, 12);
+ CHROMA_PU_420(12, 16);
+ CHROMA_PU_420(16, 4);
+ CHROMA_PU_420(4, 16);
+ CHROMA_PU_420(32, 16);
+ CHROMA_PU_420(16, 32);
+ CHROMA_PU_420(32, 24);
+ CHROMA_PU_420(24, 32);
+ CHROMA_PU_420(32, 8);
+ CHROMA_PU_420(8, 32);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = satd_4x4;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8<8, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8<16, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8<32, 32>;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = satd_8x4;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = satd4<4, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8<16, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8<8, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8<32, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8<16, 32>;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4<16, 12>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4<12, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd4<16, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = satd4<4, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8<32, 24>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8<24, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8<32, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8<8, 32>;
+
+#define CHROMA_CU_420(W, H) \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
+
+ CHROMA_CU_420(2, 2)
+ CHROMA_CU_420(4, 4)
+ CHROMA_CU_420(8, 8)
+ CHROMA_CU_420(16, 16)
+ CHROMA_CU_420(32, 32)
+
+ p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
+ p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
+
+#define CHROMA_PU_422(W, H) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
+
+ CHROMA_PU_422(2, 4);
+ CHROMA_PU_422(4, 8);
+ CHROMA_PU_422(8, 16);
+ CHROMA_PU_422(16, 32);
+ CHROMA_PU_422(32, 64);
+ CHROMA_PU_422(4, 4);
+ CHROMA_PU_422(2, 8);
+ CHROMA_PU_422(8, 8);
+ CHROMA_PU_422(4, 16);
+ CHROMA_PU_422(8, 12);
+ CHROMA_PU_422(6, 16);
+ CHROMA_PU_422(8, 4);
+ CHROMA_PU_422(2, 16);
+ CHROMA_PU_422(16, 16);
+ CHROMA_PU_422(8, 32);
+ CHROMA_PU_422(16, 24);
+ CHROMA_PU_422(12, 32);
+ CHROMA_PU_422(16, 8);
+ CHROMA_PU_422(4, 32);
+ CHROMA_PU_422(32, 32);
+ CHROMA_PU_422(16, 64);
+ CHROMA_PU_422(32, 48);
+ CHROMA_PU_422(24, 64);
+ CHROMA_PU_422(32, 16);
+ CHROMA_PU_422(8, 64);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4<4, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8<8, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8<16, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8<32, 64>;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = satd_4x4;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = satd8<8, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4<4, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8<16, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = satd8<8, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8<32, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8<16, 64>;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd4<8, 12>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd4<8, 4>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8<16, 24>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4<12, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8<16, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4<4, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8<32, 48>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8<24, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8<32, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = satd8<8, 64>;
+
+#define CHROMA_CU_422(W, H) \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
+
+ CHROMA_CU_422(2, 4)
+ CHROMA_CU_422(4, 8)
+ CHROMA_CU_422(8, 16)
+ CHROMA_CU_422(16, 32)
+ CHROMA_CU_422(32, 64)
+
+ p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
+ p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
p.weight_pp = weight_pp_c;
p.weight_sp = weight_sp_c;
p.scale1D_128to64 = scale1D_128to64;
p.scale2D_64to32 = scale2D_64to32;
- p.frame_init_lowres_core = frame_init_lowres_core;
+ p.frameInitLowres = frame_init_lowres_core;
p.ssim_4x4x2_core = ssim_4x4x2_core;
p.ssim_end_4 = ssim_end_4;
- p.var[BLOCK_8x8] = pixel_var<8>;
- p.var[BLOCK_16x16] = pixel_var<16>;
- p.var[BLOCK_32x32] = pixel_var<32>;
- p.var[BLOCK_64x64] = pixel_var<64>;
- p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
p.planecopy_cp = planecopy_cp_c;
p.planecopy_sp = planecopy_sp_c;
p.propagateCost = estimateCUPropagateCost;
diff --git a/source/common/predict.cpp b/source/common/predict.cpp
index a142c5a..27d960e 100644
--- a/source/common/predict.cpp
+++ b/source/common/predict.cpp
@@ -30,28 +30,25 @@
using namespace x265;
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant
+#endif
+
namespace
{
inline pixel weightBidir(int w0, int16_t P0, int w1, int16_t P1, int round, int shift, int offset)
{
- return Clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset << (shift - 1))) >> shift);
+ return x265_clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset << (shift - 1))) >> shift);
}
}
Predict::Predict()
{
- m_predBuf = NULL;
- m_refAbove = NULL;
- m_refAboveFlt = NULL;
- m_refLeft = NULL;
- m_refLeftFlt = NULL;
m_immedVals = NULL;
}
Predict::~Predict()
{
- X265_FREE(m_predBuf);
- X265_FREE(m_refAbove);
X265_FREE(m_immedVals);
m_predShortYuv[0].destroy();
m_predShortYuv[1].destroy();
@@ -62,16 +59,7 @@ bool Predict::allocBuffers(int csp)
m_csp = csp;
m_hChromaShift = CHROMA_H_SHIFT(csp);
m_vChromaShift = CHROMA_V_SHIFT(csp);
-
- int predBufHeight = ((MAX_CU_SIZE + 2) << 4);
- int predBufStride = ((MAX_CU_SIZE + 8) << 4);
- CHECKED_MALLOC(m_predBuf, pixel, predBufStride * predBufHeight);
CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1));
- CHECKED_MALLOC(m_refAbove, pixel, 12 * MAX_CU_SIZE);
-
- m_refAboveFlt = m_refAbove + 3 * MAX_CU_SIZE;
- m_refLeft = m_refAboveFlt + 3 * MAX_CU_SIZE;
- m_refLeftFlt = m_refLeft + 3 * MAX_CU_SIZE;
return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp);
@@ -81,67 +69,47 @@ fail:
void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize)
{
+ int sizeIdx = log2TrSize - 2;
int tuSize = 1 << log2TrSize;
-
- pixel *refLft, *refAbv;
-
- if (!(g_intraFilterFlags[dirMode] & tuSize))
- {
- refLft = m_refLeft + tuSize - 1;
- refAbv = m_refAbove + tuSize - 1;
- }
- else
- {
- refLft = m_refLeftFlt + tuSize - 1;
- refAbv = m_refAboveFlt + tuSize - 1;
- }
+ int filter = !!(g_intraFilterFlags[dirMode] & tuSize);
+ X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
bool bFilter = log2TrSize <= 4;
- int sizeIdx = log2TrSize - 2;
- X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
- primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft, refAbv, dirMode, bFilter);
+ primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, intraNeighbourBuf[filter], dirMode, bFilter);
}
-void Predict::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
+void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
{
int tuSize = 1 << log2TrSizeC;
int tuSize2 = tuSize << 1;
- // Create the prediction
- const int bufOffset = tuSize - 1;
- pixel buf0[3 * MAX_CU_SIZE];
- pixel buf1[3 * MAX_CU_SIZE];
- pixel* above;
- pixel* left = buf0 + bufOffset;
-
- int limit = (dirMode <= 25 && dirMode >= 11) ? (tuSize + 1 + 1) : (tuSize2 + 1);
- for (int k = 0; k < limit; k++)
- left[k] = src[k * ADI_BUF_STRIDE];
+ pixel* srcBuf = intraNeighbourBuf[0];
if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
{
- // generate filtered intra prediction samples
- buf0[bufOffset - 1] = src[1];
- left = buf1 + bufOffset;
- for (int i = 0; i < tuSize2; i++)
- left[i] = (buf0[bufOffset + i - 1] + 2 * buf0[bufOffset + i] + buf0[bufOffset + i + 1] + 2) >> 2;
- left[tuSize2] = buf0[bufOffset + tuSize2];
-
- above = buf0 + bufOffset;
- above[0] = left[0];
+ pixel* fltBuf = intraNeighbourBuf[1];
+ pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2];
+
+ // filtering top
for (int i = 1; i < tuSize2; i++)
- above[i] = (src[i - 1] + 2 * src[i] + src[i + 1] + 2) >> 2;
- above[tuSize2] = src[tuSize2];
- }
- else
- {
- above = buf1 + bufOffset;
- memcpy(above, src, (tuSize2 + 1) * sizeof(pixel));
+ fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2] = topLast;
+
+ // filtering top-left
+ fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2;
+
+ //filtering left
+ fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2;
+ for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+ fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2 + tuSize2] = leftLast;
+
+ srcBuf = intraNeighbourBuf[1];
}
int sizeIdx = log2TrSizeC - 2;
X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
- primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
+ primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0);
}
void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
@@ -187,18 +155,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
- predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
- predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
}
}
else
@@ -253,13 +221,13 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
if (bLuma)
{
- predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
- predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+ predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
}
if (bChroma)
{
- predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
- predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+ predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
}
if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
@@ -277,18 +245,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
- predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
- predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
if (bChroma)
- predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]);
+ predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
}
}
else
@@ -302,18 +270,18 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
ShortYuv& shortYuv = m_predShortYuv[0];
if (bLuma)
- predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
if (bChroma)
- predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
}
else
{
if (bLuma)
- predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
if (bChroma)
- predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]);
+ predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
}
}
}
@@ -321,41 +289,35 @@ void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
{
- pixel *dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
+ pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
intptr_t dstStride = dstYuv.m_size;
intptr_t srcStride = refPic.m_stride;
intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
- pixel* src = const_cast<PicYuv&>(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+ const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
int xFrac = mv.x & 0x3;
int yFrac = mv.y & 0x3;
if (!(yFrac | xFrac))
- primitives.luma_copy_pp[partEnum](dst, dstStride, src, srcStride);
+ primitives.pu[partEnum].copy_pp(dst, dstStride, src, srcStride);
else if (!yFrac)
- primitives.luma_hpp[partEnum](src, srcStride, dst, dstStride, xFrac);
+ primitives.pu[partEnum].luma_hpp(src, srcStride, dst, dstStride, xFrac);
else if (!xFrac)
- primitives.luma_vpp[partEnum](src, srcStride, dst, dstStride, yFrac);
+ primitives.pu[partEnum].luma_vpp(src, srcStride, dst, dstStride, yFrac);
else
- {
- int tmpStride = m_puWidth;
- int filterSize = NTAPS_LUMA;
- int halfFilterSize = (filterSize >> 1);
- primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1);
- primitives.luma_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
- }
+ primitives.pu[partEnum].luma_hvpp(src, srcStride, dst, dstStride, xFrac, yFrac);
}
void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
{
- int16_t *dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
+ int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
int dstStride = dstSYuv.m_size;
intptr_t srcStride = refPic.m_stride;
intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
- pixel *src = const_cast<PicYuv&>(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+ const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
int xFrac = mv.x & 0x3;
int yFrac = mv.y & 0x3;
@@ -368,16 +330,16 @@ void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const
if (!(yFrac | xFrac))
primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight);
else if (!yFrac)
- primitives.luma_hps[partEnum](src, srcStride, dst, dstStride, xFrac, 0);
+ primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
else if (!xFrac)
- primitives.luma_vps[partEnum](src, srcStride, dst, dstStride, yFrac);
+ primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
else
{
int tmpStride = m_puWidth;
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
- primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1);
- primitives.luma_vss[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
+ primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
+ primitives.pu[partEnum].luma_vss(m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
}
}
@@ -391,8 +353,8 @@ void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV&
intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
- pixel* refCb = const_cast<PicYuv&>(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
- pixel* refCr = const_cast<PicYuv&>(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx);
pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx);
@@ -404,18 +366,18 @@ void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV&
if (!(yFrac | xFrac))
{
- primitives.chroma[m_csp].copy_pp[partEnum](dstCb, dstStride, refCb, refStride);
- primitives.chroma[m_csp].copy_pp[partEnum](dstCr, dstStride, refCr, refStride);
+ primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCb, dstStride, refCb, refStride);
+ primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCr, dstStride, refCr, refStride);
}
else if (!yFrac)
{
- primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
- primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
}
else if (!xFrac)
{
- primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
- primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
}
else
{
@@ -423,11 +385,11 @@ void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV&
int filterSize = NTAPS_CHROMA;
int halfFilterSize = (filterSize >> 1);
- primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
- primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+ primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
- primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
- primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+ primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
}
}
@@ -441,8 +403,8 @@ void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, cons
intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
- pixel* refCb = const_cast<PicYuv&>(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
- pixel* refCr = const_cast<PicYuv&>(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+ const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx);
int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx);
@@ -459,28 +421,28 @@ void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, cons
if (!(yFrac | xFrac))
{
- primitives.chroma_p2s[m_csp](refCb, refStride, dstCb, cxWidth, cxHeight);
- primitives.chroma_p2s[m_csp](refCr, refStride, dstCr, cxWidth, cxHeight);
+ primitives.chroma[m_csp].p2s(refCb, refStride, dstCb, cxWidth, cxHeight);
+ primitives.chroma[m_csp].p2s(refCr, refStride, dstCr, cxWidth, cxHeight);
}
else if (!yFrac)
{
- primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
- primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
+ primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
+ primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
}
else if (!xFrac)
{
- primitives.chroma[m_csp].filter_vps[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
- primitives.chroma[m_csp].filter_vps[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
}
else
{
int extStride = cxWidth;
int filterSize = NTAPS_CHROMA;
int halfFilterSize = (filterSize >> 1);
- primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
- primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
- primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
- primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+ primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+ primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+ primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
}
}
@@ -492,20 +454,12 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv&
int w0, w1, offset, shiftNum, shift, round;
uint32_t src0Stride, src1Stride, dststride;
- pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
- pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
- pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-
- const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
- const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
- const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
-
- const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
- const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
- const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
-
if (bLuma)
{
+ pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
+ const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
+ const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
+
// Luma
w0 = wp0[0].w;
offset = wp0[0].o + wp1[0].o;
@@ -542,6 +496,13 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv&
if (bChroma)
{
+ pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
+ pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
+ const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
+ const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
+ const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
+ const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
+
// Chroma U
w0 = wp0[1].w;
offset = wp0[1].o + wp1[1].o;
@@ -602,19 +563,14 @@ void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv&
/* weighted averaging for uni-pred */
void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
{
- pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
- pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
- pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-
- const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
- const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
- const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
-
int w0, offset, shiftNum, shift, round;
uint32_t srcStride, dstStride;
if (bLuma)
{
+ pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
+ const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
+
// Luma
w0 = wp[0].w;
offset = wp[0].offset;
@@ -624,11 +580,16 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal
srcStride = srcYuv.m_size;
dstStride = predYuv.m_size;
- primitives.weight_sp(const_cast<int16_t*>(srcY0), dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
+ primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
}
if (bChroma)
{
+ pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
+ pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
+ const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
+ const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
+
// Chroma U
w0 = wp[1].w;
offset = wp[1].offset;
@@ -642,7 +603,7 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal
uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift;
uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift;
- primitives.weight_sp(const_cast<int16_t*>(srcU0), dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
+ primitives.weight_sp(srcU0, dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
// Chroma V
w0 = wp[2].w;
@@ -650,110 +611,91 @@ void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightVal
shift = wp[2].shift + shiftNum;
round = shift ? (1 << (shift - 1)) : 0;
- primitives.weight_sp(const_cast<int16_t*>(srcV0), dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
+ primitives.weight_sp(srcV0, dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
}
}
-void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode)
+void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
{
- IntraNeighbors intraNeighbors;
- initIntraNeighbors(cu, absPartIdx, partDepth, true, &intraNeighbors);
-
- pixel* adiBuf = m_predBuf;
- pixel* refAbove = m_refAbove;
- pixel* refLeft = m_refLeft;
- pixel* refAboveFlt = m_refAboveFlt;
- pixel* refLeftFlt = m_refLeftFlt;
-
int tuSize = intraNeighbors.tuSize;
int tuSize2 = tuSize << 1;
- pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = cu.m_encData->m_reconPicYuv->m_stride;
+ pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
- fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
+ fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
- // initialization of ADI buffers
- const int bufOffset = tuSize - 1;
- refAbove += bufOffset;
- refLeft += bufOffset;
+ pixel* refBuf = intraNeighbourBuf[0];
+ pixel* fltBuf = intraNeighbourBuf[1];
- // ADI_BUF_STRIDE * (2 * tuSize + 1);
- memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel));
- for (int k = 0; k < tuSize2 + 1; k++)
- refLeft[k] = adiBuf[k * ADI_BUF_STRIDE];
+ pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
{
// generate filtered intra prediction samples
- refAboveFlt += bufOffset;
- refLeftFlt += bufOffset;
-
bool bStrongSmoothing = (tuSize == 32 && cu.m_slice->m_sps->bUseStrongIntraSmoothing);
if (bStrongSmoothing)
{
const int trSize = 32;
- const int trSize2 = 32 * 2;
+ const int trSize2 = trSize << 1;
const int threshold = 1 << (X265_DEPTH - 5);
- int refBL = refLeft[trSize2];
- int refTL = refAbove[0];
- int refTR = refAbove[trSize2];
- bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize]) < threshold &&
- abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
+
+ pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
+
+ bStrongSmoothing = abs (topLeft + topLast - (topMiddle << 1)) < threshold &&
+ abs (topLeft + leftLast - (leftMiddle << 1)) < threshold;
if (bStrongSmoothing)
{
// bilinear interpolation
- const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
- int init = (refTL << shift) + tuSize;
- int delta;
-
- refLeftFlt[0] = refAboveFlt[0] = refAbove[0];
+ const int shift = 5 + 1;
+ int init = (topLeft << shift) + tuSize;
+ int deltaL, deltaR;
- //TODO: Performance Primitive???
- delta = refBL - refTL;
- for (int i = 1; i < trSize2; i++)
- refLeftFlt[i] = (pixel)((init + delta * i) >> shift);
- refLeftFlt[trSize2] = refLeft[trSize2];
+ // TODO: Performance Primitive???
+ deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
- delta = refTR - refTL;
+ fltBuf[0] = topLeft;
for (int i = 1; i < trSize2; i++)
- refAboveFlt[i] = (pixel)((init + delta * i) >> shift);
- refAboveFlt[trSize2] = refAbove[trSize2];
+ {
+ fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
+ fltBuf[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering
+ }
+ fltBuf[trSize2] = topLast;
+ fltBuf[tuSize2 + trSize2] = leftLast;
return;
}
}
- refLeft[-1] = refAbove[1];
- for (int i = 0; i < tuSize2; i++)
- refLeftFlt[i] = (refLeft[i - 1] + 2 * refLeft[i] + refLeft[i + 1] + 2) >> 2;
- refLeftFlt[tuSize2] = refLeft[tuSize2];
-
- refAboveFlt[0] = refLeftFlt[0];
+ // filtering top
for (int i = 1; i < tuSize2; i++)
- refAboveFlt[i] = (refAbove[i - 1] + 2 * refAbove[i] + refAbove[i + 1] + 2) >> 2;
- refAboveFlt[tuSize2] = refAbove[tuSize2];
+ fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2] = topLast;
+
+ // filtering top-left
+ fltBuf[0] = ((topLeft << 1) + refBuf[1] + refBuf[tuSize2 + 1] + 2) >> 2;
+
+ // filtering left
+ fltBuf[tuSize2 + 1] = ((refBuf[tuSize2 + 1] << 1) + topLeft + refBuf[tuSize2 + 2] + 2) >> 2;
+ for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+ fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2 + tuSize2] = leftLast;
}
}
-void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId)
+void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
{
- IntraNeighbors intraNeighbors;
- initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
- uint32_t tuSize = intraNeighbors.tuSize;
-
- const pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = cu.m_encData->m_reconPicYuv->m_strideC;
- pixel* adiRef = getAdiChromaBuf(chromaId, tuSize);
+ const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
- fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
+ fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
}
-void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
+void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
{
- uint32_t log2TrSize = cu.m_log2CUSize[0] - partDepth;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
int log2UnitWidth = LOG2_UNIT_SIZE;
int log2UnitHeight = LOG2_UNIT_SIZE;
@@ -764,12 +706,12 @@ void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t
log2UnitHeight -= cu.m_vChromaShift;
}
- int numIntraNeighbor = 0;
- bool *bNeighborFlags = intraNeighbors->bNeighborFlags;
+ int numIntraNeighbor;
+ bool* bNeighborFlags = intraNeighbors->bNeighborFlags;
- uint32_t partIdxLT, partIdxRT, partIdxLB;
-
- cu.deriveLeftRightTopIdxAdi(partIdxLT, partIdxRT, absPartIdx, partDepth);
+ uint32_t numPartInWidth = 1 << (cu.m_log2CUSize[0] - LOG2_UNIT_SIZE - tuDepth);
+ uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
+ uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + numPartInWidth - 1];
uint32_t tuSize = 1 << log2TrSize;
int tuWidthInUnits = tuSize >> log2UnitWidth;
@@ -777,14 +719,26 @@ void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t
int aboveUnits = tuWidthInUnits << 1;
int leftUnits = tuHeightInUnits << 1;
int partIdxStride = cu.m_slice->m_sps->numPartInCUSize;
- partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
+ uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
- bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT);
- numIntraNeighbor += (int)(bNeighborFlags[leftUnits]);
- numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1));
- numIntraNeighbor += isAboveRightAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1 + tuWidthInUnits));
- numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1));
- numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1 - tuHeightInUnits));
+ if (cu.m_slice->isIntra() || !cu.m_slice->m_pps->bConstrainedIntraPred)
+ {
+ bNeighborFlags[leftUnits] = isAboveLeftAvailable<false>(cu, partIdxLT);
+ numIntraNeighbor = (int)(bNeighborFlags[leftUnits]);
+ numIntraNeighbor += isAboveAvailable<false>(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
+ numIntraNeighbor += isAboveRightAvailable<false>(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
+ numIntraNeighbor += isLeftAvailable<false>(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
+ numIntraNeighbor += isBelowLeftAvailable<false>(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
+ }
+ else
+ {
+ bNeighborFlags[leftUnits] = isAboveLeftAvailable<true>(cu, partIdxLT);
+ numIntraNeighbor = (int)(bNeighborFlags[leftUnits]);
+ numIntraNeighbor += isAboveAvailable<true>(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
+ numIntraNeighbor += isAboveRightAvailable<true>(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
+ numIntraNeighbor += isLeftAvailable<true>(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
+ numIntraNeighbor += isBelowLeftAvailable<true>(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
+ }
intraNeighbors->numIntraNeighbor = numIntraNeighbor;
intraNeighbors->totalUnits = aboveUnits + leftUnits + 1;
@@ -793,10 +747,9 @@ void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t
intraNeighbors->unitWidth = 1 << log2UnitWidth;
intraNeighbors->unitHeight = 1 << log2UnitHeight;
intraNeighbors->tuSize = tuSize;
- intraNeighbors->log2TrSize = log2TrSize;
}
-void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
+void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258])
{
const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));
int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
@@ -804,26 +757,28 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p
uint32_t tuSize = intraNeighbors.tuSize;
uint32_t refSize = tuSize * 2 + 1;
+ // Nothing is available, perform DC prediction.
if (numIntraNeighbor == 0)
{
- // Fill border with DC value
+ // Fill top border with DC value
for (uint32_t i = 0; i < refSize; i++)
- adiRef[i] = dcValue;
+ dst[i] = dcValue;
- for (uint32_t i = 1; i < refSize; i++)
- adiRef[i * ADI_BUF_STRIDE] = dcValue;
+ // Fill left border with DC value
+ for (uint32_t i = 0; i < refSize - 1; i++)
+ dst[i + refSize] = dcValue;
}
else if (numIntraNeighbor == totalUnits)
{
// Fill top border with rec. samples
const pixel* adiTemp = adiOrigin - picStride - 1;
- memcpy(adiRef, adiTemp, refSize * sizeof(*adiRef));
+ memcpy(dst, adiTemp, refSize * sizeof(pixel));
// Fill left border with rec. samples
adiTemp = adiOrigin - 1;
- for (uint32_t i = 1; i < refSize; i++)
+ for (uint32_t i = 0; i < refSize - 1; i++)
{
- adiRef[i * ADI_BUF_STRIDE] = adiTemp[0];
+ dst[i + refSize] = adiTemp[0];
adiTemp += picStride;
}
}
@@ -893,7 +848,7 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p
while (next < totalUnits && !bNeighborFlags[next])
next++;
- pixel *pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
+ pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
const pixel refSample = *pAdiLineNext;
// Pad unavailable samples with new value
int nextOrTop = X265_MIN(next, leftUnits);
@@ -940,120 +895,108 @@ void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, p
// Copy processed samples
adi = adiLineBuffer + refSize + unitWidth - 2;
- memcpy(adiRef, adi, refSize * sizeof(*adiRef));
+ memcpy(dst, adi, refSize * sizeof(pixel));
adi = adiLineBuffer + refSize - 1;
- for (int i = 1; i < (int)refSize; i++)
- adiRef[i * ADI_BUF_STRIDE] = adi[-i];
+ for (int i = 0; i < (int)refSize - 1; i++)
+ dst[i + refSize] = adi[-(i + 1)];
}
}
+template<bool cip>
bool Predict::isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT)
{
uint32_t partAboveLeft;
const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft, partIdxLT);
- if (!cu.m_slice->m_pps->bConstrainedIntraPred)
- return cuAboveLeft ? true : false;
- else
- return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
+ return cuAboveLeft && (!cip || cuAboveLeft->isIntra(partAboveLeft));
}
-int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags)
+template<bool cip>
+int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
{
const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
- const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1;
+ const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
const uint32_t idxStep = 1;
- bool *validFlagPtr = bValidFlags;
int numIntra = 0;
- for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
+ for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags++)
{
uint32_t partAbove;
const CUData* cuAbove = cu.getPUAbove(partAbove, g_rasterToZscan[rasterPart]);
- if (cuAbove && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAbove->isIntra(partAbove)))
+ if (cuAbove && (!cip || cuAbove->isIntra(partAbove)))
{
numIntra++;
- *validFlagPtr = true;
+ *bValidFlags = true;
}
else
- *validFlagPtr = false;
-
- validFlagPtr++;
+ *bValidFlags = false;
}
return numIntra;
}
-int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags)
+template<bool cip>
+int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
{
const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
- const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1;
+ const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
- bool *validFlagPtr = bValidFlags;
int numIntra = 0;
- for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
+ for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
{
uint32_t partLeft;
const CUData* cuLeft = cu.getPULeft(partLeft, g_rasterToZscan[rasterPart]);
- if (cuLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuLeft->isIntra(partLeft)))
+ if (cuLeft && (!cip || cuLeft->isIntra(partLeft)))
{
numIntra++;
- *validFlagPtr = true;
+ *bValidFlags = true;
}
else
- *validFlagPtr = false;
-
- validFlagPtr--; // opposite direction
+ *bValidFlags = false;
}
return numIntra;
}
-int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags)
+template<bool cip>
+int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits)
{
- const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1;
- bool *validFlagPtr = bValidFlags;
int numIntra = 0;
- for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
+ for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
{
uint32_t partAboveRight;
const CUData* cuAboveRight = cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
- if (cuAboveRight && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAboveRight->isIntra(partAboveRight)))
+ if (cuAboveRight && (!cip || cuAboveRight->isIntra(partAboveRight)))
{
numIntra++;
- *validFlagPtr = true;
+ *bValidFlags = true;
}
else
- *validFlagPtr = false;
-
- validFlagPtr++;
+ *bValidFlags = false;
}
return numIntra;
}
-int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags)
+template<bool cip>
+int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits)
{
- const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1;
- bool *validFlagPtr = bValidFlags;
int numIntra = 0;
- for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
+ for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags--) // opposite direction
{
uint32_t partBelowLeft;
const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft, partIdxLB, offset);
- if (cuBelowLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuBelowLeft->isIntra(partBelowLeft)))
+ if (cuBelowLeft && (!cip || cuBelowLeft->isIntra(partBelowLeft)))
{
numIntra++;
- *validFlagPtr = true;
+ *bValidFlags = true;
}
else
- *validFlagPtr = false;
-
- validFlagPtr--; // opposite direction
+ *bValidFlags = false;
}
return numIntra;
diff --git a/source/common/predict.h b/source/common/predict.h
index a76a32c..618d0e5 100644
--- a/source/common/predict.h
+++ b/source/common/predict.h
@@ -57,20 +57,14 @@ public:
int unitWidth;
int unitHeight;
int tuSize;
- uint32_t log2TrSize;
bool bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
};
ShortYuv m_predShortYuv[2]; /* temporary storage for weighted prediction */
int16_t* m_immedVals;
- /* Intra prediction buffers */
- pixel* m_predBuf;
- pixel* m_refAbove;
- pixel* m_refAboveFlt;
- pixel* m_refLeft;
- pixel* m_refLeftFlt;
-
+ // Unfiltered/filtered neighbours of the current partition.
+ pixel intraNeighbourBuf[2][258];
/* Slice information */
const Slice* m_predSlice;
int m_csp;
@@ -105,14 +99,18 @@ public:
void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
/* Intra prediction helper functions */
- static void initIntraNeighbors(const CUData& cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
- static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);
-
+ static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
+ static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258]);
+ template<bool cip>
static bool isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT);
+ template<bool cip>
static int isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
+ template<bool cip>
static int isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
- static int isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
- static int isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
+ template<bool cip>
+ static int isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
+ template<bool cip>
+ static int isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);
public:
@@ -123,14 +121,9 @@ public:
/* Angular Intra */
void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
- void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
-
- void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode);
- void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId);
- pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize)
- {
- return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE * (tuSize * 2 + 1));
- }
+ void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
+ void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+ void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
};
}
diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp
index 7592d27..35b7a2c 100644
--- a/source/common/primitives.cpp
+++ b/source/common/primitives.cpp
@@ -51,62 +51,139 @@ extern const uint8_t lumaPartitionMapTable[] =
/* the "authoritative" set of encoder primitives */
EncoderPrimitives primitives;
-void Setup_C_PixelPrimitives(EncoderPrimitives &p);
-void Setup_C_DCTPrimitives(EncoderPrimitives &p);
-void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
-void Setup_C_IPredPrimitives(EncoderPrimitives &p);
-void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
+void setupPixelPrimitives_c(EncoderPrimitives &p);
+void setupDCTPrimitives_c(EncoderPrimitives &p);
+void setupFilterPrimitives_c(EncoderPrimitives &p);
+void setupIntraPrimitives_c(EncoderPrimitives &p);
+void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
-void Setup_C_Primitives(EncoderPrimitives &p)
+void setupCPrimitives(EncoderPrimitives &p)
{
- Setup_C_PixelPrimitives(p); // pixel.cpp
- Setup_C_DCTPrimitives(p); // dct.cpp
- Setup_C_IPFilterPrimitives(p); // ipfilter.cpp
- Setup_C_IPredPrimitives(p); // intrapred.cpp
- Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
+ setupPixelPrimitives_c(p); // pixel.cpp
+ setupDCTPrimitives_c(p); // dct.cpp
+ setupFilterPrimitives_c(p); // ipfilter.cpp
+ setupIntraPrimitives_c(p); // intrapred.cpp
+ setupLoopFilterPrimitives_c(p); // loopfilter.cpp
}
-void Setup_Alias_Primitives(EncoderPrimitives &p)
+void setupAliasPrimitives(EncoderPrimitives &p)
{
- /* copy reusable luma primitives to chroma 4:4:4 */
- for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
+#if HIGH_BIT_DEPTH
+ /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
+ for (int i = 0; i < NUM_CU_SIZES; i++)
{
- p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
- p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
- p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
- p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
- p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
+ p.cu[i].sse_pp = (pixelcmp_t)p.cu[i].sse_ss;
+
+ p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
+ p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
+ p.cu[i].copy_ss = (copy_ss_t)p.pu[i].copy_pp;
+
+ p.chroma[X265_CSP_I420].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
+ p.chroma[X265_CSP_I420].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
+ p.chroma[X265_CSP_I420].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
+
+ p.chroma[X265_CSP_I422].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
+ p.chroma[X265_CSP_I422].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
+ p.chroma[X265_CSP_I422].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
}
+#endif
+
+ /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */
+
+ p.chroma[X265_CSP_I444].p2s = p.luma_p2s;
+ p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL;
- for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
+ for (int i = 0; i < NUM_PU_SIZES; i++)
{
- p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
- p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
+ p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
+ p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg;
+ p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
}
- for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
+ for (int i = 0; i < NUM_CU_SIZES; i++)
{
- int partL = partitionFromLog2Size(i + 2);
- p.square_copy_pp[i] = p.luma_copy_pp[partL];
- p.square_copy_ps[i] = p.luma_copy_ps[partL];
- p.square_copy_sp[i] = p.luma_copy_sp[partL];
- p.square_copy_ss[i] = p.luma_copy_ss[partL];
+ p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d;
+ p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
+ p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
+ p.chroma[X265_CSP_I444].cu[i].add_ps = p.cu[i].add_ps;
+ p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
+ p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
+ p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
}
- primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4];
- primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8];
- primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
- primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];
- primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];
-
- // SA8D devolves to SATD for blocks not even multiples of 8x8
- primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4];
- primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8];
- primitives.sa8d_inter[LUMA_4x16] = primitives.satd[LUMA_4x16];
- primitives.sa8d_inter[LUMA_8x4] = primitives.satd[LUMA_8x4];
- primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4];
- primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12];
- primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16];
+ p.cu[BLOCK_4x4].sa8d = p.pu[LUMA_4x4].satd;
+
+ /* Chroma PU can often use luma satd primitives */
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = p.pu[LUMA_4x4].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = p.pu[LUMA_8x8].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = p.pu[LUMA_16x16].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = p.pu[LUMA_32x32].satd;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = p.pu[LUMA_8x4].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = p.pu[LUMA_4x8].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = p.pu[LUMA_16x8].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = p.pu[LUMA_8x16].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = p.pu[LUMA_32x16].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = p.pu[LUMA_16x32].satd;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = p.pu[LUMA_16x12].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = p.pu[LUMA_12x16].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = p.pu[LUMA_16x4].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = p.pu[LUMA_4x16].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = p.pu[LUMA_32x24].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = p.pu[LUMA_24x32].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = p.pu[LUMA_32x8].satd;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = p.pu[LUMA_8x32].satd;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = p.pu[LUMA_4x8].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = p.pu[LUMA_8x16].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = p.pu[LUMA_16x32].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = p.pu[LUMA_32x64].satd;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = p.pu[LUMA_4x4].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = p.pu[LUMA_8x8].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = p.pu[LUMA_4x16].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = p.pu[LUMA_16x16].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = p.pu[LUMA_8x32].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = p.pu[LUMA_32x32].satd;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = p.pu[LUMA_16x64].satd;
+
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x12] = satd4<8, 12>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = p.pu[LUMA_8x4].satd;
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_16x24] = satd8<16, 24>;
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_12x32] = satd4<12, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = p.pu[LUMA_16x8].satd;
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_4x32] = satd4<4, 32>;
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_32x48] = satd8<32, 48>;
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_24x64] = satd8<24, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = p.pu[LUMA_32x16].satd;
+ //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x64] = satd8<8, 64>;
+
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sa8d = NULL;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = p.pu[LUMA_4x4].satd;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = p.cu[BLOCK_8x8].sa8d;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = p.cu[BLOCK_16x16].sa8d;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = p.cu[BLOCK_32x32].sa8d;
+
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sa8d = NULL;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sa8d = p.pu[LUMA_4x8].satd;
+
+ /* alias CU copy_pp from square PU copy_pp */
+ for (int i = 0; i < NUM_CU_SIZES; i++)
+ {
+ p.cu[i].copy_pp = p.pu[i].copy_pp;
+
+ for (int c = 0; c < X265_CSP_COUNT; c++)
+ p.chroma[c].cu[i].copy_pp = p.chroma[c].pu[i].copy_pp;
+ }
+
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sse_pp = NULL;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = p.cu[BLOCK_4x4].sse_pp;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = p.cu[BLOCK_8x8].sse_pp;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = p.cu[BLOCK_16x16].sse_pp;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = p.cu[BLOCK_32x32].sse_pp;
+
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
}
}
using namespace x265;
@@ -120,20 +197,24 @@ void x265_setup_primitives(x265_param *param, int cpuid)
cpuid = x265::cpu_detect();
// initialize global variables
- if (!primitives.sad[0])
+ if (!primitives.pu[0].sad)
{
- Setup_C_Primitives(primitives);
- Setup_Instrinsic_Primitives(primitives, cpuid);
+ setupCPrimitives(primitives);
+
+ /* We do not want the encoder to use the un-optimized intra all-angles
+ * C references. It is better to call the individual angle functions
+ * instead. We must check for NULL before using this primitive */
+ for (int i = 0; i < NUM_TR_SIZE; i++)
+ primitives.cu[i].intra_pred_allangs = NULL;
#if ENABLE_ASSEMBLY
- Setup_Assembly_Primitives(primitives, cpuid);
+ setupInstrinsicPrimitives(primitives, cpuid);
+ setupAssemblyPrimitives(primitives, cpuid);
#else
x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
#endif
- Setup_Alias_Primitives(primitives);
-
- initROM();
+ setupAliasPrimitives(primitives);
}
if (param->logLevel >= X265_LOG_INFO)
@@ -169,74 +250,14 @@ void x265_setup_primitives(x265_param *param, int cpuid)
}
}
-#if !defined(ENABLE_ASSEMBLY)
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
+#if ENABLE_ASSEMBLY
+/* these functions are implemented in assembly. When assembly is not being
+ * compiled, they are unnecessary and can be NOPs */
+#else
extern "C" {
-// the intrinsic primitives will not use MMX instructions, so if assembly
-// is disabled there should be no reason to use EMMS.
+int x265_cpu_cpuid_test(void) { return 0; }
void x265_cpu_emms(void) {}
-
-#if defined(X265_ARCH_X86)
-
-#if defined(_MSC_VER)
-# pragma warning(disable: 4100)
-#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
-# define __cpuidex(regsArray, level, index) \
- __asm__ __volatile__ ("cpuid" \
- : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \
- : "0" (level), "2" (index));
-#else
-# error "compiler not supported"
-#endif
-
-int x265_cpu_cpuid_test(void)
-{
- return 0;
-}
-
-void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
-{
- int output[4];
-
- __cpuidex(output, op, 0);
- *eax = output[0];
- *ebx = output[1];
- *ecx = output[2];
- *edx = output[3];
+void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {}
+void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
}
-
-void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
-{
- uint64_t out = 0;
-
-#if X265_ARCH_X86
-
-#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
- // MSVC 2010 SP1 or later, or similar Intel release
- out = _xgetbv(op);
-
-#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
-
- uint32_t a, d;
- __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op) :);
- *eax = a;
- *edx = d;
- return;
-
-#elif defined(_WIN64) // On x64 with older compilers, this is impossible
-
-#endif // if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
-#endif // if x86
-
- *eax = (uint32_t)out;
- *edx = (uint32_t)(out >> 32);
-}
-
-#endif // X265_ARCH_X86
-}
-#endif // if !ENABLE_ASSEMBLY
+#endif
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 8300c21..925e8b6 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -36,85 +36,308 @@
namespace x265 {
// x265 private namespace
-enum LumaPartitions
+enum LumaPU
{
- // Square
+ // Square (the first 5 PUs match the block sizes)
LUMA_4x4, LUMA_8x8, LUMA_16x16, LUMA_32x32, LUMA_64x64,
// Rectangular
LUMA_8x4, LUMA_4x8,
- LUMA_16x8, LUMA_8x16,
+ LUMA_16x8, LUMA_8x16,
LUMA_32x16, LUMA_16x32,
LUMA_64x32, LUMA_32x64,
// Asymmetrical (0.75, 0.25)
LUMA_16x12, LUMA_12x16, LUMA_16x4, LUMA_4x16,
LUMA_32x24, LUMA_24x32, LUMA_32x8, LUMA_8x32,
LUMA_64x48, LUMA_48x64, LUMA_64x16, LUMA_16x64,
- NUM_LUMA_PARTITIONS
+ NUM_PU_SIZES
};
-// 4:2:0 chroma partition sizes. These enums are just a convenience for indexing into the
-// chroma primitive arrays when instantiating templates. The function tables should always
-// be indexed by the luma partition enum
-enum Chroma420Partitions
+enum LumaCU // can be indexed using log2n(width)-2
{
- CHROMA_2x2, CHROMA_4x4, CHROMA_8x8, CHROMA_16x16, CHROMA_32x32,
- CHROMA_4x2, CHROMA_2x4,
- CHROMA_8x4, CHROMA_4x8,
- CHROMA_16x8, CHROMA_8x16,
- CHROMA_32x16, CHROMA_16x32,
- CHROMA_8x6, CHROMA_6x8, CHROMA_8x2, CHROMA_2x8,
- CHROMA_16x12, CHROMA_12x16, CHROMA_16x4, CHROMA_4x16,
- CHROMA_32x24, CHROMA_24x32, CHROMA_32x8, CHROMA_8x32,
- NUM_CHROMA_PARTITIONS
+ BLOCK_4x4,
+ BLOCK_8x8,
+ BLOCK_16x16,
+ BLOCK_32x32,
+ BLOCK_64x64,
+ NUM_CU_SIZES
};
-enum Chroma422Partitions
+enum { NUM_TR_SIZE = 4 }; // TU are 4x4, 8x8, 16x16, and 32x32
+
+
+/* Chroma partition sizes. These enums are only a convenience for indexing into
+ * the chroma primitive arrays when instantiating macros or templates. The
+ * chroma function tables should always be indexed by a LumaPU enum when used. */
+enum ChromaPU420
{
- CHROMA422_2x4, CHROMA422_4x8, CHROMA422_8x16, CHROMA422_16x32, CHROMA422_32x64,
- CHROMA422_4x4, CHROMA422_2x8,
- CHROMA422_8x8, CHROMA422_4x16,
- CHROMA422_16x16, CHROMA422_8x32,
- CHROMA422_32x32, CHROMA422_16x64,
- CHROMA422_8x12, CHROMA422_6x16, CHROMA422_8x4, CHROMA422_2x16,
- CHROMA422_16x24, CHROMA422_12x32, CHROMA422_16x8, CHROMA422_4x32,
- CHROMA422_32x48, CHROMA422_24x64, CHROMA422_32x16, CHROMA422_8x64,
- NUM_CHROMA_PARTITIONS422
+ CHROMA_420_2x2, CHROMA_420_4x4, CHROMA_420_8x8, CHROMA_420_16x16, CHROMA_420_32x32,
+ CHROMA_420_4x2, CHROMA_420_2x4,
+ CHROMA_420_8x4, CHROMA_420_4x8,
+ CHROMA_420_16x8, CHROMA_420_8x16,
+ CHROMA_420_32x16, CHROMA_420_16x32,
+ CHROMA_420_8x6, CHROMA_420_6x8, CHROMA_420_8x2, CHROMA_420_2x8,
+ CHROMA_420_16x12, CHROMA_420_12x16, CHROMA_420_16x4, CHROMA_420_4x16,
+ CHROMA_420_32x24, CHROMA_420_24x32, CHROMA_420_32x8, CHROMA_420_8x32,
};
-enum SquareBlocks // Routines can be indexed using log2n(width)-2
+enum ChromaCU420
{
- BLOCK_4x4,
- BLOCK_8x8,
- BLOCK_16x16,
- BLOCK_32x32,
- BLOCK_64x64,
- NUM_SQUARE_BLOCKS
+ BLOCK_420_2x2,
+ BLOCK_420_4x4,
+ BLOCK_420_8x8,
+ BLOCK_420_16x16,
+ BLOCK_420_32x32
};
-enum { NUM_TR_SIZE = 4 };
+enum ChromaPU422
+{
+ CHROMA_422_2x4, CHROMA_422_4x8, CHROMA_422_8x16, CHROMA_422_16x32, CHROMA_422_32x64,
+ CHROMA_422_4x4, CHROMA_422_2x8,
+ CHROMA_422_8x8, CHROMA_422_4x16,
+ CHROMA_422_16x16, CHROMA_422_8x32,
+ CHROMA_422_32x32, CHROMA_422_16x64,
+ CHROMA_422_8x12, CHROMA_422_6x16, CHROMA_422_8x4, CHROMA_422_2x16,
+ CHROMA_422_16x24, CHROMA_422_12x32, CHROMA_422_16x8, CHROMA_422_4x32,
+ CHROMA_422_32x48, CHROMA_422_24x64, CHROMA_422_32x16, CHROMA_422_8x64,
+};
-// NOTE: Not all DCT functions support dest stride
-enum Dcts
+enum ChromaCU422
{
- DST_4x4,
- DCT_4x4,
- DCT_8x8,
- DCT_16x16,
- DCT_32x32,
- NUM_DCTS
+ BLOCK_422_2x4,
+ BLOCK_422_4x8,
+ BLOCK_422_8x16,
+ BLOCK_422_16x32,
+ BLOCK_422_32x64
};
-enum IDcts
+typedef int (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef int (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
+
+typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
+typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
+
+typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+
+typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
+typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
+
+typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
+typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
+
+typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc,
+ intptr_t src_stride, intptr_t dst_stride, int width, int height);
+typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
+typedef void (*ssim_4x4x2_core_t)(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
+typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width);
+typedef uint64_t (*var_t)(const pixel* pix, intptr_t stride);
+typedef void (*plane_copy_deinterleave_t)(pixel* dstu, intptr_t dstuStride, pixel* dstv, intptr_t dstvStride, const pixel* src, intptr_t srcStride, int w, int h);
+
+typedef void (*filter_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+typedef void (*filter_ps_t) (const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
+
+typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
+typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+
+typedef void (*pixel_sub_ps_t)(int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
+typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
+typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
+typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+
+typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
+
+/* Function pointers to optimized encoder primitives. Each pointer can reference
+ * either an assembly routine, a SIMD intrinsic primitive, or a C function */
+struct EncoderPrimitives
{
- IDST_4x4,
- IDCT_4x4,
- IDCT_8x8,
- IDCT_16x16,
- IDCT_32x32,
- NUM_IDCTS
+ /* These primitives can be used for any sized prediction unit (from 4x4 to
+ * 64x64, square, rectangular - 50/50 or asymmetrical - 25/75) and are
+ * generally restricted to motion estimation and motion compensation (inter
+ * prediction. Note that the 4x4 PU can only be used for intra, which is
+ * really a 4x4 TU, so at most copy_pp and satd will use 4x4. This array is
+ * indexed by LumaPU values, which can be retrieved by partitionFromSizes() */
+ struct PU
+ {
+ pixelcmp_t sad; // Sum of Absolute Differences
+ pixelcmp_x3_t sad_x3; // Sum of Absolute Differences, 3 mv offsets at once
+ pixelcmp_x4_t sad_x4; // Sum of Absolute Differences, 4 mv offsets at once
+ pixelcmp_t satd; // Sum of Absolute Transformed Differences (4x4 Hadamard)
+
+ filter_pp_t luma_hpp; // 8-tap luma motion compensation interpolation filters
+ filter_hps_t luma_hps;
+ filter_pp_t luma_vpp;
+ filter_ps_t luma_vps;
+ filter_sp_t luma_vsp;
+ filter_ss_t luma_vss;
+ filter_hv_pp_t luma_hvpp; // combines hps + vsp
+
+ pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264)
+ addAvg_t addAvg; // bidir motion compensation, uses 16bit values
+
+ copy_pp_t copy_pp;
+ }
+ pu[NUM_PU_SIZES];
+
+ /* These primitives can be used for square TU blocks (4x4 to 32x32) or
+ * possibly square CU blocks (8x8 to 64x64). Some primitives are used for
+ * both CU and TU so we merge them into one array that is indexed uniformly.
+ * This keeps the index logic uniform and simple and improves cache
+ * coherency. CU only primitives will leave 4x4 pointers NULL while TU only
+ * primitives will leave 64x64 pointers NULL. Indexed by LumaCU */
+ struct CU
+ {
+ dct_t dct;
+ idct_t idct;
+ calcresidual_t calcresidual;
+ pixel_sub_ps_t sub_ps;
+ pixel_add_ps_t add_ps;
+ blockfill_s_t blockfill_s; // block fill, for DC transforms
+ copy_cnt_t copy_cnt; // copy coeff while counting non-zero
+
+ cpy2Dto1D_shl_t cpy2Dto1D_shl;
+ cpy2Dto1D_shr_t cpy2Dto1D_shr;
+ cpy1Dto2D_shl_t cpy1Dto2D_shl;
+ cpy1Dto2D_shr_t cpy1Dto2D_shr;
+
+ copy_sp_t copy_sp;
+ copy_ps_t copy_ps;
+ copy_ss_t copy_ss;
+ copy_pp_t copy_pp; // alias to pu[].copy_pp
+
+ var_t var; // block internal variance
+ pixelcmp_t sse_pp; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
+ pixelcmp_ss_t sse_ss; // Sum of Square Error (short, short) fenc alignment not assumed
+ pixelcmp_t psy_cost_pp; // difference in AC energy between two pixel blocks
+ pixelcmp_ss_t psy_cost_ss; // difference in AC energy between two signed residual blocks
+ pixel_ssd_s_t ssd_s; // Sum of Square Error (residual coeff to self)
+ pixelcmp_t sa8d; // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
+
+ transpose_t transpose; // transpose pixel block; for use with intra all-angs
+ intra_allangs_t intra_pred_allangs;
+ intra_pred_t intra_pred[NUM_INTRA_MODE];
+ }
+ cu[NUM_CU_SIZES];
+
+ /* These remaining primitives work on either fixed block sizes or take
+ * block dimensions as arguments and thus do not belong in either the PU or
+ * the CU arrays */
+ dct_t dst4x4;
+ idct_t idst4x4;
+
+ quant_t quant;
+ nquant_t nquant;
+ dequant_scaling_t dequant_scaling;
+ dequant_normal_t dequant_normal;
+ count_nonzero_t count_nonzero;
+ denoiseDct_t denoiseDct;
+
+ scale_t scale1D_128to64;
+ scale_t scale2D_64to32;
+
+ ssim_4x4x2_core_t ssim_4x4x2_core;
+ ssim_end4_t ssim_end_4;
+
+ sign_t sign;
+ saoCuOrgE0_t saoCuOrgE0;
+ saoCuOrgE1_t saoCuOrgE1;
+ saoCuOrgE2_t saoCuOrgE2;
+ saoCuOrgE3_t saoCuOrgE3;
+ saoCuOrgB0_t saoCuOrgB0;
+
+ downscale_t frameInitLowres;
+ cutree_propagate_cost propagateCost;
+
+ extendCURowBorder_t extendRowBorder;
+ planecopy_cp_t planecopy_cp;
+ planecopy_sp_t planecopy_sp;
+
+ weightp_sp_t weight_sp;
+ weightp_pp_t weight_pp;
+
+ filter_p2s_t luma_p2s;
+
+ /* There is one set of chroma primitives per color space. An encoder will
+ * have just a single color space and thus it will only ever use one entry
+ * in this array. However we always fill all entries in the array in case
+ * multiple encoders with different color spaces share the primitive table
+ * in a single process. Note that 4:2:0 PU and CU are 1/2 width and 1/2
+ * height of their luma counterparts. 4:2:2 PU and CU are 1/2 width and full
+ * height, while 4:4:4 directly uses the luma block sizes and shares luma
+ * primitives for all cases except for the interpolation filters. 4:4:4
+ * interpolation filters have luma partition sizes but are only 4-tap. */
+ struct Chroma
+ {
+ /* Chroma prediction unit primitives. Indexed by LumaPU */
+ struct PUChroma
+ {
+ pixelcmp_t satd; // if chroma PU is not multiple of 4x4, will be NULL
+ filter_pp_t filter_vpp;
+ filter_ps_t filter_vps;
+ filter_sp_t filter_vsp;
+ filter_ss_t filter_vss;
+ filter_pp_t filter_hpp;
+ filter_hps_t filter_hps;
+ addAvg_t addAvg;
+ copy_pp_t copy_pp;
+ }
+ pu[NUM_PU_SIZES];
+
+ /* Chroma transform and coding unit primitives. Indexed by LumaCU */
+ struct CUChroma
+ {
+ pixelcmp_t sa8d; // if chroma CU is not multiple of 8x8, will use satd
+ pixelcmp_t sse_pp;
+ pixel_sub_ps_t sub_ps;
+ pixel_add_ps_t add_ps;
+
+ copy_ps_t copy_ps;
+ copy_sp_t copy_sp;
+ copy_ss_t copy_ss;
+ copy_pp_t copy_pp;
+ }
+ cu[NUM_CU_SIZES];
+
+ filter_p2s_t p2s; // takes width/height as arguments
+ }
+ chroma[X265_CSP_COUNT];
};
-// Returns a LumaPartitions enum for the given size, always expected to return a valid enum
+/* This copy of the table is what gets used by the encoder */
+extern EncoderPrimitives primitives;
+
+/* Returns a LumaPU enum for the given size, always expected to return a valid enum */
inline int partitionFromSizes(int width, int height)
{
X265_CHECK(((width | height) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n");
@@ -132,188 +355,10 @@ inline int partitionFromLog2Size(int log2Size)
return log2Size - 2;
}
-typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
-typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
-typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
-typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride);
-typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
-typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
-typedef void (*blockcpy_sp_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
-typedef void (*blockcpy_sc_t)(int bx, int by, int16_t *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned
-typedef void (*pixelsub_ps_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
-typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
-
-typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
-typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
-
-typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
-typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
-typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
-typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
-typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
-typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
-typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
-
-typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
-typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
-typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
-
-typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
-typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
-
-typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
-typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
- intptr_t src_stride, intptr_t dst_stride, int width, int height);
-typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
-typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]);
-typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width);
-typedef uint64_t (*var_t)(pixel *pix, intptr_t stride);
-typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h);
-
-typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-typedef void (*filter_ps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_sp_t) (int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_ss_t) (int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
-typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-
-typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
-typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
-typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-typedef void (*copy_ss_t)(int16_t *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
-
-typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
-
-typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft);
-typedef void (*planecopy_cp_t) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
-typedef void (*planecopy_sp_t) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
-
-typedef void (*cutree_propagate_cost) (int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len);
-
-/* Define a structure containing function pointers to optimized encoder
- * primitives. Each pointer can reference either an assembly routine,
- * a vectorized primitive, or a C function. */
-struct EncoderPrimitives
-{
- pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size
- pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size
- pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size
- pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
- pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed
- pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed
- pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed
- pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD)
- pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
- pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks
- pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks
- pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
-
- blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value
- cvt16to32_shl_t cvt16to32_shl;
- cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
- cvt32to16_shr_t cvt32to16_shr;
- cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
- copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
- copy_shr_t copy_shr;
- copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1];
-
- copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
- copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
- copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS];
- copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS];
- pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS];
- pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS];
- copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS];
- copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS];
- copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS];
- copy_ss_t square_copy_ss[NUM_SQUARE_BLOCKS];
-
- filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
- filter_hps_t luma_hps[NUM_LUMA_PARTITIONS];
- filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];
- filter_ps_t luma_vps[NUM_LUMA_PARTITIONS];
- filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS];
- filter_ss_t luma_vss[NUM_LUMA_PARTITIONS];
- filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
- filter_p2s_t luma_p2s;
- filter_p2s_t chroma_p2s[X265_CSP_COUNT];
-
- weightp_sp_t weight_sp;
- weightp_pp_t weight_pp;
- pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS];
- addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS];
-
- intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
- intra_allangs_t intra_pred_allangs[NUM_TR_SIZE];
- scale_t scale1D_128to64;
- scale_t scale2D_64to32;
-
- dct_t dct[NUM_DCTS];
- idct_t idct[NUM_IDCTS];
- quant_t quant;
- nquant_t nquant;
- dequant_scaling_t dequant_scaling;
- dequant_normal_t dequant_normal;
- count_nonzero_t count_nonzero;
- denoiseDct_t denoiseDct;
-
- calcresidual_t calcresidual[NUM_SQUARE_BLOCKS];
- transpose_t transpose[NUM_SQUARE_BLOCKS];
-
- var_t var[NUM_SQUARE_BLOCKS];
- ssim_4x4x2_core_t ssim_4x4x2_core;
- ssim_end4_t ssim_end_4;
-
- downscale_t frame_init_lowres_core;
- plane_copy_deinterleave_t plane_copy_deinterleave_c;
- extendCURowBorder_t extendRowBorder;
- // sao primitives
- saoCuOrgE0_t saoCuOrgE0;
- planecopy_cp_t planecopy_cp;
- planecopy_sp_t planecopy_sp;
-
- cutree_propagate_cost propagateCost;
-
- struct
- {
- filter_pp_t filter_vpp[NUM_LUMA_PARTITIONS];
- filter_ps_t filter_vps[NUM_LUMA_PARTITIONS];
- filter_sp_t filter_vsp[NUM_LUMA_PARTITIONS];
- filter_ss_t filter_vss[NUM_LUMA_PARTITIONS];
- filter_pp_t filter_hpp[NUM_LUMA_PARTITIONS];
- filter_hps_t filter_hps[NUM_LUMA_PARTITIONS];
- addAvg_t addAvg[NUM_LUMA_PARTITIONS];
- copy_pp_t copy_pp[NUM_LUMA_PARTITIONS];
- copy_sp_t copy_sp[NUM_LUMA_PARTITIONS];
- copy_ps_t copy_ps[NUM_LUMA_PARTITIONS];
- copy_ss_t copy_ss[NUM_LUMA_PARTITIONS];
- pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS];
- pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS];
- } chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here
-};
-
-void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
-
-/* This copy of the table is what gets used by the encoder.
- * It must be initialized before the encoder begins. */
-extern EncoderPrimitives primitives;
-
-void Setup_C_Primitives(EncoderPrimitives &p);
-void Setup_Instrinsic_Primitives(EncoderPrimitives &p, int cpuMask);
-void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask);
-void Setup_Alias_Primitives(EncoderPrimitives &p);
+void setupCPrimitives(EncoderPrimitives &p);
+void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
+void setupAliasPrimitives(EncoderPrimitives &p);
}
#endif // ifndef X265_PRIMITIVES_H
diff --git a/source/common/quant.cpp b/source/common/quant.cpp
index 387962c..e847ee5 100644
--- a/source/common/quant.cpp
+++ b/source/common/quant.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -50,7 +50,7 @@ inline int fastMin(int x, int y)
return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
}
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
@@ -81,7 +81,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOne
// NOTE: mapping to x86 hardware instruction BSR
unsigned long size;
- CLZ32(size, absLevel);
+ CLZ(size, absLevel);
int egs = size * 2 + 1;
rate += egs << 15;
@@ -106,7 +106,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOne
}
/* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(absLevel, "absLevel should not be zero\n");
@@ -135,7 +135,7 @@ inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *g
if (symbol)
{
unsigned long idx;
- CLZ32(idx, symbol + 1);
+ CLZ(idx, symbol + 1);
length = idx;
}
@@ -166,9 +166,10 @@ bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList,
m_useRDOQ = useRDOQ;
m_psyRdoqScale = (int64_t)(psyScale * 256.0);
m_scalingList = &scalingList;
- m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
+ m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
+ m_tqBypass = false;
return m_resiDctCoeff && m_fencShortBuf;
}
@@ -190,24 +191,27 @@ Quant::~Quant()
X265_FREE(m_fencShortBuf);
}
-void Quant::setQPforQuant(const CUData& ctu)
+void Quant::setQPforQuant(const CUData& cu)
{
- m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
- int qpy = ctu.m_qp[0];
+ m_tqBypass = !!cu.m_tqBypass[0];
+ if (m_tqBypass)
+ return;
+ m_nr = m_frameNr ? &m_frameNr[cu.m_encData->m_frameEncoderID] : NULL;
+ int qpy = cu.m_qp[0];
m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
- setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat);
- setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat);
+ setChromaQP(qpy + cu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, cu.m_chromaFormat);
+ setChromaQP(qpy + cu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, cu.m_chromaFormat);
}
void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
{
- int qp = Clip3(-QP_BD_OFFSET, 57, qpin);
+ int qp = x265_clip3(-QP_BD_OFFSET, 57, qpin);
if (qp >= 30)
{
if (chFmt == X265_CSP_I420)
qp = g_chromaScale[qp];
else
- qp = X265_MIN(qp, 51);
+ qp = X265_MIN(qp, QP_MAX_SPEC);
}
m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET);
}
@@ -216,7 +220,7 @@ void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
{
const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
- const uint16_t *scan = codeParams.scan;
+ const uint16_t* scan = codeParams.scan;
bool lastCG = true;
for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
@@ -322,58 +326,56 @@ uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSi
return numSig;
}
-uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride,
+uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
{
- if (cu.m_tqBypass[absPartIdx])
+ const uint32_t sizeIdx = log2TrSize - 2;
+ if (m_tqBypass)
{
X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
- return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
+ return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
}
bool isLuma = ttype == TEXT_LUMA;
bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip;
- bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA;
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
- int trSize = 1 << log2TrSize;
X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
if (useTransformSkip)
{
#if X265_DEPTH <= 10
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ X265_CHECK(transformShift >= 0, "invalid transformShift\n");
+ primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
#else
if (transformShift >= 0)
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
else
- {
- int shift = -transformShift;
- int offset = (1 << (shift - 1));
- primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
- }
+ primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
- int useDST = !sizeIdx && isLuma && isIntra;
- int index = DCT_4x4 + sizeIdx - useDST;
+ bool isIntra = cu.isIntra(absPartIdx);
- primitives.dct[index](residual, m_resiDctCoeff, stride);
+ if (!sizeIdx && isLuma && isIntra)
+ primitives.dst4x4(residual, m_resiDctCoeff, resiStride);
+ else
+ primitives.cu[sizeIdx].dct(residual, m_resiDctCoeff, resiStride);
/* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
* there is no risk of performing this DCT unnecessarily */
if (usePsy)
{
+ int trSize = 1 << log2TrSize;
/* perform DCT on source pixels for psy-rdoq */
- primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
- primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
+ primitives.cu[sizeIdx].copy_ps(m_fencShortBuf, trSize, fenc, fencStride);
+ primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
}
- if (m_nr && !isIntra)
+ if (m_nr)
{
/* denoise is not applied to intra residual, so DST can be ignored */
- int cat = sizeIdx + 4 * !isLuma;
+ int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
int numCoeff = 1 << (log2TrSize * 2);
primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
m_nr->count[cat]++;
@@ -389,7 +391,7 @@ uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16
int scalingListType = ttype + (isLuma ? 3 : 0);
int rem = m_qpParam[ttype].rem;
int per = m_qpParam[ttype].per;
- int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
int qbits = QUANT_SHIFT + per + transformShift;
int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
@@ -408,12 +410,13 @@ uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16
}
}
-void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+void Quant::invtransformNxN(int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
{
- if (transQuantBypass)
+ const uint32_t sizeIdx = log2TrSize - 2;
+ if (m_tqBypass)
{
- primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
+ primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
return;
}
@@ -427,7 +430,7 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s
if (m_scalingList->m_bEnabled)
{
int scalingListType = (bIntra ? 0 : 3) + ttype;
- int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
}
else
@@ -438,20 +441,18 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s
if (useTransformSkip)
{
- int trSize = 1 << log2TrSize;
-
#if X265_DEPTH <= 10
- primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ X265_CHECK(transformShift > 0, "invalid transformShift\n");
+ primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
#else
if (transformShift > 0)
- primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
else
- primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
+ primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
@@ -459,23 +460,26 @@ void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t s
// DC only
if (numSig == 1 && coeff[0] != 0 && !useDST)
{
- const int shift_1st = 7;
+ const int shift_1st = 7 - 6;
const int add_1st = 1 << (shift_1st - 1);
- const int shift_2nd = 12 - (X265_DEPTH - 8);
+ const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
const int add_2nd = 1 << (shift_2nd - 1);
- int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
- primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
+ int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
+ primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
return;
}
- primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
+ if (useDST)
+ primitives.idst4x4(m_resiDctCoeff, residual, resiStride);
+ else
+ primitives.cu[sizeIdx].idct(m_resiDctCoeff, residual, resiStride);
}
}
/* Rate distortion optimized quantization for entropy coding engines using
* probability models like CABAC */
-uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
{
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
@@ -486,7 +490,7 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
int per = m_qpParam[ttype].per;
int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
int add = (1 << (qbits - 1));
- int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
int numCoeff = 1 << (log2TrSize * 2);
@@ -503,7 +507,7 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
/* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
* scale applied that must be removed during unquant. Note that in real dequant there is clipping
* at several stages. We skip the clipping for simplicity when measuring RD cost */
- int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
int scaleBits = SCALE_BITS - 2 * transformShift;
@@ -616,8 +620,8 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
// coefficient level estimation
const uint32_t oneCtx = 4 * ctxSet + c1;
const uint32_t absCtx = ctxSet + c2;
- const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
- const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
+ const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
+ const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
uint16_t level = 0;
uint32_t sigCoefBits = 0;
@@ -842,12 +846,23 @@ uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, Tex
* cost of signaling it as not-significant */
uint32_t blkPos = codeParams.scan[scanPos];
if (dstCoeff[blkPos])
- {
- /* Swap the cost of signaling its significant coeff bit with the cost of
- * signaling its lastNZ pos */
- uint32_t posY = blkPos >> log2TrSize;
- uint32_t posX = blkPos - (posY << log2TrSize);
- uint32_t bitsLastNZ = codeParams.scanType == SCAN_VER ? getRateLast(posY, posX) : getRateLast(posX, posY);
+ {
+ // Calculates the cost of signaling the last significant coefficient in the block
+ uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
+ if (codeParams.scanType == SCAN_VER)
+ std::swap(pos[0], pos[1]);
+ uint32_t bitsLastNZ = 0;
+
+ for (int i = 0; i < 2; i++)
+ {
+ int temp = g_lastCoeffTable[pos[i]];
+ int prefixOnes = temp & 15;
+ int suffixLen = temp >> 4;
+
+ bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
+ bitsLastNZ += IEP_RATE * suffixLen;
+ }
+
int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
if (costAsLast < bestCost)
@@ -1096,21 +1111,6 @@ uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32
return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset;
}
-/* Calculates the cost of signaling the last significant coefficient in the block */
-inline uint32_t Quant::getRateLast(uint32_t posx, uint32_t posy) const
-{
- uint32_t ctxX = getGroupIdx(posx);
- uint32_t ctxY = getGroupIdx(posy);
- uint32_t cost = m_entropyCoder->m_estBitsSbac.lastXBits[ctxX] + m_entropyCoder->m_estBitsSbac.lastYBits[ctxY];
-
- int32_t maskX = (int32_t)(2 - posx) >> 31;
- int32_t maskY = (int32_t)(2 - posy) >> 31;
-
- cost += maskX & (IEP_RATE * ((ctxX - 2) >> 1));
- cost += maskY & (IEP_RATE * ((ctxY - 2) >> 1));
- return cost;
-}
-
/* Context derivation process of coeff_abs_significant_flag */
uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG)
{
diff --git a/source/common/quant.h b/source/common/quant.h
index ac575f7..49d6763 100644
--- a/source/common/quant.h
+++ b/source/common/quant.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -58,6 +58,20 @@ struct QpParam
}
};
+#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
+#define MAX_NUM_TR_CATEGORIES 16 /* 32, 16, 8, 4 transform categories each for luma and chroma */
+
+// NOTE: MUST be 16-byte aligned for asm code
+struct NoiseReduction
+{
+ /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
+ * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
+ * Intra 0..7 - Inter 8..15 */
+ uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+ uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+ uint32_t count[MAX_NUM_TR_CATEGORIES];
+};
+
class Quant
{
protected:
@@ -69,8 +83,8 @@ protected:
bool m_useRDOQ;
int64_t m_psyRdoqScale;
- int32_t* m_resiDctCoeff;
- int32_t* m_fencDctCoeff;
+ int16_t* m_resiDctCoeff;
+ int16_t* m_fencDctCoeff;
int16_t* m_fencShortBuf;
enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */
@@ -79,6 +93,7 @@ public:
NoiseReduction* m_nr;
NoiseReduction* m_frameNr; // Array of NR structures, one for each frameEncoder
+ bool m_tqBypass;
Quant();
~Quant();
@@ -88,12 +103,12 @@ public:
bool allocNoiseReduction(const x265_param& param);
/* CU setup */
- void setQPforQuant(const CUData& ctu);
+ void setQPforQuant(const CUData& cu);
- uint32_t transformNxN(CUData& cu, pixel *fenc, uint32_t fencstride, int16_t* residual, uint32_t stride, coeff_t* coeff,
+ uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
- void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+ void invtransformNxN(int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
/* static methods shared with entropy.cpp */
@@ -107,30 +122,8 @@ protected:
uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
- uint32_t rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
- inline uint32_t getRateLast(uint32_t posx, uint32_t posy) const;
+ uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
};
-
-static inline uint32_t getGroupIdx(const uint32_t idx)
-{
- // TODO: Why is this not a table lookup?
-
- uint32_t group = (idx >> 3);
-
- if (idx >= 24)
- group = 2;
- uint32_t groupIdx = ((idx >> (group + 1)) - 2) + 4 + (group << 1);
- if (idx <= 3)
- groupIdx = idx;
-
-#ifdef _DEBUG
- static const uint8_t g_groupIdx[32] = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9 };
- assert(groupIdx == g_groupIdx[idx]);
-#endif
-
- return groupIdx;
-}
-
}
#endif // ifndef X265_QUANT_H
diff --git a/source/common/scalinglist.cpp b/source/common/scalinglist.cpp
index d64bcee..f263ca2 100644
--- a/source/common/scalinglist.cpp
+++ b/source/common/scalinglist.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
diff --git a/source/common/scalinglist.h b/source/common/scalinglist.h
index e133498..9022e38 100644
--- a/source/common/scalinglist.h
+++ b/source/common/scalinglist.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
diff --git a/source/common/shortyuv.cpp b/source/common/shortyuv.cpp
index 2a7e153..9428767 100644
--- a/source/common/shortyuv.cpp
+++ b/source/common/shortyuv.cpp
@@ -74,9 +74,9 @@ void ShortYuv::clear()
void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size)
{
const int sizeIdx = log2Size - 2;
- primitives.luma_sub_ps[sizeIdx](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
- primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
- primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+ primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
+ primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+ primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
}
void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
@@ -84,7 +84,7 @@ void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_
const int16_t* src = getLumaAddr(absPartIdx);
int16_t* dst = dstYuv.getLumaAddr(absPartIdx);
- primitives.square_copy_ss[log2Size - 2](dst, dstYuv.m_size, const_cast<int16_t*>(src), m_size);
+ primitives.cu[log2Size - 2].copy_ss(dst, dstYuv.m_size, src, m_size);
}
void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
@@ -92,7 +92,7 @@ void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log
const int16_t* src = getLumaAddr(absPartIdx);
pixel* dst = dstYuv.getLumaAddr(absPartIdx);
- primitives.square_copy_sp[log2Size - 2](dst, dstYuv.m_size, const_cast<int16_t*>(src), m_size);
+ primitives.cu[log2Size - 2].copy_sp(dst, dstYuv.m_size, src, m_size);
}
void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -103,8 +103,8 @@ void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint3
int16_t* dstU = dstYuv.getCbAddr(absPartIdx);
int16_t* dstV = dstYuv.getCrAddr(absPartIdx);
- primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, const_cast<int16_t*>(srcU), m_csize);
- primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, const_cast<int16_t*>(srcV), m_csize);
+ primitives.chroma[m_csp].cu[part].copy_ss(dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].cu[part].copy_ss(dstV, dstYuv.m_csize, srcV, m_csize);
}
void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -115,6 +115,6 @@ void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t l
pixel* dstU = dstYuv.getCbAddr(absPartIdx);
pixel* dstV = dstYuv.getCrAddr(absPartIdx);
- primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, const_cast<int16_t*>(srcU), m_csize);
- primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, const_cast<int16_t*>(srcV), m_csize);
+ primitives.chroma[m_csp].cu[part].copy_sp(dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].cu[part].copy_sp(dstV, dstYuv.m_csize, srcV, m_csize);
}
diff --git a/source/common/slice.cpp b/source/common/slice.cpp
index 2e850cd..52a990f 100644
--- a/source/common/slice.cpp
+++ b/source/common/slice.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
diff --git a/source/common/slice.h b/source/common/slice.h
index bd0ba63..debcda1 100644
--- a/source/common/slice.h
+++ b/source/common/slice.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -230,6 +230,7 @@ struct SPS
uint32_t maxDecPicBuffering; // these are dups of VPS values
int numReorderPics;
+ int maxLatencyIncrease;
bool bUseStrongIntraSmoothing; // use param
bool bTemporalMVPEnabled;
@@ -242,8 +243,7 @@ struct PPS
{
uint32_t maxCuDQPDepth;
- int chromaCbQpOffset; // use param
- int chromaCrQpOffset; // use param
+ int chromaQpOffset[2]; // use param
bool bUseWeightPred; // use param
bool bUseWeightedBiPred; // use param
@@ -334,6 +334,8 @@ public:
void setRefPicList(PicList& picList);
+ const Frame* getRefPic(int list, int refIdx) const { return refIdx >= 0 ? m_refPicList[list][refIdx] : NULL; }
+
bool getRapPicFlag() const
{
return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
diff --git a/source/common/threading.cpp b/source/common/threading.cpp
index cb50eb2..1d888ae 100644
--- a/source/common/threading.cpp
+++ b/source/common/threading.cpp
@@ -1,6 +1,4 @@
/*****************************************************************************
- * x265: threading class and intrinsics
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve at borho.org>
@@ -48,21 +46,21 @@ bool Thread::start()
{
DWORD threadId;
- this->thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId);
+ thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId);
return threadId > 0;
}
void Thread::stop()
{
- if (this->thread)
- WaitForSingleObject(this->thread, INFINITE);
+ if (thread)
+ WaitForSingleObject(thread, INFINITE);
}
Thread::~Thread()
{
- if (this->thread)
- CloseHandle(this->thread);
+ if (thread)
+ CloseHandle(thread);
}
#else /* POSIX / pthreads */
@@ -79,10 +77,9 @@ static void *ThreadShim(void *opaque)
bool Thread::start()
{
- if (pthread_create(&this->thread, NULL, ThreadShim, this))
+ if (pthread_create(&thread, NULL, ThreadShim, this))
{
- this->thread = 0;
-
+ thread = 0;
return false;
}
@@ -91,8 +88,8 @@ bool Thread::start()
void Thread::stop()
{
- if (this->thread)
- pthread_join(this->thread, NULL);
+ if (thread)
+ pthread_join(thread, NULL);
}
Thread::~Thread() {}
@@ -101,6 +98,7 @@ Thread::~Thread() {}
Thread::Thread()
{
- this->thread = 0;
+ thread = 0;
}
+
}
diff --git a/source/common/threading.h b/source/common/threading.h
index ef5642a..b3b3dbd 100644
--- a/source/common/threading.h
+++ b/source/common/threading.h
@@ -1,6 +1,4 @@
/*****************************************************************************
- * x265: threading class and intrinsics
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve at borho.org>
@@ -49,66 +47,26 @@
#include <sys/time.h>
#include <unistd.h>
-#define CLZ32(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
-#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
-#define ATOMIC_OR(ptr, mask) __sync_or_and_fetch(ptr, mask)
-#define ATOMIC_CAS(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
-#define ATOMIC_CAS32(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
+#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
+#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x)
+#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask)
+#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1)
#define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1)
+#define ATOMIC_ADD(ptr, value) __sync_add_and_fetch((volatile int32_t*)ptr, value)
#define GIVE_UP_TIME() usleep(0)
#elif defined(_MSC_VER) /* Windows atomic intrinsics */
#include <intrin.h>
-#if !_WIN64
-inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
- uint32_t high32 = (uint32_t)(x64 >> 32);
- uint32_t low32 = (uint32_t)x64;
-
- if (high32)
- {
- _BitScanReverse(id, high32);
- *id += 32;
- return 1;
- }
- else if (low32)
- return _BitScanReverse(id, low32);
- else
- return *id = 0;
-}
-
-inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
- uint32_t high32 = (uint32_t)(x64 >> 32);
- uint32_t low32 = (uint32_t)x64;
-
- if (high32)
- {
- _BitScanForward(id, high32);
- *id += 32;
- return 1;
- }
- else if (low32)
- return _BitScanForward(id, low32);
- else
- return *id = 0;
-}
-
-#endif // if !_WIN64
-
-#ifndef ATOMIC_OR
-#define ATOMIC_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, mask)
-#endif
-
-#define CLZ32(id, x) _BitScanReverse(&id, x)
-#define CTZ64(id, x) _BitScanForward64(&id, x)
-#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
-#define ATOMIC_CAS32(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
+#define CLZ(id, x) _BitScanReverse(&id, x)
+#define CTZ(id, x) _BitScanForward(&id, x)
#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr)
#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr)
+#define ATOMIC_ADD(ptr, value) InterlockedExchangeAdd((volatile LONG*)ptr, value)
+#define ATOMIC_OR(ptr, mask) _InterlockedOr((volatile LONG*)ptr, (LONG)mask)
+#define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
#define GIVE_UP_TIME() Sleep(0)
#endif // ifdef __GNUC__
diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp
index 8a2ab9d..ccb9ab6 100644
--- a/source/common/threadpool.cpp
+++ b/source/common/threadpool.cpp
@@ -1,6 +1,4 @@
/*****************************************************************************
- * x265: singleton thread pool and interface classes
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve at borho.org>
@@ -87,7 +85,7 @@ private:
int m_numThreads;
int m_numSleepMapWords;
PoolThread *m_threads;
- volatile uint64_t *m_sleepMap;
+ volatile uint32_t *m_sleepMap;
/* Lock for write access to the provider lists. Threads are
* always allowed to read m_firstProvider and follow the
@@ -139,6 +137,8 @@ public:
void PoolThread::threadMain()
{
+ THREAD_NAME("Worker", m_id);
+
#if _WIN32
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
#else
@@ -174,8 +174,8 @@ void PoolThread::threadMain()
void ThreadPoolImpl::markThreadAsleep(int id)
{
- int word = id >> 6;
- uint64_t bit = 1LL << (id & 63);
+ int word = id >> 5;
+ uint32_t bit = 1 << (id & 31);
ATOMIC_OR(&m_sleepMap[word], bit);
}
@@ -186,16 +186,16 @@ void ThreadPoolImpl::pokeIdleThread()
* not give up until a thread is awakened or all of them are awake */
for (int i = 0; i < m_numSleepMapWords; i++)
{
- uint64_t oldval = m_sleepMap[i];
+ uint32_t oldval = m_sleepMap[i];
while (oldval)
{
unsigned long id;
- CTZ64(id, oldval);
+ CTZ(id, oldval);
- uint64_t newval = oldval & ~(1LL << id);
- if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval)
+ uint32_t bit = 1 << id;
+ if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit)
{
- m_threads[(i << 6) | id].poke();
+ m_threads[i * 32 + id].poke();
return;
}
@@ -249,8 +249,8 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
, m_firstProvider(NULL)
, m_lastProvider(NULL)
{
- m_numSleepMapWords = (numThreads + 63) >> 6;
- m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
+ m_numSleepMapWords = (numThreads + 31) >> 5;
+ m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords);
char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
m_threads = reinterpret_cast<PoolThread*>(buffer);
@@ -259,9 +259,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
if (m_threads && m_sleepMap)
{
for (int i = 0; i < m_numSleepMapWords; i++)
- {
m_sleepMap[i] = 0;
- }
m_ok = true;
int i;
@@ -277,9 +275,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
}
if (m_ok)
- {
waitForAllIdle();
- }
else
{
// stop threads that did start up
@@ -300,12 +296,10 @@ void ThreadPoolImpl::waitForAllIdle()
int id = 0;
do
{
- int word = id >> 6;
- uint64_t bit = 1LL << (id & 63);
+ int word = id >> 5;
+ uint32_t bit = 1 << (id & 31);
if (m_sleepMap[word] & bit)
- {
id++;
- }
else
{
GIVE_UP_TIME();
@@ -338,9 +332,7 @@ ThreadPoolImpl::~ThreadPoolImpl()
{
// cleanup thread handles
for (int i = 0; i < m_numThreads; i++)
- {
m_threads[i].~PoolThread();
- }
X265_FREE(reinterpret_cast<char*>(m_threads));
}
diff --git a/source/common/threadpool.h b/source/common/threadpool.h
index 7616670..2c192ca 100644
--- a/source/common/threadpool.h
+++ b/source/common/threadpool.h
@@ -1,6 +1,4 @@
/*****************************************************************************
- * x265: singleton thread pool and interface classes
- *****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve at borho.org>
diff --git a/source/common/vec/dct-sse3.cpp b/source/common/vec/dct-sse3.cpp
index c435b52..1a91178 100644
--- a/source/common/vec/dct-sse3.cpp
+++ b/source/common/vec/dct-sse3.cpp
@@ -36,7 +36,17 @@
using namespace x265;
namespace {
-#if !HIGH_BIT_DEPTH
+#define SHIFT1 7
+#define ADD1 64
+
+#if HIGH_BIT_DEPTH
+#define SHIFT2 10
+#define ADD2 512
+#else
+#define SHIFT2 12
+#define ADD2 2048
+#endif
+
ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{
{ 89, 75, 89, 75, 89, 75, 89, 75 },
@@ -52,30 +62,22 @@ ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{ 83, 36, 83, 36, 83, 36, 83, 36 },
{ 36, -83, 36, -83, 36, -83, 36, -83 }
};
-void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
{
__m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
__m128i T00, T01, T02, T03, T04, T05, T06, T07;
- m128iAdd = _mm_set1_epi32(64);
+ m128iAdd = _mm_set1_epi32(ADD1);
- T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
- m128iS1 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
- m128iS3 = _mm_packs_epi32(T00, T01);
+ m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
+ m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
- T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
- m128iS5 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
- m128iS7 = _mm_packs_epi32(T00, T01);
+ m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
+ m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
@@ -107,12 +109,8 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
/* ------- */
- T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
- m128iS0 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
- m128iS4 = _mm_packs_epi32(T00, T01);
+ m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
+ m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
@@ -123,12 +121,8 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
/* ------- */
- T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
- m128iS2 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
- m128iS6 = _mm_packs_epi32(T00, T01);
+ m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
+ m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
@@ -152,14 +146,14 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
E2l = _mm_add_epi32(E2l, m128iAdd);
E2h = _mm_sub_epi32(EE1h, E01h);
E2h = _mm_add_epi32(E2h, m128iAdd);
- m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7));
- m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7));
- m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7));
- m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7));
- m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7));
- m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7));
- m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7));
- m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7));
+ m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), SHIFT1), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), SHIFT1));
+ m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), SHIFT1), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), SHIFT1));
+ m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), SHIFT1), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), SHIFT1));
+ m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), SHIFT1), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), SHIFT1));
+ m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), SHIFT1), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), SHIFT1));
+ m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), SHIFT1), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), SHIFT1));
+ m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), SHIFT1), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), SHIFT1));
+ m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), SHIFT1), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), SHIFT1));
/* Invers matrix */
E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
@@ -187,7 +181,7 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
- m128iAdd = _mm_set1_epi32(2048);
+ m128iAdd = _mm_set1_epi32(ADD2);
m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
@@ -248,14 +242,14 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
E2h = _mm_sub_epi32(EE1h, E01h);
E2h = _mm_add_epi32(E2h, m128iAdd);
- m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12));
- m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12));
- m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12));
- m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12));
- m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12));
- m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12));
- m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12));
- m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12));
+ m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), SHIFT2), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), SHIFT2));
+ m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), SHIFT2), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), SHIFT2));
+ m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), SHIFT2), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), SHIFT2));
+ m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), SHIFT2), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), SHIFT2));
+ m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), SHIFT2), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), SHIFT2));
+ m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), SHIFT2), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), SHIFT2));
+ m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), SHIFT2), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), SHIFT2));
+ m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), SHIFT2), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), SHIFT2));
// [07 06 05 04 03 02 01 00]
// [17 16 15 14 13 12 11 10]
@@ -305,8 +299,256 @@ void idct8(int32_t *src, int16_t *dst, intptr_t stride)
_mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
}
-void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
+#define READ_UNPACKHILO(offset)\
+ const __m128i T_00_00A = _mm_unpacklo_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);\
+ const __m128i T_00_00B = _mm_unpackhi_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);\
+ const __m128i T_00_01A = _mm_unpacklo_epi16(*(__m128i*)&src[5 * 16 + offset], *(__m128i*)&src[7 * 16 + offset]);\
+ const __m128i T_00_01B = _mm_unpackhi_epi16(*(__m128i*)&src[5 * 16 + offset], *(__m128i*)&src[7 * 16 + offset]);\
+ const __m128i T_00_02A = _mm_unpacklo_epi16(*(__m128i*)&src[9 * 16 + offset], *(__m128i*)&src[11 * 16 + offset]);\
+ const __m128i T_00_02B = _mm_unpackhi_epi16(*(__m128i*)&src[9 * 16 + offset], *(__m128i*)&src[11 * 16 + offset]);\
+ const __m128i T_00_03A = _mm_unpacklo_epi16(*(__m128i*)&src[13 * 16 + offset], *(__m128i*)&src[15 * 16 + offset]);\
+ const __m128i T_00_03B = _mm_unpackhi_epi16(*(__m128i*)&src[13 * 16 + offset], *(__m128i*)&src[15 * 16 + offset]);\
+ const __m128i T_00_04A = _mm_unpacklo_epi16(*(__m128i*)&src[2 * 16 + offset], *(__m128i*)&src[6 * 16 + offset]);\
+ const __m128i T_00_04B = _mm_unpackhi_epi16(*(__m128i*)&src[2 * 16 + offset], *(__m128i*)&src[6 * 16 + offset]);\
+ const __m128i T_00_05A = _mm_unpacklo_epi16(*(__m128i*)&src[10 * 16 + offset], *(__m128i*)&src[14 * 16 + offset]);\
+ const __m128i T_00_05B = _mm_unpackhi_epi16(*(__m128i*)&src[10 * 16 + offset], *(__m128i*)&src[14 * 16 + offset]);\
+ const __m128i T_00_06A = _mm_unpacklo_epi16(*(__m128i*)&src[4 * 16 + offset], *(__m128i*)&src[12 * 16 + offset]);\
+ const __m128i T_00_06B = _mm_unpackhi_epi16(*(__m128i*)&src[4 * 16 + offset], *(__m128i*)&src[12 * 16 + offset]);\
+ const __m128i T_00_07A = _mm_unpacklo_epi16(*(__m128i*)&src[0 * 16 + offset], *(__m128i*)&src[8 * 16 + offset]);\
+ const __m128i T_00_07B = _mm_unpackhi_epi16(*(__m128i*)&src[0 * 16 + offset], *(__m128i*)&src[8 * 16 + offset]);
+
+#define UNPACKHILO(part) \
+ const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);\
+ const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);\
+ const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]);\
+ const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]);\
+ const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]);\
+ const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]);\
+ const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]);\
+ const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]);\
+ const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]);\
+ const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]);\
+ const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]);\
+ const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]);\
+ const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]);\
+ const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]);\
+ const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]);\
+ const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]);
+
+#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
+ T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
+ T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
+ row = _mm_add_epi32(T00, T01);
+
+#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
+ tr0_0 = _mm_unpacklo_epi16(I0, I1); \
+ tr0_1 = _mm_unpacklo_epi16(I2, I3); \
+ tr0_2 = _mm_unpackhi_epi16(I0, I1); \
+ tr0_3 = _mm_unpackhi_epi16(I2, I3); \
+ tr0_4 = _mm_unpacklo_epi16(I4, I5); \
+ tr0_5 = _mm_unpacklo_epi16(I6, I7); \
+ tr0_6 = _mm_unpackhi_epi16(I4, I5); \
+ tr0_7 = _mm_unpackhi_epi16(I6, I7); \
+ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+
+#define PROCESS(part, rnd, shift) \
+ __m128i c32_rnd = _mm_set1_epi32(rnd);\
+ int nShift = shift;\
+\
+ __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;\
+ __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;\
+ {\
+ __m128i T00, T01;\
+\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)\
+ COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)\
+\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)\
+ COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)\
+ }\
+\
+ __m128i EO0A, EO1A, EO2A, EO3A;\
+ __m128i EO0B, EO1B, EO2B, EO3B;\
+ EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50));\
+ EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));\
+ EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89));\
+ EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));\
+ EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18));\
+ EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));\
+ EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75));\
+ EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));\
+\
+ __m128i EEO0A, EEO1A;\
+ __m128i EEO0B, EEO1B;\
+ EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);\
+ EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);\
+ EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);\
+ EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);\
+\
+ __m128i EEE0A, EEE1A;\
+ __m128i EEE0B, EEE1B;\
+ EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);\
+ EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);\
+ EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);\
+ EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);\
+\
+ const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A);\
+ const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);\
+ const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A);\
+ const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);\
+ const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A);\
+ const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);\
+ const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A);\
+ const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);\
+\
+ const __m128i E0A = _mm_add_epi32(EE0A, EO0A);\
+ const __m128i E0B = _mm_add_epi32(EE0B, EO0B);\
+ const __m128i E1A = _mm_add_epi32(EE1A, EO1A);\
+ const __m128i E1B = _mm_add_epi32(EE1B, EO1B);\
+ const __m128i E2A = _mm_add_epi32(EE2A, EO2A);\
+ const __m128i E2B = _mm_add_epi32(EE2B, EO2B);\
+ const __m128i E3A = _mm_add_epi32(EE3A, EO3A);\
+ const __m128i E3B = _mm_add_epi32(EE3B, EO3B);\
+ const __m128i E7A = _mm_sub_epi32(EE0A, EO0A);\
+ const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);\
+ const __m128i E6A = _mm_sub_epi32(EE1A, EO1A);\
+ const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);\
+ const __m128i E5A = _mm_sub_epi32(EE2A, EO2A);\
+ const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);\
+ const __m128i E4A = _mm_sub_epi32(EE3A, EO3A);\
+ const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);\
+\
+ const __m128i T10A = _mm_add_epi32(E0A, c32_rnd);\
+ const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);\
+ const __m128i T11A = _mm_add_epi32(E1A, c32_rnd);\
+ const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);\
+ const __m128i T12A = _mm_add_epi32(E2A, c32_rnd);\
+ const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);\
+ const __m128i T13A = _mm_add_epi32(E3A, c32_rnd);\
+ const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);\
+ const __m128i T14A = _mm_add_epi32(E4A, c32_rnd);\
+ const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);\
+ const __m128i T15A = _mm_add_epi32(E5A, c32_rnd);\
+ const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);\
+ const __m128i T16A = _mm_add_epi32(E6A, c32_rnd);\
+ const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);\
+ const __m128i T17A = _mm_add_epi32(E7A, c32_rnd);\
+ const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);\
+\
+ const __m128i T20A = _mm_add_epi32(T10A, O0A);\
+ const __m128i T20B = _mm_add_epi32(T10B, O0B);\
+ const __m128i T21A = _mm_add_epi32(T11A, O1A);\
+ const __m128i T21B = _mm_add_epi32(T11B, O1B);\
+ const __m128i T22A = _mm_add_epi32(T12A, O2A);\
+ const __m128i T22B = _mm_add_epi32(T12B, O2B);\
+ const __m128i T23A = _mm_add_epi32(T13A, O3A);\
+ const __m128i T23B = _mm_add_epi32(T13B, O3B);\
+ const __m128i T24A = _mm_add_epi32(T14A, O4A);\
+ const __m128i T24B = _mm_add_epi32(T14B, O4B);\
+ const __m128i T25A = _mm_add_epi32(T15A, O5A);\
+ const __m128i T25B = _mm_add_epi32(T15B, O5B);\
+ const __m128i T26A = _mm_add_epi32(T16A, O6A);\
+ const __m128i T26B = _mm_add_epi32(T16B, O6B);\
+ const __m128i T27A = _mm_add_epi32(T17A, O7A);\
+ const __m128i T27B = _mm_add_epi32(T17B, O7B);\
+ const __m128i T2FA = _mm_sub_epi32(T10A, O0A);\
+ const __m128i T2FB = _mm_sub_epi32(T10B, O0B);\
+ const __m128i T2EA = _mm_sub_epi32(T11A, O1A);\
+ const __m128i T2EB = _mm_sub_epi32(T11B, O1B);\
+ const __m128i T2DA = _mm_sub_epi32(T12A, O2A);\
+ const __m128i T2DB = _mm_sub_epi32(T12B, O2B);\
+ const __m128i T2CA = _mm_sub_epi32(T13A, O3A);\
+ const __m128i T2CB = _mm_sub_epi32(T13B, O3B);\
+ const __m128i T2BA = _mm_sub_epi32(T14A, O4A);\
+ const __m128i T2BB = _mm_sub_epi32(T14B, O4B);\
+ const __m128i T2AA = _mm_sub_epi32(T15A, O5A);\
+ const __m128i T2AB = _mm_sub_epi32(T15B, O5B);\
+ const __m128i T29A = _mm_sub_epi32(T16A, O6A);\
+ const __m128i T29B = _mm_sub_epi32(T16B, O6B);\
+ const __m128i T28A = _mm_sub_epi32(T17A, O7A);\
+ const __m128i T28B = _mm_sub_epi32(T17B, O7B);\
+\
+ const __m128i T30A = _mm_srai_epi32(T20A, nShift);\
+ const __m128i T30B = _mm_srai_epi32(T20B, nShift);\
+ const __m128i T31A = _mm_srai_epi32(T21A, nShift);\
+ const __m128i T31B = _mm_srai_epi32(T21B, nShift);\
+ const __m128i T32A = _mm_srai_epi32(T22A, nShift);\
+ const __m128i T32B = _mm_srai_epi32(T22B, nShift);\
+ const __m128i T33A = _mm_srai_epi32(T23A, nShift);\
+ const __m128i T33B = _mm_srai_epi32(T23B, nShift);\
+ const __m128i T34A = _mm_srai_epi32(T24A, nShift);\
+ const __m128i T34B = _mm_srai_epi32(T24B, nShift);\
+ const __m128i T35A = _mm_srai_epi32(T25A, nShift);\
+ const __m128i T35B = _mm_srai_epi32(T25B, nShift);\
+ const __m128i T36A = _mm_srai_epi32(T26A, nShift);\
+ const __m128i T36B = _mm_srai_epi32(T26B, nShift);\
+ const __m128i T37A = _mm_srai_epi32(T27A, nShift);\
+ const __m128i T37B = _mm_srai_epi32(T27B, nShift);\
+\
+ const __m128i T38A = _mm_srai_epi32(T28A, nShift);\
+ const __m128i T38B = _mm_srai_epi32(T28B, nShift);\
+ const __m128i T39A = _mm_srai_epi32(T29A, nShift);\
+ const __m128i T39B = _mm_srai_epi32(T29B, nShift);\
+ const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);\
+ const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);\
+ const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);\
+ const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);\
+ const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);\
+ const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);\
+ const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);\
+ const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);\
+ const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);\
+ const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);\
+ const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);\
+ const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);\
+\
+ res00[part] = _mm_packs_epi32(T30A, T30B);\
+ res01[part] = _mm_packs_epi32(T31A, T31B);\
+ res02[part] = _mm_packs_epi32(T32A, T32B);\
+ res03[part] = _mm_packs_epi32(T33A, T33B);\
+ res04[part] = _mm_packs_epi32(T34A, T34B);\
+ res05[part] = _mm_packs_epi32(T35A, T35B);\
+ res06[part] = _mm_packs_epi32(T36A, T36B);\
+ res07[part] = _mm_packs_epi32(T37A, T37B);\
+\
+ res08[part] = _mm_packs_epi32(T38A, T38B);\
+ res09[part] = _mm_packs_epi32(T39A, T39B);\
+ res10[part] = _mm_packs_epi32(T3AA, T3AB);\
+ res11[part] = _mm_packs_epi32(T3BA, T3BB);\
+ res12[part] = _mm_packs_epi32(T3CA, T3CB);\
+ res13[part] = _mm_packs_epi32(T3DA, T3DB);\
+ res14[part] = _mm_packs_epi32(T3EA, T3EB);\
+ res15[part] = _mm_packs_epi32(T3FA, T3FB);
+
const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039);
@@ -354,9 +596,6 @@ void idct16(int32_t *src, int16_t *dst, intptr_t stride)
const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
- __m128i c32_rnd = _mm_set1_epi32(64);
-
- int nShift = 7;
// DCT1
__m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2];
@@ -364,359 +603,81 @@ void idct16(int32_t *src, int16_t *dst, intptr_t stride)
__m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2];
__m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2];
- for (int i = 0; i < 2; i++)
{
- const int offset = (i << 3);
- __m128i T00, T01;
-
- T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
- in00[i] = _mm_packs_epi32(T00, T01); // [07 06 05 04 03 02 01 00]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
- in01[i] = _mm_packs_epi32(T00, T01); // [17 16 15 14 13 12 11 10]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
- in02[i] = _mm_packs_epi32(T00, T01); // [27 26 25 24 23 22 21 20]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
- in03[i] = _mm_packs_epi32(T00, T01); // [37 36 35 34 33 32 31 30]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
- in04[i] = _mm_packs_epi32(T00, T01); // [47 46 45 44 43 42 41 40]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
- in05[i] = _mm_packs_epi32(T00, T01); // [57 56 55 54 53 52 51 50]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
- in06[i] = _mm_packs_epi32(T00, T01); // [67 66 65 64 63 62 61 60]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
- in07[i] = _mm_packs_epi32(T00, T01); // [77 76 75 74 73 72 71 70]
-
- T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
- in08[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
- in09[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
- in10[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
- in11[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
- in12[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
- in13[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
- in14[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
- in15[i] = _mm_packs_epi32(T00, T01);
+ READ_UNPACKHILO(0)
+ PROCESS(0, ADD1, SHIFT1)
}
- for (int pass = 0; pass < 2; pass++)
{
- if (pass == 1)
- {
- c32_rnd = _mm_set1_epi32(2048);
- nShift = 12;
- }
-
- for (int part = 0; part < 2; part++)
- {
- const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10]
- const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14]
- const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ]
- const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ]
- const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ]
- const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ]
- const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ]
- const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ]
- const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ]
- const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ]
- const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ]
- const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ]
- const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]row
- const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ]
- const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]); // [83 03 82 02 81 01 81 00] row08 row00
- const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]); // [87 07 86 06 85 05 84 04]
-
- __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;
- __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;
- {
- __m128i T00, T01;
-#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
- T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
- T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
- row = _mm_add_epi32(T00, T01);
-
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)
- COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)
-
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)
- COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)
-#undef COMPUTE_ROW
- }
-
- __m128i EO0A, EO1A, EO2A, EO3A;
- __m128i EO0B, EO1B, EO2B, EO3B;
- EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0
- EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));
- EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1
- EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));
- EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2
- EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));
- EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3
- EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));
-
- __m128i EEO0A, EEO1A;
- __m128i EEO0B, EEO1B;
- EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);
- EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);
- EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);
- EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);
-
- __m128i EEE0A, EEE1A;
- __m128i EEE0B, EEE1B;
- EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);
- EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);
- EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);
- EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);
-
- const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0
- const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
- const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1
- const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
- const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0
- const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);
- const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1
- const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);
-
- const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0
- const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
- const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1
- const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
- const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2
- const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
- const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3
- const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
- const __m128i E7A = _mm_sub_epi32(EE0A, EO0A); // E0 = EE0 - EO0
- const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);
- const __m128i E6A = _mm_sub_epi32(EE1A, EO1A); // E1 = EE1 - EO1
- const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);
- const __m128i E5A = _mm_sub_epi32(EE2A, EO2A); // E2 = EE2 - EO2
- const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);
- const __m128i E4A = _mm_sub_epi32(EE3A, EO3A); // E3 = EE3 - EO3
- const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);
-
- const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd
- const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
- const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd
- const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
- const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd
- const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
- const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd
- const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
- const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd
- const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
- const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd
- const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
- const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd
- const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
- const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd
- const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
-
- const __m128i T20A = _mm_add_epi32(T10A, O0A); // E0 + O0 + rnd
- const __m128i T20B = _mm_add_epi32(T10B, O0B);
- const __m128i T21A = _mm_add_epi32(T11A, O1A); // E1 + O1 + rnd
- const __m128i T21B = _mm_add_epi32(T11B, O1B);
- const __m128i T22A = _mm_add_epi32(T12A, O2A); // E2 + O2 + rnd
- const __m128i T22B = _mm_add_epi32(T12B, O2B);
- const __m128i T23A = _mm_add_epi32(T13A, O3A); // E3 + O3 + rnd
- const __m128i T23B = _mm_add_epi32(T13B, O3B);
- const __m128i T24A = _mm_add_epi32(T14A, O4A); // E4
- const __m128i T24B = _mm_add_epi32(T14B, O4B);
- const __m128i T25A = _mm_add_epi32(T15A, O5A); // E5
- const __m128i T25B = _mm_add_epi32(T15B, O5B);
- const __m128i T26A = _mm_add_epi32(T16A, O6A); // E6
- const __m128i T26B = _mm_add_epi32(T16B, O6B);
- const __m128i T27A = _mm_add_epi32(T17A, O7A); // E7
- const __m128i T27B = _mm_add_epi32(T17B, O7B);
- const __m128i T2FA = _mm_sub_epi32(T10A, O0A); // E0 - O0 + rnd
- const __m128i T2FB = _mm_sub_epi32(T10B, O0B);
- const __m128i T2EA = _mm_sub_epi32(T11A, O1A); // E1 - O1 + rnd
- const __m128i T2EB = _mm_sub_epi32(T11B, O1B);
- const __m128i T2DA = _mm_sub_epi32(T12A, O2A); // E2 - O2 + rnd
- const __m128i T2DB = _mm_sub_epi32(T12B, O2B);
- const __m128i T2CA = _mm_sub_epi32(T13A, O3A); // E3 - O3 + rnd
- const __m128i T2CB = _mm_sub_epi32(T13B, O3B);
- const __m128i T2BA = _mm_sub_epi32(T14A, O4A); // E4
- const __m128i T2BB = _mm_sub_epi32(T14B, O4B);
- const __m128i T2AA = _mm_sub_epi32(T15A, O5A); // E5
- const __m128i T2AB = _mm_sub_epi32(T15B, O5B);
- const __m128i T29A = _mm_sub_epi32(T16A, O6A); // E6
- const __m128i T29B = _mm_sub_epi32(T16B, O6B);
- const __m128i T28A = _mm_sub_epi32(T17A, O7A); // E7
- const __m128i T28B = _mm_sub_epi32(T17B, O7B);
-
- const __m128i T30A = _mm_srai_epi32(T20A, nShift); // [30 20 10 00]
- const __m128i T30B = _mm_srai_epi32(T20B, nShift); // [70 60 50 40]
- const __m128i T31A = _mm_srai_epi32(T21A, nShift); // [31 21 11 01]
- const __m128i T31B = _mm_srai_epi32(T21B, nShift); // [71 61 51 41]
- const __m128i T32A = _mm_srai_epi32(T22A, nShift); // [32 22 12 02]
- const __m128i T32B = _mm_srai_epi32(T22B, nShift); // [72 62 52 42]
- const __m128i T33A = _mm_srai_epi32(T23A, nShift); // [33 23 13 03]
- const __m128i T33B = _mm_srai_epi32(T23B, nShift); // [73 63 53 43]
- const __m128i T34A = _mm_srai_epi32(T24A, nShift); // [33 24 14 04]
- const __m128i T34B = _mm_srai_epi32(T24B, nShift); // [74 64 54 44]
- const __m128i T35A = _mm_srai_epi32(T25A, nShift); // [35 25 15 05]
- const __m128i T35B = _mm_srai_epi32(T25B, nShift); // [75 65 55 45]
- const __m128i T36A = _mm_srai_epi32(T26A, nShift); // [36 26 16 06]
- const __m128i T36B = _mm_srai_epi32(T26B, nShift); // [76 66 56 46]
- const __m128i T37A = _mm_srai_epi32(T27A, nShift); // [37 27 17 07]
- const __m128i T37B = _mm_srai_epi32(T27B, nShift); // [77 67 57 47]
-
- const __m128i T38A = _mm_srai_epi32(T28A, nShift); // [30 20 10 00] x8
- const __m128i T38B = _mm_srai_epi32(T28B, nShift); // [70 60 50 40]
- const __m128i T39A = _mm_srai_epi32(T29A, nShift); // [31 21 11 01] x9
- const __m128i T39B = _mm_srai_epi32(T29B, nShift); // [71 61 51 41]
- const __m128i T3AA = _mm_srai_epi32(T2AA, nShift); // [32 22 12 02] xA
- const __m128i T3AB = _mm_srai_epi32(T2AB, nShift); // [72 62 52 42]
- const __m128i T3BA = _mm_srai_epi32(T2BA, nShift); // [33 23 13 03] xB
- const __m128i T3BB = _mm_srai_epi32(T2BB, nShift); // [73 63 53 43]
- const __m128i T3CA = _mm_srai_epi32(T2CA, nShift); // [33 24 14 04] xC
- const __m128i T3CB = _mm_srai_epi32(T2CB, nShift); // [74 64 54 44]
- const __m128i T3DA = _mm_srai_epi32(T2DA, nShift); // [35 25 15 05] xD
- const __m128i T3DB = _mm_srai_epi32(T2DB, nShift); // [75 65 55 45]
- const __m128i T3EA = _mm_srai_epi32(T2EA, nShift); // [36 26 16 06] xE
- const __m128i T3EB = _mm_srai_epi32(T2EB, nShift); // [76 66 56 46]
- const __m128i T3FA = _mm_srai_epi32(T2FA, nShift); // [37 27 17 07] xF
- const __m128i T3FB = _mm_srai_epi32(T2FB, nShift); // [77 67 57 47]
-
- res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00]
- res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01]
- res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02]
- res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03]
- res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04]
- res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05]
- res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06]
- res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07]
-
- res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80]
- res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81]
- res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82]
- res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83]
- res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84]
- res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85]
- res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86]
- res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87]
- }
- //transpose matrix 8x8 16bit.
- {
- __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
- __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
-#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
- tr0_0 = _mm_unpacklo_epi16(I0, I1); \
- tr0_1 = _mm_unpacklo_epi16(I2, I3); \
- tr0_2 = _mm_unpackhi_epi16(I0, I1); \
- tr0_3 = _mm_unpackhi_epi16(I2, I3); \
- tr0_4 = _mm_unpacklo_epi16(I4, I5); \
- tr0_5 = _mm_unpacklo_epi16(I6, I7); \
- tr0_6 = _mm_unpackhi_epi16(I4, I5); \
- tr0_7 = _mm_unpackhi_epi16(I6, I7); \
- tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
-
- TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
- TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
- TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
- TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
+ READ_UNPACKHILO(8)
+ PROCESS(1, ADD1, SHIFT1)
+ }
+ {
+ __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
+ __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
+ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
+ TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
+ TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
+ TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
+ }
-#undef TRANSPOSE_8x8_16BIT
- }
+ {
+ UNPACKHILO(0)
+ PROCESS(0, ADD2, SHIFT2)
+ }
+ {
+ UNPACKHILO(1)
+ PROCESS(1, ADD2, SHIFT2)
}
- _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
- _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
- _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
- _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
- _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
- _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
- _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
- _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
- _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
- _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
- _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
- _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
- _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
- _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
- _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
- _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
- _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
- _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
- _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
- _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
- _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
- _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
- _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
- _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
- _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
- _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
- _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
- _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
- _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
- _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
- _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
- _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
+ {
+ __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
+ __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
+ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
+ _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
+ _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
+ _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
+ _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
+ _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
+ _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
+ _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
+ _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
+ TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
+ _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
+ _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
+ _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
+ _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
+ _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
+ _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
+ _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
+ _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
+ TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
+ _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
+ _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
+ _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
+ _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
+ _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
+ _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
+ _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
+ _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
+ TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
+ _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
+ _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
+ _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
+ _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
+ _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
+ _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
+ _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
+ _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
+ }
}
+#undef PROCESS
+#undef TRANSPOSE_8x8_16BIT
+#undef COMPUTE_ROW
+#undef UNPACKHILO
+#undef READ_UNPACKHILO
-void idct32(int32_t *src, int16_t *dst, intptr_t stride)
+void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
//Odd
const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0
@@ -896,9 +857,9 @@ void idct32(int32_t *src, int16_t *dst, intptr_t stride)
//EEEE
const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
- __m128i c32_rnd = _mm_set1_epi32(64);
+ __m128i c32_rnd = _mm_set1_epi32(ADD1);
- int nShift = 7;
+ int nShift = SHIFT1;
// DCT1
__m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4];
@@ -909,143 +870,46 @@ void idct32(int32_t *src, int16_t *dst, intptr_t stride)
for (int i = 0; i < 4; i++)
{
const int offset = (i << 3);
- __m128i T00, T01;
-
- T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
- in00[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
- in01[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
- in02[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
- in03[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
- in04[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
- in05[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
- in06[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
- in07[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
- in08[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
- in09[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
- in10[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
- in11[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
- in12[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
- in13[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
- in14[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
- in15[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
- in16[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
- in17[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
- in18[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
- in19[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
- in20[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
- in21[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
- in22[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
- in23[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
- in24[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
- in25[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
- in26[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
- in27[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
- in28[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
- in29[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
- in30[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
- in31[i] = _mm_packs_epi32(T00, T01);
+ in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
+ in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
+ in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
+ in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
+ in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
+ in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
+ in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
+ in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
+ in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
+ in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
+ in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
+ in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
+ in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
+ in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
+ in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
+ in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
+ in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
+ in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
+ in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
+ in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
+ in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
+ in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
+ in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
+ in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
+ in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
+ in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
+ in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
+ in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
+ in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
+ in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
+ in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
+ in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
}
for (int pass = 0; pass < 2; pass++)
{
if (pass == 1)
{
- c32_rnd = _mm_set1_epi32(2048);
- nShift = 12;
+ c32_rnd = _mm_set1_epi32(ADD2);
+ nShift = SHIFT2;
}
for (int part = 0; part < 4; part++)
@@ -1554,19 +1418,16 @@ void idct32(int32_t *src, int16_t *dst, intptr_t stride)
}
}
-#endif // if !HIGH_BIT_DEPTH
}
namespace x265 {
-void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives &p)
+void setupIntrinsicDCT_sse3(EncoderPrimitives &p)
{
- /* Note: We have AVX2 assembly for these two functions, but since AVX2 is
- * still somewhat rare on end-user PCs we still compile and link these SSE3
+ /* Note: We have AVX2 assembly for these functions, but since AVX2 is still
+ * somewhat rare on end-user PCs we still compile and link these SSE3
* intrinsic SIMD functions */
-#if !HIGH_BIT_DEPTH
- p.idct[IDCT_8x8] = idct8;
- p.idct[IDCT_16x16] = idct16;
- p.idct[IDCT_32x32] = idct32;
-#endif
+ p.cu[BLOCK_8x8].idct = idct8;
+ p.cu[BLOCK_16x16].idct = idct16;
+ p.cu[BLOCK_32x32].idct = idct32;
}
}
diff --git a/source/common/vec/dct-sse41.cpp b/source/common/vec/dct-sse41.cpp
index aa52709..8405118 100644
--- a/source/common/vec/dct-sse41.cpp
+++ b/source/common/vec/dct-sse41.cpp
@@ -36,7 +36,7 @@
using namespace x265;
namespace {
-void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num too large\n");
@@ -66,11 +66,7 @@ void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32
quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
- sign = _mm_srai_epi16(quantCoef12, 15);
- quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
- quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
}
}
else
@@ -100,18 +96,14 @@ void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32
quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
- sign = _mm_srai_epi16(quantCoef12, 15);
- quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
- quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
}
}
}
}
namespace x265 {
-void Setup_Vec_DCTPrimitives_sse41(EncoderPrimitives &p)
+void setupIntrinsicDCT_sse41(EncoderPrimitives &p)
{
p.dequant_scaling = dequant_scaling;
}
diff --git a/source/common/vec/dct-ssse3.cpp b/source/common/vec/dct-ssse3.cpp
index bbb7858..b452da6 100644
--- a/source/common/vec/dct-ssse3.cpp
+++ b/source/common/vec/dct-ssse3.cpp
@@ -36,7 +36,6 @@
using namespace x265;
-#if !HIGH_BIT_DEPTH
namespace {
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
{
@@ -100,11 +99,22 @@ ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) =
#undef MAKE_COEF
};
-void dct16(int16_t *src, int32_t *dst, intptr_t stride)
+void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
+#if HIGH_BIT_DEPTH
+#define SHIFT1 5
+#define ADD1 16
+#else
+#define SHIFT1 3
+#define ADD1 4
+#endif
+
+#define SHIFT2 10
+#define ADD2 512
+
// Const
- __m128i c_4 = _mm_set1_epi32(4);
- __m128i c_512 = _mm_set1_epi32(512);
+ __m128i c_4 = _mm_set1_epi32(ADD1);
+ __m128i c_512 = _mm_set1_epi32(ADD2);
int i;
@@ -192,29 +202,29 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
@@ -224,8 +234,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
@@ -235,8 +245,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
@@ -246,8 +256,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
@@ -257,8 +267,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
@@ -277,8 +287,8 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T63 = _mm_hadd_epi32(T66, T67); \
T60 = _mm_hadd_epi32(T60, T61); \
T61 = _mm_hadd_epi32(T62, T63); \
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
T70 = _mm_packs_epi32(T60, T61); \
_mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
@@ -342,10 +352,12 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T40 = _mm_hadd_epi32(T30, T31);
T41 = _mm_hsub_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
- _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ T41 = _mm_packs_epi32(T41, T41);
+ _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
+ _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
@@ -365,8 +377,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
@@ -386,8 +399,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
@@ -407,8 +421,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
@@ -428,8 +443,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
@@ -449,8 +465,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
@@ -470,8 +487,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_packs_epi32(T40, T40);
+ _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
#define MAKE_ODD(tab, dstPos) \
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
@@ -492,8 +510,9 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
T31 = _mm_hadd_epi32(T32, T33); \
\
T40 = _mm_hadd_epi32(T30, T31); \
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
- _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2); \
+ T40 = _mm_packs_epi32(T40, T40); \
+ _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
MAKE_ODD(14, 1);
MAKE_ODD(16, 3);
@@ -505,6 +524,10 @@ void dct16(int16_t *src, int32_t *dst, intptr_t stride)
MAKE_ODD(28, 15);
#undef MAKE_ODD
}
+#undef SHIFT1
+#undef ADD1
+#undef SHIFT2
+#undef ADD2
}
ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
@@ -657,11 +680,22 @@ ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) =
#undef MAKE_COEF16
};
-void dct32(int16_t *src, int32_t *dst, intptr_t stride)
+void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
+#if HIGH_BIT_DEPTH
+#define SHIFT1 6
+#define ADD1 32
+#else
+#define SHIFT1 4
+#define ADD1 8
+#endif
+
+#define SHIFT2 11
+#define ADD2 1024
+
// Const
- __m128i c_8 = _mm_set1_epi32(8);
- __m128i c_1024 = _mm_set1_epi32(1024);
+ __m128i c_8 = _mm_set1_epi32(ADD1);
+ __m128i c_1024 = _mm_set1_epi32(ADD2);
int i;
@@ -806,15 +840,15 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[0][i] = T60;
T50 = _mm_hsub_epi32(T40, T41);
T51 = _mm_hsub_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[16][i] = T60;
@@ -834,8 +868,8 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[8][i] = T60;
@@ -855,8 +889,8 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[24][i] = T60;
@@ -877,8 +911,8 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
\
T50 = _mm_hadd_epi32(T40, T41); \
T51 = _mm_hadd_epi32(T42, T43); \
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
T60 = _mm_packs_epi32(T50, T51); \
im[(dstPos)][i] = T60;
@@ -940,8 +974,8 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
\
T50 = _mm_hadd_epi32(T50, T51); \
T51 = _mm_hadd_epi32(T52, T53); \
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
T60 = _mm_packs_epi32(T50, T51); \
im[(dstPos)][i] = T60;
@@ -1049,8 +1083,9 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
\
T60 = _mm_hadd_epi32(T60, T61); \
\
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
- _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), SHIFT2); \
+ T60 = _mm_packs_epi32(T60, T60); \
+ _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
MAKE_ODD(44, 44, 44, 44, 0);
MAKE_ODD(45, 45, 45, 45, 16);
@@ -1090,19 +1125,20 @@ void dct32(int16_t *src, int32_t *dst, intptr_t stride)
MAKE_ODD(158, 159, 160, 161, 31);
#undef MAKE_ODD
}
+#undef SHIFT1
+#undef ADD1
+#undef SHIFT2
+#undef ADD2
}
}
-#endif // if !HIGH_BIT_DEPTH
namespace x265 {
-void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives &p)
+void setupIntrinsicDCT_ssse3(EncoderPrimitives &p)
{
/* Note: We have AVX2 assembly for these two functions, but since AVX2 is
* still somewhat rare on end-user PCs we still compile and link these SSSE3
* intrinsic SIMD functions */
-#if !HIGH_BIT_DEPTH
- p.dct[DCT_16x16] = dct16;
- p.dct[DCT_32x32] = dct32;
-#endif
+ p.cu[BLOCK_16x16].dct = dct16;
+ p.cu[BLOCK_32x32].dct = dct32;
}
}
diff --git a/source/common/vec/vec-primitives.cpp b/source/common/vec/vec-primitives.cpp
index c5d5405..3a13a01 100644
--- a/source/common/vec/vec-primitives.cpp
+++ b/source/common/vec/vec-primitives.cpp
@@ -53,29 +53,29 @@
namespace x265 {
// private x265 namespace
-void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives&);
-void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives&);
-void Setup_Vec_DCTPrimitives_sse41(EncoderPrimitives&);
+void setupIntrinsicDCT_sse3(EncoderPrimitives&);
+void setupIntrinsicDCT_ssse3(EncoderPrimitives&);
+void setupIntrinsicDCT_sse41(EncoderPrimitives&);
/* Use primitives for the best available vector architecture */
-void Setup_Instrinsic_Primitives(EncoderPrimitives &p, int cpuMask)
+void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
{
#ifdef HAVE_SSE3
if (cpuMask & X265_CPU_SSE3)
{
- Setup_Vec_DCTPrimitives_sse3(p);
+ setupIntrinsicDCT_sse3(p);
}
#endif
#ifdef HAVE_SSSE3
if (cpuMask & X265_CPU_SSSE3)
{
- Setup_Vec_DCTPrimitives_ssse3(p);
+ setupIntrinsicDCT_ssse3(p);
}
#endif
#ifdef HAVE_SSE4
if (cpuMask & X265_CPU_SSE4)
{
- Setup_Vec_DCTPrimitives_sse41(p);
+ setupIntrinsicDCT_sse41(p);
}
#endif
(void)p;
diff --git a/source/common/wavefront.cpp b/source/common/wavefront.cpp
index 17c44aa..533e768 100644
--- a/source/common/wavefront.cpp
+++ b/source/common/wavefront.cpp
@@ -33,14 +33,14 @@ bool WaveFront::init(int numRows)
{
m_numRows = numRows;
- m_numWords = (numRows + 63) >> 6;
- m_internalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+ m_numWords = (numRows + 31) >> 5;
+ m_internalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
if (m_internalDependencyBitmap)
- memset((void*)m_internalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
- m_externalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+ m_externalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
if (m_externalDependencyBitmap)
- memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
return m_internalDependencyBitmap && m_externalDependencyBitmap;
}
@@ -53,58 +53,31 @@ WaveFront::~WaveFront()
void WaveFront::clearEnabledRowMask()
{
- memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
}
void WaveFront::enqueueRow(int row)
{
- // thread safe
- uint64_t bit = 1LL << (row & 63);
-
- X265_CHECK(row < m_numRows, "invalid row\n");
- ATOMIC_OR(&m_internalDependencyBitmap[row >> 6], bit);
+ uint32_t bit = 1 << (row & 31);
+ ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
if (m_pool) m_pool->pokeIdleThread();
}
void WaveFront::enableRow(int row)
{
- // thread safe
- uint64_t bit = 1LL << (row & 63);
-
- X265_CHECK(row < m_numRows, "invalid row\n");
- ATOMIC_OR(&m_externalDependencyBitmap[row >> 6], bit);
+ uint32_t bit = 1 << (row & 31);
+ ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
}
void WaveFront::enableAllRows()
{
- memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint64_t) * m_numWords);
-}
-
-bool WaveFront::checkHigherPriorityRow(int curRow)
-{
- int fullwords = curRow >> 6;
- uint64_t mask = (1LL << (curRow & 63)) - 1;
-
- // Check full bitmap words before curRow
- for (int i = 0; i < fullwords; i++)
- {
- if (m_internalDependencyBitmap[i] & m_externalDependencyBitmap[i])
- return true;
- }
-
- // check the partially masked bitmap word of curRow
- if (m_internalDependencyBitmap[fullwords] & m_externalDependencyBitmap[fullwords] & mask)
- return true;
- return false;
+ memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint32_t) * m_numWords);
}
bool WaveFront::dequeueRow(int row)
{
- uint64_t oldval, newval;
-
- oldval = m_internalDependencyBitmap[row >> 6];
- newval = oldval & ~(1LL << (row & 63));
- return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval;
+ uint32_t bit = 1 << (row & 31);
+ return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
}
bool WaveFront::findJob(int threadId)
@@ -114,22 +87,21 @@ bool WaveFront::findJob(int threadId)
// thread safe
for (int w = 0; w < m_numWords; w++)
{
- uint64_t oldval = m_internalDependencyBitmap[w];
- while (oldval & m_externalDependencyBitmap[w])
+ uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
+ while (oldval)
{
- uint64_t mask = oldval & m_externalDependencyBitmap[w];
-
- CTZ64(id, mask);
+ CTZ(id, oldval);
- uint64_t newval = oldval & ~(1LL << id);
- if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval)
+ uint32_t bit = 1 << id;
+ if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
{
- // we cleared the bit, process row
- processRow(w * 64 + id, threadId);
+ /* we cleared the bit, we get to process the row */
+ processRow(w * 32 + id, threadId);
return true;
}
+
// some other thread cleared the bit, try another bit
- oldval = m_internalDependencyBitmap[w];
+ oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
}
}
diff --git a/source/common/wavefront.h b/source/common/wavefront.h
index a34b9a4..4692a41 100644
--- a/source/common/wavefront.h
+++ b/source/common/wavefront.h
@@ -43,8 +43,8 @@ private:
// Dependencies are categorized as internal and external. Internal dependencies
// are caused by neighbor block availability. External dependencies are generally
// reference frame reconstructed pixels being available.
- uint64_t volatile *m_internalDependencyBitmap;
- uint64_t volatile *m_externalDependencyBitmap;
+ uint32_t volatile *m_internalDependencyBitmap;
+ uint32_t volatile *m_externalDependencyBitmap;
// number of words in the bitmap
int m_numWords;
@@ -92,10 +92,6 @@ public:
// Start or resume encode processing of this row, must be implemented by
// derived classes.
virtual void processRow(int row, int threadId) = 0;
-
- // Returns true if a row above curRow is available for processing. The processRow()
- // method may call this function periodically and voluntarily exit
- bool checkHigherPriorityRow(int curRow);
};
} // end namespace x265
diff --git a/source/common/winxp.h b/source/common/winxp.h
index b105804..0446265 100644
--- a/source/common/winxp.h
+++ b/source/common/winxp.h
@@ -56,30 +56,6 @@ void cond_destroy(ConditionVariable *cond);
#define WakeAllConditionVariable x265::cond_broadcast
#define XP_CONDITION_VAR_FREE x265::cond_destroy
-#if defined(_MSC_VER)
-
-/* Windows XP did not define atomic OR 64, but gcc has a good version, so
- * only use this workaround when targeting XP with MSVC */
-FORCEINLINE LONGLONG interlocked_OR64(__inout LONGLONG volatile *Destination,
- __in LONGLONG Value)
-{
- LONGLONG Old;
-
- do
- {
- Old = *Destination;
- }
- while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old);
-
- return Old;
-}
-
-#define ATOMIC_OR(ptr, mask) x265::interlocked_OR64((volatile LONG64*)ptr, mask)
-
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#pragma intrinsic(_InterlockedCompareExchange64)
-#endif
-#endif // defined(_MSC_VER)
} // namespace x265
#else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600)
diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp
index ec1607d..03890db 100644
--- a/source/common/x86/asm-primitives.cpp
+++ b/source/common/x86/asm-primitives.cpp
@@ -39,919 +39,552 @@ extern "C" {
#include "dct8.h"
}
-#define INIT2_NAME(name1, name2, cpu) \
- p.name1[LUMA_16x16] = x265_pixel_ ## name2 ## _16x16 ## cpu; \
- p.name1[LUMA_16x8] = x265_pixel_ ## name2 ## _16x8 ## cpu;
-#define INIT4_NAME(name1, name2, cpu) \
- INIT2_NAME(name1, name2, cpu) \
- p.name1[LUMA_8x16] = x265_pixel_ ## name2 ## _8x16 ## cpu; \
- p.name1[LUMA_8x8] = x265_pixel_ ## name2 ## _8x8 ## cpu;
-#define INIT5_NAME(name1, name2, cpu) \
- INIT4_NAME(name1, name2, cpu) \
- p.name1[LUMA_8x4] = x265_pixel_ ## name2 ## _8x4 ## cpu;
-#define INIT6_NAME(name1, name2, cpu) \
- INIT5_NAME(name1, name2, cpu) \
- p.name1[LUMA_4x8] = x265_pixel_ ## name2 ## _4x8 ## cpu;
-#define INIT7_NAME(name1, name2, cpu) \
- INIT6_NAME(name1, name2, cpu) \
- p.name1[LUMA_4x4] = x265_pixel_ ## name2 ## _4x4 ## cpu;
-#define INIT8_NAME(name1, name2, cpu) \
- INIT7_NAME(name1, name2, cpu) \
- p.name1[LUMA_4x16] = x265_pixel_ ## name2 ## _4x16 ## cpu;
-#define INIT2(name, cpu) INIT2_NAME(name, name, cpu)
-#define INIT4(name, cpu) INIT4_NAME(name, name, cpu)
-#define INIT5(name, cpu) INIT5_NAME(name, name, cpu)
-#define INIT6(name, cpu) INIT6_NAME(name, name, cpu)
-#define INIT7(name, cpu) INIT7_NAME(name, name, cpu)
-#define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
-
-#define HEVC_SATD(cpu) \
- p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
- p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
- p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
- p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \
- p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \
- p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \
- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
- p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
- p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \
- p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
- p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \
- p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \
- p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \
- p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \
- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \
- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \
- p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
- p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
- p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
- p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \
- p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
- p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
- p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu;
-
-#define SAD_X3(cpu) \
- p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
- p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
- p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
- p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
- p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
- p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
- p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
- p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
- p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
- p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
- p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
- p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
- p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
- p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
- p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
- p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
-
-#define SAD_X4(cpu) \
- p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
- p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
- p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
- p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
- p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
- p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
- p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
- p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
- p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
- p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
- p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
- p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
- p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
- p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
- p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
- p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
-
-#define SAD(cpu) \
- p.sad[LUMA_8x32] = x265_pixel_sad_8x32_ ## cpu; \
- p.sad[LUMA_16x4] = x265_pixel_sad_16x4_ ## cpu; \
- p.sad[LUMA_16x12] = x265_pixel_sad_16x12_ ## cpu; \
- p.sad[LUMA_16x32] = x265_pixel_sad_16x32_ ## cpu; \
- p.sad[LUMA_16x64] = x265_pixel_sad_16x64_ ## cpu; \
- p.sad[LUMA_32x8] = x265_pixel_sad_32x8_ ## cpu; \
- p.sad[LUMA_32x16] = x265_pixel_sad_32x16_ ## cpu; \
- p.sad[LUMA_32x24] = x265_pixel_sad_32x24_ ## cpu; \
- p.sad[LUMA_32x32] = x265_pixel_sad_32x32_ ## cpu; \
- p.sad[LUMA_32x64] = x265_pixel_sad_32x64_ ## cpu; \
- p.sad[LUMA_64x16] = x265_pixel_sad_64x16_ ## cpu; \
- p.sad[LUMA_64x32] = x265_pixel_sad_64x32_ ## cpu; \
- p.sad[LUMA_64x48] = x265_pixel_sad_64x48_ ## cpu; \
- p.sad[LUMA_64x64] = x265_pixel_sad_64x64_ ## cpu; \
- p.sad[LUMA_48x64] = x265_pixel_sad_48x64_ ## cpu; \
- p.sad[LUMA_24x32] = x265_pixel_sad_24x32_ ## cpu; \
- p.sad[LUMA_12x16] = x265_pixel_sad_12x16_ ## cpu
-
-#define ASSGN_SSE(cpu) \
- p.sse_pp[LUMA_8x8] = x265_pixel_ssd_8x8_ ## cpu; \
- p.sse_pp[LUMA_8x4] = x265_pixel_ssd_8x4_ ## cpu; \
- p.sse_pp[LUMA_16x16] = x265_pixel_ssd_16x16_ ## cpu; \
- p.sse_pp[LUMA_16x4] = x265_pixel_ssd_16x4_ ## cpu; \
- p.sse_pp[LUMA_16x8] = x265_pixel_ssd_16x8_ ## cpu; \
- p.sse_pp[LUMA_8x16] = x265_pixel_ssd_8x16_ ## cpu; \
- p.sse_pp[LUMA_16x12] = x265_pixel_ssd_16x12_ ## cpu; \
- p.sse_pp[LUMA_32x32] = x265_pixel_ssd_32x32_ ## cpu; \
- p.sse_pp[LUMA_32x16] = x265_pixel_ssd_32x16_ ## cpu; \
- p.sse_pp[LUMA_16x32] = x265_pixel_ssd_16x32_ ## cpu; \
- p.sse_pp[LUMA_8x32] = x265_pixel_ssd_8x32_ ## cpu; \
- p.sse_pp[LUMA_32x8] = x265_pixel_ssd_32x8_ ## cpu; \
- p.sse_pp[LUMA_32x24] = x265_pixel_ssd_32x24_ ## cpu; \
- p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
- p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
-
-#define ASSGN_SSE_SS(cpu) \
- p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_ ## cpu; \
- p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_ ## cpu; \
- p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_ ## cpu; \
- p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_ ## cpu; \
- p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_ ## cpu; \
- p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_ ## cpu; \
- p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_ ## cpu; \
- p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_ ## cpu; \
- p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_ ## cpu; \
- p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_ ## cpu; \
- p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_ ## cpu; \
- p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_ ## cpu; \
- p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_ ## cpu; \
- p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_ ## cpu; \
- p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_ ## cpu; \
- p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_ ## cpu; \
- p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_ ## cpu; \
- p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_ ## cpu; \
- p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_ ## cpu; \
- p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_ ## cpu; \
- p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_ ## cpu; \
- p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_ ## cpu; \
- p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_ ## cpu; \
- p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_ ## cpu; \
- p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_ ## cpu;
-
-#define SA8D_INTER_FROM_BLOCK(cpu) \
- p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
- p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
- p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
- p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
- p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
- p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_ ## cpu; \
- p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_ ## cpu; \
- p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
- p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_ ## cpu; \
- p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_ ## cpu; \
- p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_ ## cpu; \
- p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_ ## cpu; \
- p.sa8d_inter[LUMA_32x8] = x265_pixel_sa8d_32x8_ ## cpu; \
- p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_ ## cpu; \
- p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
- p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_ ## cpu; \
- p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_ ## cpu; \
- p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
- p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \
- p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \
- p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \
- p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_ ## cpu; \
- p.sa8d_inter[LUMA_64x16] = x265_pixel_sa8d_64x16_ ## cpu; \
- p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_ ## cpu;
+#define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
+ p.cu[BLOCK_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
+ p.cu[BLOCK_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.cu[BLOCK_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu
+#define ALL_LUMA_TU_TYPED_S(prim, fncdef, fname, cpu) \
+ p.cu[BLOCK_4x4].prim = fncdef x265_ ## fname ## 4_ ## cpu; \
+ p.cu[BLOCK_8x8].prim = fncdef x265_ ## fname ## 8_ ## cpu; \
+ p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \
+ p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu
+#define ALL_LUMA_BLOCKS_TYPED(prim, fncdef, fname, cpu) \
+ p.cu[BLOCK_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.cu[BLOCK_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu;
+#define ALL_LUMA_CU(prim, fname, cpu) ALL_LUMA_CU_TYPED(prim, , fname, cpu)
+#define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu)
+#define ALL_LUMA_BLOCKS(prim, fname, cpu) ALL_LUMA_BLOCKS_TYPED(prim, , fname, cpu)
+#define ALL_LUMA_TU_S(prim, fname, cpu) ALL_LUMA_TU_TYPED_S(prim, , fname, cpu)
+
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
+ p.pu[LUMA_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.pu[LUMA_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.pu[LUMA_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.pu[LUMA_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu; \
+ p.pu[LUMA_8x4].prim = fncdef x265_ ## fname ## _8x4_ ## cpu; \
+ p.pu[LUMA_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
+ p.pu[LUMA_16x8].prim = fncdef x265_ ## fname ## _16x8_ ## cpu; \
+ p.pu[LUMA_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \
+ p.pu[LUMA_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
+ p.pu[LUMA_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
+ p.pu[LUMA_64x32].prim = fncdef x265_ ## fname ## _64x32_ ## cpu; \
+ p.pu[LUMA_32x64].prim = fncdef x265_ ## fname ## _32x64_ ## cpu; \
+ p.pu[LUMA_16x12].prim = fncdef x265_ ## fname ## _16x12_ ## cpu; \
+ p.pu[LUMA_12x16].prim = fncdef x265_ ## fname ## _12x16_ ## cpu; \
+ p.pu[LUMA_16x4].prim = fncdef x265_ ## fname ## _16x4_ ## cpu; \
+ p.pu[LUMA_4x16].prim = fncdef x265_ ## fname ## _4x16_ ## cpu; \
+ p.pu[LUMA_32x24].prim = fncdef x265_ ## fname ## _32x24_ ## cpu; \
+ p.pu[LUMA_24x32].prim = fncdef x265_ ## fname ## _24x32_ ## cpu; \
+ p.pu[LUMA_32x8].prim = fncdef x265_ ## fname ## _32x8_ ## cpu; \
+ p.pu[LUMA_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu; \
+ p.pu[LUMA_64x48].prim = fncdef x265_ ## fname ## _64x48_ ## cpu; \
+ p.pu[LUMA_48x64].prim = fncdef x265_ ## fname ## _48x64_ ## cpu; \
+ p.pu[LUMA_64x16].prim = fncdef x265_ ## fname ## _64x16_ ## cpu; \
+ p.pu[LUMA_16x64].prim = fncdef x265_ ## fname ## _16x64_ ## cpu
+#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_LUMA_PU_T(prim, fname) \
+ p.pu[LUMA_8x8].prim = fname<LUMA_8x8>; \
+ p.pu[LUMA_16x16].prim = fname<LUMA_16x16>; \
+ p.pu[LUMA_32x32].prim = fname<LUMA_32x32>; \
+ p.pu[LUMA_64x64].prim = fname<LUMA_64x64>; \
+ p.pu[LUMA_8x4].prim = fname<LUMA_8x4>; \
+ p.pu[LUMA_4x8].prim = fname<LUMA_4x8>; \
+ p.pu[LUMA_16x8].prim = fname<LUMA_16x8>; \
+ p.pu[LUMA_8x16].prim = fname<LUMA_8x16>; \
+ p.pu[LUMA_16x32].prim = fname<LUMA_16x32>; \
+ p.pu[LUMA_32x16].prim = fname<LUMA_32x16>; \
+ p.pu[LUMA_64x32].prim = fname<LUMA_64x32>; \
+ p.pu[LUMA_32x64].prim = fname<LUMA_32x64>; \
+ p.pu[LUMA_16x12].prim = fname<LUMA_16x12>; \
+ p.pu[LUMA_12x16].prim = fname<LUMA_12x16>; \
+ p.pu[LUMA_16x4].prim = fname<LUMA_16x4>; \
+ p.pu[LUMA_4x16].prim = fname<LUMA_4x16>; \
+ p.pu[LUMA_32x24].prim = fname<LUMA_32x24>; \
+ p.pu[LUMA_24x32].prim = fname<LUMA_24x32>; \
+ p.pu[LUMA_32x8].prim = fname<LUMA_32x8>; \
+ p.pu[LUMA_8x32].prim = fname<LUMA_8x32>; \
+ p.pu[LUMA_64x48].prim = fname<LUMA_64x48>; \
+ p.pu[LUMA_48x64].prim = fname<LUMA_48x64>; \
+ p.pu[LUMA_64x16].prim = fname<LUMA_64x16>; \
+ p.pu[LUMA_16x64].prim = fname<LUMA_16x64>
+
+#define ALL_CHROMA_420_CU_TYPED(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu
+#define ALL_CHROMA_420_CU_TYPED_S(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim = fncdef x265_ ## fname ## _4_ ## cpu; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim = fncdef x265_ ## fname ## _8_ ## cpu; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef x265_ ## fname ## _16_ ## cpu; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef x265_ ## fname ## _32_ ## cpu
+#define ALL_CHROMA_420_CU(prim, fname, cpu) ALL_CHROMA_420_CU_TYPED(prim, , fname, cpu)
+#define ALL_CHROMA_420_CU_S(prim, fname, cpu) ALL_CHROMA_420_CU_TYPED_S(prim, , fname, cpu)
+
+#define ALL_CHROMA_420_PU_TYPED(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim = fncdef x265_ ## fname ## _4x2_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim = fncdef x265_ ## fname ## _2x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim = fncdef x265_ ## fname ## _8x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim = fncdef x265_ ## fname ## _16x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim = fncdef x265_ ## fname ## _8x6_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim = fncdef x265_ ## fname ## _6x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim = fncdef x265_ ## fname ## _8x2_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim = fncdef x265_ ## fname ## _2x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef x265_ ## fname ## _16x12_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef x265_ ## fname ## _12x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim = fncdef x265_ ## fname ## _16x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim = fncdef x265_ ## fname ## _4x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef x265_ ## fname ## _32x24_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef x265_ ## fname ## _24x32_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef x265_ ## fname ## _32x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu
+#define ALL_CHROMA_420_PU(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_420_4x4_PU_TYPED(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim = fncdef x265_ ## fname ## _8x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim = fncdef x265_ ## fname ## _16x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef x265_ ## fname ## _16x12_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef x265_ ## fname ## _12x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim = fncdef x265_ ## fname ## _16x4_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim = fncdef x265_ ## fname ## _4x16_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef x265_ ## fname ## _32x24_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef x265_ ## fname ## _24x32_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef x265_ ## fname ## _32x8_ ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu
+#define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
+
+
+#define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].prim = fncdef x265_ ## fname ## _32x64_ ## cpu
+#define ALL_CHROMA_422_CU(prim, fname, cpu) ALL_CHROMA_422_CU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_422_PU_TYPED(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef x265_ ## fname ## _32x64_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim = fncdef x265_ ## fname ## _2x8_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim = fncdef x265_ ## fname ## _4x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef x265_ ## fname ## _16x64_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim = fncdef x265_ ## fname ## _8x12_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim = fncdef x265_ ## fname ## _6x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim = fncdef x265_ ## fname ## _8x4_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim = fncdef x265_ ## fname ## _2x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef x265_ ## fname ## _16x24_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef x265_ ## fname ## _12x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim = fncdef x265_ ## fname ## _16x8_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim = fncdef x265_ ## fname ## _4x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef x265_ ## fname ## _32x48_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef x265_ ## fname ## _24x64_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim = fncdef x265_ ## fname ## _8x64_ ## cpu
+#define ALL_CHROMA_422_PU(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].prim = fncdef x265_ ## fname ## _8x4_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].prim = fncdef x265_ ## fname ## _16x8_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].prim = fncdef x265_ ## fname ## _64x32_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].prim = fncdef x265_ ## fname ## _32x64_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].prim = fncdef x265_ ## fname ## _16x12_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].prim = fncdef x265_ ## fname ## _12x16_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].prim = fncdef x265_ ## fname ## _16x4_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].prim = fncdef x265_ ## fname ## _4x16_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].prim = fncdef x265_ ## fname ## _32x24_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].prim = fncdef x265_ ## fname ## _24x32_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].prim = fncdef x265_ ## fname ## _32x8_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].prim = fncdef x265_ ## fname ## _64x48_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef x265_ ## fname ## _48x64_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef x265_ ## fname ## _64x16_ ## cpu; \
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].prim = fncdef x265_ ## fname ## _16x64_ ## cpu
+#define ALL_CHROMA_444_PU(prim, fname, cpu) ALL_CHROMA_444_PU_TYPED(prim, , fname, cpu)
+
+#define AVC_LUMA_PU(name, cpu) \
+ p.pu[LUMA_16x16].name = x265_pixel_ ## name ## _16x16_ ## cpu; \
+ p.pu[LUMA_16x8].name = x265_pixel_ ## name ## _16x8_ ## cpu; \
+ p.pu[LUMA_8x16].name = x265_pixel_ ## name ## _8x16_ ## cpu; \
+ p.pu[LUMA_8x8].name = x265_pixel_ ## name ## _8x8_ ## cpu; \
+ p.pu[LUMA_8x4].name = x265_pixel_ ## name ## _8x4_ ## cpu; \
+ p.pu[LUMA_4x8].name = x265_pixel_ ## name ## _4x8_ ## cpu; \
+ p.pu[LUMA_4x4].name = x265_pixel_ ## name ## _4x4_ ## cpu; \
+ p.pu[LUMA_4x16].name = x265_pixel_ ## name ## _4x16_ ## cpu
+
+#define HEVC_SAD(cpu) \
+ p.pu[LUMA_8x32].sad = x265_pixel_sad_8x32_ ## cpu; \
+ p.pu[LUMA_16x4].sad = x265_pixel_sad_16x4_ ## cpu; \
+ p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_ ## cpu; \
+ p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_ ## cpu; \
+ p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu; \
+ p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_ ## cpu; \
+ p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_ ## cpu; \
+ p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_ ## cpu; \
+ p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_ ## cpu; \
+ p.pu[LUMA_32x64].sad = x265_pixel_sad_32x64_ ## cpu; \
+ p.pu[LUMA_64x16].sad = x265_pixel_sad_64x16_ ## cpu; \
+ p.pu[LUMA_64x32].sad = x265_pixel_sad_64x32_ ## cpu; \
+ p.pu[LUMA_64x48].sad = x265_pixel_sad_64x48_ ## cpu; \
+ p.pu[LUMA_64x64].sad = x265_pixel_sad_64x64_ ## cpu; \
+ p.pu[LUMA_48x64].sad = x265_pixel_sad_48x64_ ## cpu; \
+ p.pu[LUMA_24x32].sad = x265_pixel_sad_24x32_ ## cpu; \
+ p.pu[LUMA_12x16].sad = x265_pixel_sad_12x16_ ## cpu
+
+#define HEVC_SAD_X3(cpu) \
+ p.pu[LUMA_16x8].sad_x3 = x265_pixel_sad_x3_16x8_ ## cpu; \
+ p.pu[LUMA_16x12].sad_x3 = x265_pixel_sad_x3_16x12_ ## cpu; \
+ p.pu[LUMA_16x16].sad_x3 = x265_pixel_sad_x3_16x16_ ## cpu; \
+ p.pu[LUMA_16x32].sad_x3 = x265_pixel_sad_x3_16x32_ ## cpu; \
+ p.pu[LUMA_16x64].sad_x3 = x265_pixel_sad_x3_16x64_ ## cpu; \
+ p.pu[LUMA_32x8].sad_x3 = x265_pixel_sad_x3_32x8_ ## cpu; \
+ p.pu[LUMA_32x16].sad_x3 = x265_pixel_sad_x3_32x16_ ## cpu; \
+ p.pu[LUMA_32x24].sad_x3 = x265_pixel_sad_x3_32x24_ ## cpu; \
+ p.pu[LUMA_32x32].sad_x3 = x265_pixel_sad_x3_32x32_ ## cpu; \
+ p.pu[LUMA_32x64].sad_x3 = x265_pixel_sad_x3_32x64_ ## cpu; \
+ p.pu[LUMA_24x32].sad_x3 = x265_pixel_sad_x3_24x32_ ## cpu; \
+ p.pu[LUMA_48x64].sad_x3 = x265_pixel_sad_x3_48x64_ ## cpu; \
+ p.pu[LUMA_64x16].sad_x3 = x265_pixel_sad_x3_64x16_ ## cpu; \
+ p.pu[LUMA_64x32].sad_x3 = x265_pixel_sad_x3_64x32_ ## cpu; \
+ p.pu[LUMA_64x48].sad_x3 = x265_pixel_sad_x3_64x48_ ## cpu; \
+ p.pu[LUMA_64x64].sad_x3 = x265_pixel_sad_x3_64x64_ ## cpu
+
+#define HEVC_SAD_X4(cpu) \
+ p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_ ## cpu; \
+ p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_ ## cpu; \
+ p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_ ## cpu; \
+ p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_ ## cpu; \
+ p.pu[LUMA_16x64].sad_x4 = x265_pixel_sad_x4_16x64_ ## cpu; \
+ p.pu[LUMA_32x8].sad_x4 = x265_pixel_sad_x4_32x8_ ## cpu; \
+ p.pu[LUMA_32x16].sad_x4 = x265_pixel_sad_x4_32x16_ ## cpu; \
+ p.pu[LUMA_32x24].sad_x4 = x265_pixel_sad_x4_32x24_ ## cpu; \
+ p.pu[LUMA_32x32].sad_x4 = x265_pixel_sad_x4_32x32_ ## cpu; \
+ p.pu[LUMA_32x64].sad_x4 = x265_pixel_sad_x4_32x64_ ## cpu; \
+ p.pu[LUMA_24x32].sad_x4 = x265_pixel_sad_x4_24x32_ ## cpu; \
+ p.pu[LUMA_48x64].sad_x4 = x265_pixel_sad_x4_48x64_ ## cpu; \
+ p.pu[LUMA_64x16].sad_x4 = x265_pixel_sad_x4_64x16_ ## cpu; \
+ p.pu[LUMA_64x32].sad_x4 = x265_pixel_sad_x4_64x32_ ## cpu; \
+ p.pu[LUMA_64x48].sad_x4 = x265_pixel_sad_x4_64x48_ ## cpu; \
+ p.pu[LUMA_64x64].sad_x4 = x265_pixel_sad_x4_64x64_ ## cpu
+
+#define ASSIGN_SSE_PP(cpu) \
+ p.cu[BLOCK_8x8].sse_pp = x265_pixel_ssd_8x8_ ## cpu; \
+ p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_ ## cpu; \
+ p.cu[BLOCK_32x32].sse_pp = x265_pixel_ssd_32x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = x265_pixel_ssd_8x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = x265_pixel_ssd_16x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = x265_pixel_ssd_32x64_ ## cpu;
+
+#define ASSIGN_SSE_SS(cpu) ALL_LUMA_BLOCKS(sse_ss, pixel_ssd_ss, cpu)
+
+#define ASSIGN_SA8D(cpu) \
+ ALL_LUMA_CU(sa8d, pixel_sa8d, cpu); \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = x265_pixel_sa8d_8x16_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = x265_pixel_sa8d_16x32_ ## cpu; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = x265_pixel_sa8d_32x64_ ## cpu
#define PIXEL_AVG(cpu) \
- p.pixelavg_pp[LUMA_64x64] = x265_pixel_avg_64x64_ ## cpu; \
- p.pixelavg_pp[LUMA_64x48] = x265_pixel_avg_64x48_ ## cpu; \
- p.pixelavg_pp[LUMA_64x32] = x265_pixel_avg_64x32_ ## cpu; \
- p.pixelavg_pp[LUMA_64x16] = x265_pixel_avg_64x16_ ## cpu; \
- p.pixelavg_pp[LUMA_48x64] = x265_pixel_avg_48x64_ ## cpu; \
- p.pixelavg_pp[LUMA_32x64] = x265_pixel_avg_32x64_ ## cpu; \
- p.pixelavg_pp[LUMA_32x32] = x265_pixel_avg_32x32_ ## cpu; \
- p.pixelavg_pp[LUMA_32x24] = x265_pixel_avg_32x24_ ## cpu; \
- p.pixelavg_pp[LUMA_32x16] = x265_pixel_avg_32x16_ ## cpu; \
- p.pixelavg_pp[LUMA_32x8] = x265_pixel_avg_32x8_ ## cpu; \
- p.pixelavg_pp[LUMA_24x32] = x265_pixel_avg_24x32_ ## cpu; \
- p.pixelavg_pp[LUMA_16x64] = x265_pixel_avg_16x64_ ## cpu; \
- p.pixelavg_pp[LUMA_16x32] = x265_pixel_avg_16x32_ ## cpu; \
- p.pixelavg_pp[LUMA_16x16] = x265_pixel_avg_16x16_ ## cpu; \
- p.pixelavg_pp[LUMA_16x12] = x265_pixel_avg_16x12_ ## cpu; \
- p.pixelavg_pp[LUMA_16x8] = x265_pixel_avg_16x8_ ## cpu; \
- p.pixelavg_pp[LUMA_16x4] = x265_pixel_avg_16x4_ ## cpu; \
- p.pixelavg_pp[LUMA_12x16] = x265_pixel_avg_12x16_ ## cpu; \
- p.pixelavg_pp[LUMA_8x32] = x265_pixel_avg_8x32_ ## cpu; \
- p.pixelavg_pp[LUMA_8x16] = x265_pixel_avg_8x16_ ## cpu; \
- p.pixelavg_pp[LUMA_8x8] = x265_pixel_avg_8x8_ ## cpu; \
- p.pixelavg_pp[LUMA_8x4] = x265_pixel_avg_8x4_ ## cpu;
+ p.pu[LUMA_64x64].pixelavg_pp = x265_pixel_avg_64x64_ ## cpu; \
+ p.pu[LUMA_64x48].pixelavg_pp = x265_pixel_avg_64x48_ ## cpu; \
+ p.pu[LUMA_64x32].pixelavg_pp = x265_pixel_avg_64x32_ ## cpu; \
+ p.pu[LUMA_64x16].pixelavg_pp = x265_pixel_avg_64x16_ ## cpu; \
+ p.pu[LUMA_48x64].pixelavg_pp = x265_pixel_avg_48x64_ ## cpu; \
+ p.pu[LUMA_32x64].pixelavg_pp = x265_pixel_avg_32x64_ ## cpu; \
+ p.pu[LUMA_32x32].pixelavg_pp = x265_pixel_avg_32x32_ ## cpu; \
+ p.pu[LUMA_32x24].pixelavg_pp = x265_pixel_avg_32x24_ ## cpu; \
+ p.pu[LUMA_32x16].pixelavg_pp = x265_pixel_avg_32x16_ ## cpu; \
+ p.pu[LUMA_32x8].pixelavg_pp = x265_pixel_avg_32x8_ ## cpu; \
+ p.pu[LUMA_24x32].pixelavg_pp = x265_pixel_avg_24x32_ ## cpu; \
+ p.pu[LUMA_16x64].pixelavg_pp = x265_pixel_avg_16x64_ ## cpu; \
+ p.pu[LUMA_16x32].pixelavg_pp = x265_pixel_avg_16x32_ ## cpu; \
+ p.pu[LUMA_16x16].pixelavg_pp = x265_pixel_avg_16x16_ ## cpu; \
+ p.pu[LUMA_16x12].pixelavg_pp = x265_pixel_avg_16x12_ ## cpu; \
+ p.pu[LUMA_16x8].pixelavg_pp = x265_pixel_avg_16x8_ ## cpu; \
+ p.pu[LUMA_16x4].pixelavg_pp = x265_pixel_avg_16x4_ ## cpu; \
+ p.pu[LUMA_12x16].pixelavg_pp = x265_pixel_avg_12x16_ ## cpu; \
+ p.pu[LUMA_8x32].pixelavg_pp = x265_pixel_avg_8x32_ ## cpu; \
+ p.pu[LUMA_8x16].pixelavg_pp = x265_pixel_avg_8x16_ ## cpu; \
+ p.pu[LUMA_8x8].pixelavg_pp = x265_pixel_avg_8x8_ ## cpu; \
+ p.pu[LUMA_8x4].pixelavg_pp = x265_pixel_avg_8x4_ ## cpu;
#define PIXEL_AVG_W4(cpu) \
- p.pixelavg_pp[LUMA_4x4] = x265_pixel_avg_4x4_ ## cpu; \
- p.pixelavg_pp[LUMA_4x8] = x265_pixel_avg_4x8_ ## cpu; \
- p.pixelavg_pp[LUMA_4x16] = x265_pixel_avg_4x16_ ## cpu;
-
-#define SETUP_CHROMA_FUNC_DEF_420(W, H, cpu) \
- p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
- p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
- p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
- p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
- p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
- p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
-
-#define CHROMA_FILTERS_420(cpu) \
- SETUP_CHROMA_FUNC_DEF_420(4, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(4, 2, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(2, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(8, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(8, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(4, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(8, 6, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(6, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(8, 2, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(2, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(16, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(16, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(8, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(16, 12, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(12, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(16, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(4, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(32, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(32, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(16, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(32, 24, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(24, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(32, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_420(8, 32, cpu);
-
-#define CHROMA_FILTERS_422(cpu) \
- SETUP_CHROMA_FUNC_DEF_422(4, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(4, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(2, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(8, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(8, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(4, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(8, 12, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(6, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(8, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(2, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(16, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(16, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(8, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(16, 24, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(12, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(16, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(4, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(32, 64, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(32, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(16, 64, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(32, 48, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(24, 64, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(32, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_422(8, 64, cpu);
-
-#define CHROMA_FILTERS_444(cpu) \
- SETUP_CHROMA_FUNC_DEF_444(8, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(8, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(4, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(16, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(16, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(8, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(16, 12, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(12, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(16, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(4, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(32, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(32, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(16, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(32, 24, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(24, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(64, 64, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(64, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(32, 64, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(64, 48, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(48, 64, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(64, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(16, 64, cpu);
-
-#define CHROMA_SP_FILTERS_SSE4_420(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_420(4, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(4, 2, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(2, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(4, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(6, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(2, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(16, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(16, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(16, 12, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(12, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(16, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(4, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(32, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(32, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(16, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(32, 24, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(24, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(32, 8, cpu);
-
-#define CHROMA_SP_FILTERS_420(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_420(8, 2, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(8, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(8, 6, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(8, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(8, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_420(8, 32, cpu);
-
-#define CHROMA_SP_FILTERS_SSE4_422(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_422(4, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(4, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(2, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(4, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(6, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(2, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(16, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(16, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(16, 24, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(12, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(16, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(4, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(32, 64, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(32, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(16, 64, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(32, 48, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(24, 64, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(32, 16, cpu);
-
-#define CHROMA_SP_FILTERS_422(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_422(8, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(8, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(8, 12, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(8, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(8, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_422(8, 64, cpu);
-
-#define CHROMA_SP_FILTERS_SSE4_444(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_444(4, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(16, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(16, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(16, 12, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(12, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(16, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(4, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(32, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(32, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(16, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(32, 24, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(24, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(64, 64, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(64, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(32, 64, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(64, 48, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(48, 64, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(64, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(16, 64, cpu);
-
-#define CHROMA_SP_FILTERS_444(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_444(8, 8, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(8, 4, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(8, 16, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(8, 32, cpu);
-
-#define CHROMA_SS_FILTERS_420(cpu) \
- SETUP_CHROMA_SS_FUNC_DEF_420(4, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(4, 2, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(8, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(8, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(4, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(8, 6, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(8, 2, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(16, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(16, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(8, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(16, 12, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(12, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(16, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(4, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(32, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(32, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(16, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(32, 24, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(24, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(32, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(8, 32, cpu);
-
-#define CHROMA_SS_FILTERS_SSE4_420(cpu) \
- SETUP_CHROMA_SS_FUNC_DEF_420(2, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(2, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_420(6, 8, cpu);
-
-#define CHROMA_SS_FILTERS_422(cpu) \
- SETUP_CHROMA_SS_FUNC_DEF_422(4, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(4, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(8, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(8, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(4, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(8, 12, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(8, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(16, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(16, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(8, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(16, 24, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(12, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(16, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(4, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(32, 64, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(32, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(16, 64, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(32, 48, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(24, 64, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(32, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(8, 64, cpu);
-
-#define CHROMA_SS_FILTERS_SSE4_422(cpu) \
- SETUP_CHROMA_SS_FUNC_DEF_422(2, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(2, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_422(6, 16, cpu);
-
-#define CHROMA_SS_FILTERS_444(cpu) \
- SETUP_CHROMA_SS_FUNC_DEF_444(8, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(8, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(4, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(16, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(16, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(8, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(16, 12, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(12, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(16, 4, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(4, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(32, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(32, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(16, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(32, 24, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(24, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(64, 64, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(64, 32, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(32, 64, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(64, 48, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(48, 64, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(64, 16, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(16, 64, cpu);
-
-#if HIGH_BIT_DEPTH // temporary, until all 10bit functions are completed
-#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
- p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
- p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
-#else
-#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
- p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
- p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;
-#endif // if HIGH_BIT_DEPTH
-
-#define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
- p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
- p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
-
-#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
- p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
- p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
-
-#define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
- p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
-
-#define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
- p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
-
-#define CHROMA_BLOCKCOPY(type, cpu) \
- SETUP_CHROMA_BLOCKCOPY(type, 2, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 4, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 6, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 8, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 8, 6, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 8, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 12, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 16, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 16, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 24, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 32, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
-
-#define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) \
- p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
-
-#define CHROMA_BLOCKCOPY_422(type, cpu) \
- SETUP_CHROMA_BLOCKCOPY_422(type, 2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 2, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 4, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 6, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 8, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 8, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 8, 64, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 12, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 16, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 16, 64, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 24, 64, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 32, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 32, 48, cpu); \
- SETUP_CHROMA_BLOCKCOPY_422(type, 32, 64, cpu);
-
-#define LUMA_BLOCKCOPY(type, cpu) \
- SETUP_LUMA_BLOCKCOPY(type, 4, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 8, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 8, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 4, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 16, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 16, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 8, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 16, 12, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 12, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 16, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 4, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 32, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 32, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 16, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 32, 24, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 24, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 32, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 8, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 64, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 64, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 32, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 64, 48, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 48, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 64, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
-
-#define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
-
-#define CHROMA_BLOCKCOPY_SP(cpu) \
- SETUP_CHROMA_BLOCKCOPY_SP(2, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(4, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(6, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(8, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(8, 6, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(8, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(12, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(16, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(16, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(24, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(32, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
-
-#define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
-
-#define CHROMA_BLOCKCOPY_SP_422(cpu) \
- SETUP_CHROMA_BLOCKCOPY_SP_422(2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(2, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(4, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(6, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(8, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(8, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(8, 64, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(12, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(16, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(16, 64, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(24, 64, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(32, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(32, 48, cpu); \
- SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
-
-#define SETUP_CHROMA_PIXELSUB(W, H, cpu) \
- p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
-
-#define CHROMA_PIXELSUB_PS(cpu) \
- SETUP_CHROMA_PIXELSUB(4, 4, cpu); \
- SETUP_CHROMA_PIXELSUB(8, 8, cpu); \
- SETUP_CHROMA_PIXELSUB(16, 16, cpu); \
- SETUP_CHROMA_PIXELSUB(32, 32, cpu);
-
-#define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
-
-#define CHROMA_PIXELSUB_PS_422(cpu) \
- SETUP_CHROMA_PIXELSUB_422(4, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_422(8, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_422(32, 64, cpu);
+ p.pu[LUMA_4x4].pixelavg_pp = x265_pixel_avg_4x4_ ## cpu; \
+ p.pu[LUMA_4x8].pixelavg_pp = x265_pixel_avg_4x8_ ## cpu; \
+ p.pu[LUMA_4x16].pixelavg_pp = x265_pixel_avg_4x16_ ## cpu;
+
+#define CHROMA_420_FILTERS(cpu) \
+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+ ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
+ ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+ ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, cpu);
+
+#define CHROMA_422_FILTERS(cpu) \
+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+ ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
+ ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+ ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, cpu);
+
+#define CHROMA_444_FILTERS(cpu) \
+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+ ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
+ ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+ ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu);
+
+#define SETUP_CHROMA_420_VSP_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_420_VSP_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(6, 8, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(32, 8, cpu);
+
+#define CHROMA_420_VSP_FILTERS(cpu) \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_VSP_FUNC_DEF(8, 32, cpu);
+
+#define SETUP_CHROMA_422_VSP_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_422_VSP_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(32, 16, cpu);
+
+#define CHROMA_422_VSP_FILTERS(cpu) \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_422_VSP_FUNC_DEF(8, 64, cpu);
+
+#define SETUP_CHROMA_444_VSP_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_444_VSP_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(64, 64, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(16, 64, cpu);
+
+#define CHROMA_444_VSP_FILTERS(cpu) \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_444_VSP_FUNC_DEF(8, 32, cpu);
+
+#define SETUP_CHROMA_420_VSS_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+
+#define CHROMA_420_VSS_FILTERS(cpu) \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(8, 32, cpu);
+
+#define CHROMA_420_VSS_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_VSS_FUNC_DEF(6, 8, cpu);
+
+#define SETUP_CHROMA_422_VSS_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+
+#define CHROMA_422_VSS_FILTERS(cpu) \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_422_VSS_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_422_VSS_FUNC_DEF(6, 16, cpu);
+
+#define CHROMA_444_VSS_FILTERS(cpu) ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, cpu)
#define LUMA_FILTERS(cpu) \
- SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
- SETUP_LUMA_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_FUNC_DEF(8, 4, cpu); \
- SETUP_LUMA_FUNC_DEF(4, 8, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 8, cpu); \
- SETUP_LUMA_FUNC_DEF(8, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 12, cpu); \
- SETUP_LUMA_FUNC_DEF(12, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 4, cpu); \
- SETUP_LUMA_FUNC_DEF(4, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_FUNC_DEF(32, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 32, cpu); \
- SETUP_LUMA_FUNC_DEF(32, 24, cpu); \
- SETUP_LUMA_FUNC_DEF(24, 32, cpu); \
- SETUP_LUMA_FUNC_DEF(32, 8, cpu); \
- SETUP_LUMA_FUNC_DEF(8, 32, cpu); \
- SETUP_LUMA_FUNC_DEF(64, 64, cpu); \
- SETUP_LUMA_FUNC_DEF(64, 32, cpu); \
- SETUP_LUMA_FUNC_DEF(32, 64, cpu); \
- SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
- SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
- SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 64, cpu);
+ ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, cpu); p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_ ## cpu; \
+ ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, cpu); p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_ ## cpu; \
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, cpu); p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_ ## cpu; \
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, cpu); p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_ ## cpu; \
+ ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, cpu); p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_ ## cpu; \
+ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
+
+#define LUMA_VSS_FILTERS(cpu) ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, cpu); p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_ ## cpu
+
+#define LUMA_CU_BLOCKCOPY(type, cpu) \
+ p.cu[BLOCK_4x4].copy_ ## type = x265_blockcopy_ ## type ## _4x4_ ## cpu; \
+ ALL_LUMA_CU(copy_ ## type, blockcopy_ ## type, cpu);
+
+#define CHROMA_420_CU_BLOCKCOPY(type, cpu) ALL_CHROMA_420_CU(copy_ ## type, blockcopy_ ## type, cpu)
+#define CHROMA_422_CU_BLOCKCOPY(type, cpu) ALL_CHROMA_422_CU(copy_ ## type, blockcopy_ ## type, cpu)
+
+#define LUMA_PU_BLOCKCOPY(type, cpu) ALL_LUMA_PU(copy_ ## type, blockcopy_ ## type, cpu); p.pu[LUMA_4x4].copy_ ## type = x265_blockcopy_ ## type ## _4x4_ ## cpu
+#define CHROMA_420_PU_BLOCKCOPY(type, cpu) ALL_CHROMA_420_PU(copy_ ## type, blockcopy_ ## type, cpu)
+#define CHROMA_422_PU_BLOCKCOPY(type, cpu) ALL_CHROMA_422_PU(copy_ ## type, blockcopy_ ## type, cpu)
#define LUMA_PIXELSUB(cpu) \
- SETUP_LUMA_SUB_FUNC_DEF(4, 4, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu);
-
-#define LUMA_SP_FILTERS(cpu) \
- SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(8, 4, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(4, 8, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(16, 8, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(8, 16, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(16, 4, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(4, 16, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(32, 8, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(8, 32, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
-
-#define LUMA_SS_FILTERS(cpu) \
- SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(8, 4, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(4, 8, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(16, 8, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(8, 16, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(16, 4, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(4, 16, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(32, 8, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(8, 32, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
-
-#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
- p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
-
-#define LUMA_VAR(cpu) \
- SETUP_PIXEL_VAR_DEF(8, 8, cpu); \
- SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
- SETUP_PIXEL_VAR_DEF(32, 32, cpu); \
- SETUP_PIXEL_VAR_DEF(64, 64, cpu);
-
-#define SETUP_PIXEL_SSE_SP_DEF(W, H, cpu) \
- p.sse_sp[LUMA_ ## W ## x ## H] = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu;
-
-#define LUMA_SSE_SP(cpu) \
- SETUP_PIXEL_SSE_SP_DEF(4, 4, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(8, 8, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(8, 4, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(4, 8, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(16, 16, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(16, 8, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(8, 16, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(16, 12, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(12, 16, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(16, 4, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(4, 16, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(32, 32, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(32, 16, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(16, 32, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(32, 24, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(24, 32, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(32, 8, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(8, 32, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(64, 64, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(64, 32, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(32, 64, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(64, 48, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(48, 64, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
- SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
-
-#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
- p.luma_addAvg[LUMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
-
-#define LUMA_ADDAVG(cpu) \
- SETUP_LUMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
- SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu); \
-
-#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
- p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
-
-#define CHROMA_ADDAVG(cpu) \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(6, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
-
-#define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
-
-#define CHROMA_ADDAVG_422(cpu) \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 4, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(6, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 4, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 12, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 64, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(12, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 8, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 24, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 64, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(24, 64, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 16, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 32, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
+ p.cu[BLOCK_4x4].sub_ps = x265_pixel_sub_ps_4x4_ ## cpu; \
+ p.cu[BLOCK_4x4].add_ps = x265_pixel_add_ps_4x4_ ## cpu; \
+ ALL_LUMA_CU(sub_ps, pixel_sub_ps, cpu); \
+ ALL_LUMA_CU(add_ps, pixel_add_ps, cpu);
-#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
+#define CHROMA_420_PIXELSUB_PS(cpu) \
+ ALL_CHROMA_420_CU(sub_ps, pixel_sub_ps, cpu); \
+ ALL_CHROMA_420_CU(add_ps, pixel_add_ps, cpu);
+
+#define CHROMA_422_PIXELSUB_PS(cpu) \
+ ALL_CHROMA_422_CU(sub_ps, pixel_sub_ps, cpu); \
+ ALL_CHROMA_422_CU(add_ps, pixel_add_ps, cpu);
-#define SETUP_INTRA_ANG(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
+#define LUMA_VAR(cpu) ALL_LUMA_CU(var, pixel_var, cpu)
+
+#define LUMA_ADDAVG(cpu) ALL_LUMA_PU(addAvg, addAvg, cpu); p.pu[LUMA_4x4].addAvg = x265_addAvg_4x4_ ## cpu
+#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg, addAvg, cpu);
+#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg, addAvg, cpu);
+
+#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
+ p.cu[BLOCK_4x4].intra_pred[mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_8x8].intra_pred[mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_16x16].intra_pred[mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_32x32].intra_pred[mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG4(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
+ p.cu[BLOCK_4x4].intra_pred[mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG16_32(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
+ p.cu[BLOCK_16x16].intra_pred[mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_32x32].intra_pred[mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG4_8(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
+ p.cu[BLOCK_4x4].intra_pred[mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_8x8].intra_pred[mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
#define INTRA_ANG_SSSE3(cpu) \
SETUP_INTRA_ANG_COMMON(2, 2, cpu); \
@@ -975,22 +608,27 @@ extern "C" {
SETUP_INTRA_ANG_COMMON(17, 17, cpu); \
SETUP_INTRA_ANG_COMMON(18, 18, cpu);
+#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
+ p.cu[BLOCK_8x8].intra_pred[mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_16x16].intra_pred[mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
+ p.cu[BLOCK_32x32].intra_pred[mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
+
#define INTRA_ANG_SSE4_HIGH(cpu) \
- SETUP_INTRA_ANG(19, 19, cpu); \
- SETUP_INTRA_ANG(20, 20, cpu); \
- SETUP_INTRA_ANG(21, 21, cpu); \
- SETUP_INTRA_ANG(22, 22, cpu); \
- SETUP_INTRA_ANG(23, 23, cpu); \
- SETUP_INTRA_ANG(24, 24, cpu); \
- SETUP_INTRA_ANG(25, 25, cpu); \
- SETUP_INTRA_ANG(26, 26, cpu); \
- SETUP_INTRA_ANG(27, 27, cpu); \
- SETUP_INTRA_ANG(28, 28, cpu); \
- SETUP_INTRA_ANG(29, 29, cpu); \
- SETUP_INTRA_ANG(30, 30, cpu); \
- SETUP_INTRA_ANG(31, 31, cpu); \
- SETUP_INTRA_ANG(32, 32, cpu); \
- SETUP_INTRA_ANG(33, 33, cpu); \
+ SETUP_INTRA_ANG_HIGH(19, 19, cpu); \
+ SETUP_INTRA_ANG_HIGH(20, 20, cpu); \
+ SETUP_INTRA_ANG_HIGH(21, 21, cpu); \
+ SETUP_INTRA_ANG_HIGH(22, 22, cpu); \
+ SETUP_INTRA_ANG_HIGH(23, 23, cpu); \
+ SETUP_INTRA_ANG_HIGH(24, 24, cpu); \
+ SETUP_INTRA_ANG_HIGH(25, 25, cpu); \
+ SETUP_INTRA_ANG_HIGH(26, 26, cpu); \
+ SETUP_INTRA_ANG_HIGH(27, 27, cpu); \
+ SETUP_INTRA_ANG_HIGH(28, 28, cpu); \
+ SETUP_INTRA_ANG_HIGH(29, 29, cpu); \
+ SETUP_INTRA_ANG_HIGH(30, 30, cpu); \
+ SETUP_INTRA_ANG_HIGH(31, 31, cpu); \
+ SETUP_INTRA_ANG_HIGH(32, 32, cpu); \
+ SETUP_INTRA_ANG_HIGH(33, 33, cpu); \
SETUP_INTRA_ANG4(19, 17, cpu); \
SETUP_INTRA_ANG4(20, 16, cpu); \
SETUP_INTRA_ANG4(21, 15, cpu); \
@@ -1039,770 +677,709 @@ extern "C" {
SETUP_INTRA_ANG16_32(32, 32, cpu); \
SETUP_INTRA_ANG16_32(33, 33, cpu);
-#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
- p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define CHROMA_VERT_FILTERS(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu);
-
-#define CHROMA_VERT_FILTERS_SSE4(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
-
-#define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define CHROMA_VERT_FILTERS_422(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(8, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(8, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(4, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(8, 12, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(8, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(16, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(16, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(8, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(16, 24, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(12, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(16, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(4, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(32, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(32, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(16, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(32, 48, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(24, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(32, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(8, 64, cpu);
-
-#define CHROMA_VERT_FILTERS_SSE4_422(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF_422(2, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(2, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(4, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
-
-#define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) \
- p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-
-#define CHROMA_VERT_FILTERS_444(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(8, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(4, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(16, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(16, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(8, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(16, 12, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(12, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(16, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(4, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(32, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(32, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(16, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(32, 24, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(24, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(8, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(64, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(64, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(32, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(64, 48, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(48, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(64, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu);
-
-#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
- p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
-
-#define CHROMA_HORIZ_FILTERS(cpu) \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu);
-
-#define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) \
- p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
-
-#define CHROMA_HORIZ_FILTERS_422(cpu) \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 12, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(6, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 24, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(12, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 48, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(24, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu);
-
-#define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) \
- p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
-
-#define CHROMA_HORIZ_FILTERS_444(cpu) \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 12, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(12, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 24, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(24, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 48, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(48, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 64, cpu);
+#define CHROMA_420_VERT_FILTERS(cpu) \
+ ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, cpu); \
+ ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+ ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, cpu); \
+ ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, cpu)
+
+#define SETUP_CHROMA_420_VERT_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_420_VERT_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(6, 8, cpu);
+
+#define SETUP_CHROMA_422_VERT_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_422_VERT_FILTERS(cpu) \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_422_VERT_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_422_VERT_FUNC_DEF(6, 16, cpu);
+
+#define CHROMA_444_VERT_FILTERS(cpu) \
+ ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, cpu); \
+ ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+ ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu); \
+ ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, cpu)
+
+#define CHROMA_420_HORIZ_FILTERS(cpu) \
+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+ ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu);
+
+#define SETUP_CHROMA_422_HORIZ_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_422_HORIZ_FILTERS(cpu) \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_444_HORIZ_FILTERS(cpu) \
+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+ ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu);
namespace x265 {
// private x265 namespace
-#if HIGH_BIT_DEPTH
-/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives */
-template<int log2Size>
-void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
+template<int size>
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
{
- const int size = 1 << log2Size;
- const int sizeIdx = log2Size - 2;
- ALIGN_VAR_32(pixel, buffer[32 * 32]);
+ ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA)]);
+ const int filterSize = NTAPS_LUMA;
+ const int halfFilterSize = filterSize >> 1;
- for (int mode = 2; mode <= 34; mode++)
- {
- pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0);
- pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0);
- pixel *out = dest + ((mode - 2) << (log2Size * 2));
-
- if (mode < 18)
- {
- primitives.intra_pred[mode][sizeIdx](buffer, size, left, above, mode, bLuma);
- primitives.transpose[sizeIdx](out, buffer, size);
- }
- else
- primitives.intra_pred[mode][sizeIdx](out, size, left, above, mode, bLuma);
- }
+ x265::primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
+ x265::primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
}
-#endif
-void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
-{
#if HIGH_BIT_DEPTH
+
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 16bpp
+{
if (cpuMask & X265_CPU_SSE2)
{
- INIT8(sad, _mmx2);
- INIT2(sad, _sse2);
- SAD(sse2);
-
- INIT6(satd, _sse2);
- HEVC_SATD(sse2);
- p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
-
- p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
- SA8D_INTER_FROM_BLOCK(sse2);
- p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
- p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
-
- p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_mmx2;
- p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_mmx2;
- p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_mmx2;
- p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_sse2;
- p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2;
- p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2;
- p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2;
- p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2;
- p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2;
- p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2;
- p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2;
- p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2;
- p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2;
- p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2;
- p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2;
- p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2;
- p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2;
- p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2;
- p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
- p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
- p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2;
- p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2;
- p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2;
- p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2;
- p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2;
-
- p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
- p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
- p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
- p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
- p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
+ /* We do not differentiate CPUs which support MMX and not SSE2. We only check
+ * for SSE2 and then use both MMX and SSE2 functions */
+ AVC_LUMA_PU(sad, mmx2);
+
+ p.pu[LUMA_16x16].sad = x265_pixel_sad_16x16_sse2;
+ p.pu[LUMA_16x8].sad = x265_pixel_sad_16x8_sse2;
+ HEVC_SAD(sse2);
+
+ p.pu[LUMA_4x4].sad_x3 = x265_pixel_sad_x3_4x4_mmx2;
+ p.pu[LUMA_4x8].sad_x3 = x265_pixel_sad_x3_4x8_mmx2;
+ p.pu[LUMA_4x16].sad_x3 = x265_pixel_sad_x3_4x16_mmx2;
+ p.pu[LUMA_8x4].sad_x3 = x265_pixel_sad_x3_8x4_sse2;
+ p.pu[LUMA_8x8].sad_x3 = x265_pixel_sad_x3_8x8_sse2;
+ p.pu[LUMA_8x16].sad_x3 = x265_pixel_sad_x3_8x16_sse2;
+ p.pu[LUMA_8x32].sad_x3 = x265_pixel_sad_x3_8x32_sse2;
+ p.pu[LUMA_16x4].sad_x3 = x265_pixel_sad_x3_16x4_sse2;
+ p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_mmx2;
+ HEVC_SAD_X3(sse2);
+
+ p.pu[LUMA_4x4].sad_x4 = x265_pixel_sad_x4_4x4_mmx2;
+ p.pu[LUMA_4x8].sad_x4 = x265_pixel_sad_x4_4x8_mmx2;
+ p.pu[LUMA_4x16].sad_x4 = x265_pixel_sad_x4_4x16_mmx2;
+ p.pu[LUMA_8x4].sad_x4 = x265_pixel_sad_x4_8x4_sse2;
+ p.pu[LUMA_8x8].sad_x4 = x265_pixel_sad_x4_8x8_sse2;
+ p.pu[LUMA_8x16].sad_x4 = x265_pixel_sad_x4_8x16_sse2;
+ p.pu[LUMA_8x32].sad_x4 = x265_pixel_sad_x4_8x32_sse2;
+ p.pu[LUMA_16x4].sad_x4 = x265_pixel_sad_x4_16x4_sse2;
+ p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_mmx2;
+ HEVC_SAD_X4(sse2);
+
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_mmx2;
+ ALL_LUMA_PU(satd, pixel_satd, sse2);
+
+ ASSIGN_SA8D(sse2);
+ LUMA_PIXELSUB(sse2);
+ CHROMA_420_PIXELSUB_PS(sse2);
+ CHROMA_422_PIXELSUB_PS(sse2);
+
+ LUMA_CU_BLOCKCOPY(ss, sse2);
+ CHROMA_420_CU_BLOCKCOPY(ss, sse2);
+ CHROMA_422_CU_BLOCKCOPY(ss, sse2);
+
+ p.pu[LUMA_4x4].copy_pp = (copy_pp_t)x265_blockcopy_ss_4x4_sse2;
+ ALL_LUMA_PU_TYPED(copy_pp, (copy_pp_t), blockcopy_ss, sse2);
+ ALL_CHROMA_420_PU_TYPED(copy_pp, (copy_pp_t), blockcopy_ss, sse2);
+ ALL_CHROMA_422_PU_TYPED(copy_pp, (copy_pp_t), blockcopy_ss, sse2);
+
+ CHROMA_420_VERT_FILTERS(sse2);
+ CHROMA_422_VERT_FILTERS(_sse2);
+ CHROMA_444_VERT_FILTERS(sse2);
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
- LUMA_VAR(_sse2);
-
- SAD_X3(sse2);
- p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
- p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
- p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
- p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
- p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
- p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
- p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
- p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
- p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
-
- SAD_X4(sse2);
- p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
- p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
- p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
- p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
- p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
- p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
- p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
- p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
- p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
-
- p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
-
- CHROMA_PIXELSUB_PS(_sse2);
- CHROMA_PIXELSUB_PS_422(_sse2);
- LUMA_PIXELSUB(_sse2);
-
- CHROMA_BLOCKCOPY(ss, _sse2);
- CHROMA_BLOCKCOPY_422(ss, _sse2);
- LUMA_BLOCKCOPY(ss, _sse2);
-
- CHROMA_VERT_FILTERS(_sse2);
- CHROMA_VERT_FILTERS_422(_sse2);
- CHROMA_VERT_FILTERS_444(_sse2);
+ LUMA_VAR(sse2);
+
p.luma_p2s = x265_luma_p2s_sse2;
- p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
- p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
- p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
-
- p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
- p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
- p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
- p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
-
- // TODO: overflow on 12-bits mode!
- p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
- p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
- p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
- p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
-
- p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
- p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
- p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
- p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
-
- p.dct[DCT_4x4] = x265_dct4_sse2;
- p.idct[IDCT_4x4] = x265_idct4_sse2;
- p.idct[IDST_4x4] = x265_idst4_sse2;
-
- LUMA_SS_FILTERS(_sse2);
+ p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
+ p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
+
+ ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
+ ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
+ ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
+ ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+ ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
+ ALL_LUMA_TU_S(transpose, transpose, sse2);
+
+ p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2;
+ ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
+
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_4x8_mmx2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_8x16_sse2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_16x32_sse2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_32x64_sse2;
+
+ p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
+ p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
+#if X86_64
+ p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
+#endif
+ p.idst4x4 = x265_idst4_sse2;
+
+ LUMA_VSS_FILTERS(sse2);
+
+ p.frameInitLowres = x265_frame_init_lowres_core_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
+ // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_ssse3; this one is broken
+ ALL_LUMA_PU(satd, pixel_satd, ssse3);
+ ASSIGN_SA8D(ssse3);
INTRA_ANG_SSSE3(ssse3);
- p.dct[DST_4x4] = x265_dst4_ssse3;
- p.idct[IDCT_8x8] = x265_idct8_ssse3;
+ p.dst4x4 = x265_dst4_ssse3;
+ p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
p.count_nonzero = x265_count_nonzero_ssse3;
+ p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
- LUMA_ADDAVG(_sse4);
- CHROMA_ADDAVG(_sse4);
- CHROMA_ADDAVG_422(_sse4);
- LUMA_FILTERS(_sse4);
- CHROMA_HORIZ_FILTERS(_sse4);
- CHROMA_VERT_FILTERS_SSE4(_sse4);
- CHROMA_HORIZ_FILTERS_422(_sse4);
- CHROMA_VERT_FILTERS_SSE4_422(_sse4);
- CHROMA_HORIZ_FILTERS_444(_sse4);
-
- p.dct[DCT_8x8] = x265_dct8_sse4;
+ LUMA_ADDAVG(sse4);
+ CHROMA_420_ADDAVG(sse4);
+ CHROMA_422_ADDAVG(sse4);
+
+ LUMA_FILTERS(sse4);
+ CHROMA_420_HORIZ_FILTERS(sse4);
+ CHROMA_420_VERT_FILTERS_SSE4(_sse4);
+ CHROMA_422_HORIZ_FILTERS(_sse4);
+ CHROMA_422_VERT_FILTERS_SSE4(_sse4);
+ CHROMA_444_HORIZ_FILTERS(sse4);
+
+ p.cu[BLOCK_8x8].dct = x265_dct8_sse4;
p.quant = x265_quant_sse4;
p.nquant = x265_nquant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
- p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
- p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
- p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
- p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
- p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
- p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
- p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
- p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
- p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4;
-
- p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4;
- p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
- p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
- p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
- p.planecopy_cp = x265_upShift_8_sse4;
+ // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_sse4; fails tests
+ ALL_LUMA_PU(satd, pixel_satd, sse4);
+ ASSIGN_SA8D(sse4);
+
+ ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
+ ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4_HIGH(sse4);
+
+ p.planecopy_cp = x265_upShift_8_sse4;
+ p.weight_pp = x265_weight_pp_sse4;
+ p.weight_sp = x265_weight_sp_sse4;
+
+ p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
+ p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
+
+#if X86_64
+ ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
+ ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
+#endif
+ }
+ if (cpuMask & X265_CPU_AVX)
+ {
+ // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx; fails tests
+ ALL_LUMA_PU(satd, pixel_satd, avx);
+ ASSIGN_SA8D(avx);
+ LUMA_VAR(avx);
+ p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
+ p.ssim_end_4 = x265_pixel_ssim_end4_avx;
+ p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
+ p.pu[LUMA_16x4].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x4_avx;
+ p.pu[LUMA_16x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x8_avx;
+ p.pu[LUMA_16x12].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x12_avx;
+ p.pu[LUMA_16x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x16_avx;
+ p.pu[LUMA_16x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x32_avx;
+ p.pu[LUMA_16x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x64_avx;
+ p.pu[LUMA_64x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x16_avx;
+ p.pu[LUMA_64x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x32_avx;
+ p.pu[LUMA_64x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x48_avx;
+ p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x4_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x8_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x12_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x16_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x32_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x16_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x24_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x32_avx;
+ p.frameInitLowres = x265_frame_init_lowres_core_avx;
}
if (cpuMask & X265_CPU_XOP)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
- SA8D_INTER_FROM_BLOCK(xop);
- INIT7(satd, _xop);
- HEVC_SATD(xop);
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop;
+ ALL_LUMA_PU(satd, pixel_satd, xop);
+ ASSIGN_SA8D(xop);
+ LUMA_VAR(xop);
+ p.frameInitLowres = x265_frame_init_lowres_core_xop;
}
if (cpuMask & X265_CPU_AVX2)
{
- p.dct[DCT_4x4] = x265_dct4_avx2;
p.quant = x265_quant_avx2;
p.nquant = x265_nquant_avx2;
- p.dequant_normal = x265_dequant_normal_avx2;
+ p.dequant_normal = x265_dequant_normal_avx2;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
-#if X86_64
- p.dct[DCT_8x8] = x265_dct8_avx2;
- p.dct[DCT_16x16] = x265_dct16_avx2;
- p.dct[DCT_32x32] = x265_dct32_avx2;
- p.idct[IDCT_4x4] = x265_idct4_avx2;
- p.idct[IDCT_8x8] = x265_idct8_avx2;
- p.idct[IDCT_16x16] = x265_idct16_avx2;
- p.idct[IDCT_32x32] = x265_idct32_avx2;
-
- p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
- p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
- p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
- p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
-#endif
- }
- /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
- for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
- {
- p.sse_pp[i] = (pixelcmp_t)p.sse_ss[i];
- p.sse_sp[i] = (pixelcmp_sp_t)p.sse_ss[i];
- }
+ // p.weight_pp = x265_weight_pp_avx2; fails tests
- for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
- {
- p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
- p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
- p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
- }
+ ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
- for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
- {
- p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
- p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
- p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
- }
+#if X86_64
+ ALL_LUMA_TU_S(dct, dct, avx2);
+ ALL_LUMA_TU_S(idct, idct, avx2);
- for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
- {
- p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
- p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
- p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
- }
+ p.cu[BLOCK_8x8].transpose = x265_transpose8_avx2;
+ p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
+ p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
+ p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
+#else
+ p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
+#endif
+ p.pu[LUMA_64x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x16_avx;
+ p.pu[LUMA_64x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x32_avx;
+ p.pu[LUMA_64x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x48_avx;
+ p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
- if (p.intra_pred[0][0] && p.transpose[0])
- {
- p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
- p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
- p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
- p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
+ p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
+ p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
}
+}
#else // if HIGH_BIT_DEPTH
+
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp
+{
if (cpuMask & X265_CPU_SSE2)
{
- INIT8_NAME(sse_pp, ssd, _mmx);
- INIT8(sad, _mmx2);
- INIT8(sad_x3, _mmx2);
- INIT8(sad_x4, _mmx2);
- p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
- p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
- p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
+ /* We do not differentiate CPUs which support MMX and not SSE2. We only check
+ * for SSE2 and then use both MMX and SSE2 functions */
+ AVC_LUMA_PU(sad, mmx2);
+ AVC_LUMA_PU(sad_x3, mmx2);
+ AVC_LUMA_PU(sad_x4, mmx2);
+
+ p.pu[LUMA_16x16].sad = x265_pixel_sad_16x16_sse2;
+ p.pu[LUMA_16x16].sad_x3 = x265_pixel_sad_x3_16x16_sse2;
+ p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_sse2;
+ p.pu[LUMA_16x8].sad = x265_pixel_sad_16x8_sse2;
+ p.pu[LUMA_16x8].sad_x3 = x265_pixel_sad_x3_16x8_sse2;
+ p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_sse2;
+ HEVC_SAD(sse2);
+
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_mmx2;
+ ALL_LUMA_PU(satd, pixel_satd, sse2);
+
+ p.cu[BLOCK_4x4].sse_pp = x265_pixel_ssd_4x4_mmx;
+ p.cu[BLOCK_8x8].sse_pp = x265_pixel_ssd_8x8_mmx;
+ p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_mmx;
- PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
+ PIXEL_AVG(sse2);
+ LUMA_VAR(sse2);
+
+ ASSIGN_SA8D(sse2);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = x265_pixel_ssd_4x8_mmx;
+ ASSIGN_SSE_PP(sse2);
+ ASSIGN_SSE_SS(sse2);
+
+ LUMA_PU_BLOCKCOPY(pp, sse2);
+ CHROMA_420_PU_BLOCKCOPY(pp, sse2);
+ CHROMA_422_PU_BLOCKCOPY(pp, sse2);
+
+ LUMA_CU_BLOCKCOPY(ss, sse2);
+ LUMA_CU_BLOCKCOPY(sp, sse2);
+ CHROMA_420_CU_BLOCKCOPY(ss, sse2);
+ CHROMA_422_CU_BLOCKCOPY(ss, sse2);
+ CHROMA_420_CU_BLOCKCOPY(sp, sse2);
+ CHROMA_422_CU_BLOCKCOPY(sp, sse2);
+
+ LUMA_VSS_FILTERS(sse2);
+ CHROMA_420_VSS_FILTERS(_sse2);
+ CHROMA_422_VSS_FILTERS(_sse2);
+ CHROMA_444_VSS_FILTERS(sse2);
+ CHROMA_420_VSP_FILTERS(_sse2);
+ CHROMA_422_VSP_FILTERS(_sse2);
+ CHROMA_444_VSP_FILTERS(_sse2);
+
+ //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
+ p.frameInitLowres = x265_frame_init_lowres_core_sse2;
+
+ ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
+ ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
+ ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
+ ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+
+ p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
+ p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
+
+ ALL_LUMA_TU_S(transpose, transpose, sse2);
+ p.cu[BLOCK_64x64].transpose = x265_transpose64_sse2;
- LUMA_VAR(_sse2);
-
- ASSGN_SSE(sse2);
- ASSGN_SSE_SS(sse2);
- INIT2(sad, _sse2);
- SAD(sse2);
- INIT2(sad_x3, _sse2);
- INIT2(sad_x4, _sse2);
- HEVC_SATD(sse2);
-
- CHROMA_BLOCKCOPY(ss, _sse2);
- CHROMA_BLOCKCOPY(pp, _sse2);
- CHROMA_BLOCKCOPY_422(ss, _sse2);
- CHROMA_BLOCKCOPY_422(pp, _sse2);
- LUMA_BLOCKCOPY(ss, _sse2);
- LUMA_BLOCKCOPY(pp, _sse2);
- LUMA_BLOCKCOPY(sp, _sse2);
- CHROMA_BLOCKCOPY_SP(_sse2);
- CHROMA_BLOCKCOPY_SP_422(_sse2);
-
- CHROMA_SS_FILTERS_420(_sse2);
- CHROMA_SS_FILTERS_422(_sse2);
- CHROMA_SS_FILTERS_444(_sse2);
- CHROMA_SP_FILTERS_420(_sse2);
- CHROMA_SP_FILTERS_422(_sse2);
- CHROMA_SP_FILTERS_444(_sse2);
- LUMA_SS_FILTERS(_sse2);
-
- // This function pointer initialization is temporary will be removed
- // later with macro definitions. It is used to avoid linker errors
- // until all partitions are coded and commit smaller patches, easier to
- // review.
-
- p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
- p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
- p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
- p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
-
- p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
- p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
- p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
- p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
-
- p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
- SA8D_INTER_FROM_BLOCK(sse2);
-
- p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
- p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
- p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
- p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
- p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
- p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
- p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
- p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
- p.dct[DCT_4x4] = x265_dct4_sse2;
- p.idct[IDCT_4x4] = x265_idct4_sse2;
- p.idct[IDST_4x4] = x265_idst4_sse2;
+
+ p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
+ p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
+#if X86_64
+ p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
+#endif
+ p.idst4x4 = x265_idst4_sse2;
+
p.planecopy_sp = x265_downShift_16_sse2;
- p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
- p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
- p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
- p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3;
- SA8D_INTER_FROM_BLOCK(ssse3);
- p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
- ASSGN_SSE(ssse3);
+ p.pu[LUMA_8x16].sad_x3 = x265_pixel_sad_x3_8x16_ssse3;
+ p.pu[LUMA_8x32].sad_x3 = x265_pixel_sad_x3_8x32_ssse3;
+ p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_ssse3;
+ HEVC_SAD_X3(ssse3);
+
+ p.pu[LUMA_8x4].sad_x4 = x265_pixel_sad_x4_8x4_ssse3;
+ p.pu[LUMA_8x8].sad_x4 = x265_pixel_sad_x4_8x8_ssse3;
+ p.pu[LUMA_8x16].sad_x4 = x265_pixel_sad_x4_8x16_ssse3;
+ p.pu[LUMA_8x32].sad_x4 = x265_pixel_sad_x4_8x32_ssse3;
+ p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_ssse3;
+ HEVC_SAD_X4(ssse3);
+
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_ssse3;
+ ALL_LUMA_PU(satd, pixel_satd, ssse3);
+
+ ASSIGN_SA8D(ssse3);
PIXEL_AVG(ssse3);
PIXEL_AVG_W4(ssse3);
-
INTRA_ANG_SSSE3(ssse3);
- p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
- p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
- SAD_X3(ssse3);
- SAD_X4(ssse3);
- p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
- p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
- p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
- p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3;
- p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_ssse3;
- p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_ssse3;
-
- p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
- p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
-
- p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
+ ASSIGN_SSE_PP(ssse3);
+ p.cu[BLOCK_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = x265_pixel_ssd_4x8_ssse3;
+
p.luma_p2s = x265_luma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
+ p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
+ p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
- p.dct[DST_4x4] = x265_dst4_ssse3;
- p.idct[IDCT_8x8] = x265_idct8_ssse3;
+ p.dst4x4 = x265_dst4_ssse3;
+ p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
p.count_nonzero = x265_count_nonzero_ssse3;
+
+ p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
+ p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
+ p.sign = x265_calSign_sse4;
p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
+ p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
+ p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
+ p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
+ p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
- LUMA_ADDAVG(_sse4);
- CHROMA_ADDAVG(_sse4);
- CHROMA_ADDAVG_422(_sse4);
- p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
- p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
- p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
- p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
- p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
+ LUMA_ADDAVG(sse4);
+ CHROMA_420_ADDAVG(sse4);
+ CHROMA_422_ADDAVG(sse4);
// TODO: check POPCNT flag!
- p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
- p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_sse4;
- p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_sse4;
- p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_sse4;
-
- HEVC_SATD(sse4);
- SA8D_INTER_FROM_BLOCK(sse4);
-
- p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
- p.sse_pp[LUMA_24x32] = x265_pixel_ssd_24x32_sse4;
- p.sse_pp[LUMA_48x64] = x265_pixel_ssd_48x64_sse4;
- p.sse_pp[LUMA_64x16] = x265_pixel_ssd_64x16_sse4;
- p.sse_pp[LUMA_64x32] = x265_pixel_ssd_64x32_sse4;
- p.sse_pp[LUMA_64x48] = x265_pixel_ssd_64x48_sse4;
- p.sse_pp[LUMA_64x64] = x265_pixel_ssd_64x64_sse4;
-
- LUMA_SSE_SP(_sse4);
-
- CHROMA_PIXELSUB_PS(_sse4);
- CHROMA_PIXELSUB_PS_422(_sse4);
- LUMA_PIXELSUB(_sse4);
-
- CHROMA_FILTERS_420(_sse4);
- CHROMA_FILTERS_422(_sse4);
- CHROMA_FILTERS_444(_sse4);
- CHROMA_SS_FILTERS_SSE4_420(_sse4);
- CHROMA_SS_FILTERS_SSE4_422(_sse4);
- CHROMA_SP_FILTERS_SSE4_420(_sse4);
- CHROMA_SP_FILTERS_SSE4_422(_sse4);
- CHROMA_SP_FILTERS_SSE4_444(_sse4);
- LUMA_SP_FILTERS(_sse4);
- LUMA_FILTERS(_sse4);
- ASSGN_SSE_SS(sse4);
-
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
- CHROMA_BLOCKCOPY(ps, _sse4);
- CHROMA_BLOCKCOPY_422(ps, _sse4);
- LUMA_BLOCKCOPY(ps, _sse4);
-
- p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
- p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
+ ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
+
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_sse4;
+ ALL_LUMA_PU(satd, pixel_satd, sse4);
+ ASSIGN_SA8D(sse4);
+ ASSIGN_SSE_SS(sse4);
+ p.cu[BLOCK_64x64].sse_pp = x265_pixel_ssd_64x64_sse4;
+
+ LUMA_PIXELSUB(sse4);
+ CHROMA_420_PIXELSUB_PS(sse4);
+ CHROMA_422_PIXELSUB_PS(sse4);
+
+ LUMA_FILTERS(sse4);
+ CHROMA_420_FILTERS(sse4);
+ CHROMA_422_FILTERS(sse4);
+ CHROMA_444_FILTERS(sse4);
+ CHROMA_420_VSS_FILTERS_SSE4(_sse4);
+ CHROMA_422_VSS_FILTERS_SSE4(_sse4);
+ CHROMA_420_VSP_FILTERS_SSE4(_sse4);
+ CHROMA_422_VSP_FILTERS_SSE4(_sse4);
+ CHROMA_444_VSP_FILTERS_SSE4(_sse4);
+
+ // MUST be done after LUMA_FILTERS() to overwrite default version
+ p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse4;
+
+ LUMA_CU_BLOCKCOPY(ps, sse4);
+ CHROMA_420_CU_BLOCKCOPY(ps, sse4);
+ CHROMA_422_CU_BLOCKCOPY(ps, sse4);
+
+ p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_sse4;
+ p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_sse4;
+ p.cu[BLOCK_8x8].dct = x265_dct8_sse4;
+ p.denoiseDct = x265_denoise_dct_sse4;
p.quant = x265_quant_sse4;
p.nquant = x265_nquant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
+
p.weight_pp = x265_weight_pp_sse4;
p.weight_sp = x265_weight_sp_sse4;
- p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
- p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
- p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
- p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4;
-
- p.intra_pred_allangs[BLOCK_4x4] = x265_all_angs_pred_4x4_sse4;
- p.intra_pred_allangs[BLOCK_8x8] = x265_all_angs_pred_8x8_sse4;
- p.intra_pred_allangs[BLOCK_16x16] = x265_all_angs_pred_16x16_sse4;
- p.intra_pred_allangs[BLOCK_32x32] = x265_all_angs_pred_32x32_sse4;
- p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4;
- p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
- p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
- p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
+ ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
+ ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
+ ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4(sse4);
- p.dct[DCT_8x8] = x265_dct8_sse4;
- p.copy_shr = x265_copy_shr_sse4;
- p.denoiseDct = x265_denoise_dct_sse4;
+ p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
+#if X86_64
+ ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
+ p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_sse4;
+ p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_sse4;
+ p.cu[BLOCK_32x32].psy_cost_ss = x265_psyCost_ss_32x32_sse4;
+ p.cu[BLOCK_64x64].psy_cost_ss = x265_psyCost_ss_64x64_sse4;
+#endif
+ p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
- HEVC_SATD(avx);
- SA8D_INTER_FROM_BLOCK(avx);
- ASSGN_SSE(avx);
-
- ASSGN_SSE_SS(avx);
- SAD_X3(avx);
- SAD_X4(avx);
- p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
- p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
- p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
- p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx;
+ ALL_LUMA_PU(satd, pixel_satd, avx);
+ ASSIGN_SA8D(avx);
+ ASSIGN_SSE_PP(avx);
+ ASSIGN_SSE_SS(avx);
+ LUMA_VAR(avx);
+
+ p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_avx;
+ p.pu[LUMA_16x4].sad_x3 = x265_pixel_sad_x3_16x4_avx;
+ HEVC_SAD_X3(avx);
+
+ p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_avx;
+ p.pu[LUMA_16x4].sad_x4 = x265_pixel_sad_x4_16x4_avx;
+ HEVC_SAD_X4(avx);
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
p.ssim_end_4 = x265_pixel_ssim_end4_avx;
- p.luma_copy_ss[LUMA_64x16] = x265_blockcopy_ss_64x16_avx;
- p.luma_copy_ss[LUMA_64x32] = x265_blockcopy_ss_64x32_avx;
- p.luma_copy_ss[LUMA_64x48] = x265_blockcopy_ss_64x48_avx;
- p.luma_copy_ss[LUMA_64x64] = x265_blockcopy_ss_64x64_avx;
+ p.cu[BLOCK_64x64].copy_ss = x265_blockcopy_ss_64x64_avx;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = x265_blockcopy_pp_32x8_avx;
+ p.pu[LUMA_32x8].copy_pp = x265_blockcopy_pp_32x8_avx;
- p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x8] = x265_blockcopy_pp_32x8_avx;
- p.luma_copy_pp[LUMA_32x8] = x265_blockcopy_pp_32x8_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
+ p.pu[LUMA_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
- p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x16] = x265_blockcopy_pp_32x16_avx;
- p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x16] = x265_blockcopy_pp_32x16_avx;
- p.luma_copy_pp[LUMA_32x16] = x265_blockcopy_pp_32x16_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = x265_blockcopy_pp_32x24_avx;
+ p.pu[LUMA_32x24].copy_pp = x265_blockcopy_pp_32x24_avx;
- p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x24] = x265_blockcopy_pp_32x24_avx;
- p.luma_copy_pp[LUMA_32x24] = x265_blockcopy_pp_32x24_avx;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
+ p.pu[LUMA_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
- p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x32] = x265_blockcopy_pp_32x32_avx;
- p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x32] = x265_blockcopy_pp_32x32_avx;
- p.luma_copy_pp[LUMA_32x32] = x265_blockcopy_pp_32x32_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = x265_blockcopy_pp_32x48_avx;
- p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x48] = x265_blockcopy_pp_32x48_avx;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = x265_blockcopy_pp_32x64_avx;
+ p.pu[LUMA_32x64].copy_pp = x265_blockcopy_pp_32x64_avx;
- p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x64] = x265_blockcopy_pp_32x64_avx;
- p.luma_copy_pp[LUMA_32x64] = x265_blockcopy_pp_32x64_avx;
+ p.frameInitLowres = x265_frame_init_lowres_core_avx;
}
if (cpuMask & X265_CPU_XOP)
{
- p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
- SA8D_INTER_FROM_BLOCK(xop);
- INIT7(satd, _xop);
- INIT5_NAME(sse_pp, ssd, _xop);
- HEVC_SATD(xop);
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop;
+ ALL_LUMA_PU(satd, pixel_satd, xop);
+ ASSIGN_SA8D(xop);
+ LUMA_VAR(xop);
+ p.cu[BLOCK_8x8].sse_pp = x265_pixel_ssd_8x8_xop;
+ p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_xop;
+ p.frameInitLowres = x265_frame_init_lowres_core_xop;
}
if (cpuMask & X265_CPU_AVX2)
{
- INIT2(sad_x4, _avx2);
- INIT4(satd, _avx2);
- INIT2_NAME(sse_pp, ssd, _avx2);
- p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
- p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
- p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
-
- /* Need to update assembly code as per changed interface of the copy_cnt primitive, once
- * code is updated, avx2 version will be enabled */
-
- p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
- p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
- p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
-
- p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
- p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
-
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
+ p.pu[LUMA_16x16].satd = x265_pixel_satd_16x16_avx2;
+ p.pu[LUMA_16x8].satd = x265_pixel_satd_16x8_avx2;
+ p.pu[LUMA_8x16].satd = x265_pixel_satd_8x16_avx2;
+ p.pu[LUMA_8x8].satd = x265_pixel_satd_8x8_avx2;
+
+ p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_sse2;
+ p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_avx2;
+ p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_sse2;
+ p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_avx2;
+
+ p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_avx2;
+ p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
+
+ p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_avx2;
+ p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_avx2;
+ p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_avx2;
+
+ p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_avx2;
+ p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_avx2;
+
+ ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
+
p.denoiseDct = x265_denoise_dct_avx2;
- p.dct[DCT_4x4] = x265_dct4_avx2;
p.quant = x265_quant_avx2;
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x16] = x265_blockcopy_ss_16x16_avx;
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x32] = x265_blockcopy_ss_16x32_avx;
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x8] = x265_blockcopy_ss_16x8_avx;
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x16] = x265_blockcopy_ss_16x16_avx;
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x24] = x265_blockcopy_ss_16x24_avx;
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx;
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx;
+
+ p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
+ p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
p.weight_pp = x265_weight_pp_avx2;
+ p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
#if X86_64
- p.dct[DCT_8x8] = x265_dct8_avx2;
- p.dct[DCT_16x16] = x265_dct16_avx2;
- p.dct[DCT_32x32] = x265_dct32_avx2;
- p.idct[IDCT_4x4] = x265_idct4_avx2;
- p.idct[IDCT_8x8] = x265_idct8_avx2;
- p.idct[IDCT_16x16] = x265_idct16_avx2;
- p.idct[IDCT_32x32] = x265_idct32_avx2;
-
- p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
- p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
- p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
- p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
+ p.cu[BLOCK_8x8].dct = x265_dct8_avx2;
+ p.cu[BLOCK_16x16].dct = x265_dct16_avx2;
+ p.cu[BLOCK_32x32].dct = x265_dct32_avx2;
+
+ p.cu[BLOCK_4x4].idct = x265_idct4_avx2;
+ p.cu[BLOCK_8x8].idct = x265_idct8_avx2;
+ p.cu[BLOCK_16x16].idct = x265_idct16_avx2;
+ p.cu[BLOCK_32x32].idct = x265_idct32_avx2;
+
+ p.cu[BLOCK_8x8].transpose = x265_transpose8_avx2;
+ p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
+ p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
+ p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
+
+ p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2;
+
+ p.pu[LUMA_16x4].luma_vpp = x265_interp_8tap_vert_pp_16x4_avx2;
+ p.pu[LUMA_16x8].luma_vpp = x265_interp_8tap_vert_pp_16x8_avx2;
+ p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2;
+ p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2;
+ p.pu[LUMA_16x32].luma_vpp = x265_interp_8tap_vert_pp_16x32_avx2;
+ p.pu[LUMA_16x64].luma_vpp = x265_interp_8tap_vert_pp_16x64_avx2;
+
+ p.pu[LUMA_24x32].luma_vpp = x265_interp_8tap_vert_pp_24x32_avx2;
+
+ p.pu[LUMA_32x8].luma_vpp = x265_interp_8tap_vert_pp_32x8_avx2;
+ p.pu[LUMA_32x16].luma_vpp = x265_interp_8tap_vert_pp_32x16_avx2;
+ p.pu[LUMA_32x24].luma_vpp = x265_interp_8tap_vert_pp_32x24_avx2;
+ p.pu[LUMA_32x32].luma_vpp = x265_interp_8tap_vert_pp_32x32_avx2;
+ p.pu[LUMA_32x64].luma_vpp = x265_interp_8tap_vert_pp_32x64_avx2;
+
+ p.pu[LUMA_48x64].luma_vpp = x265_interp_8tap_vert_pp_48x64_avx2;
+
+ p.pu[LUMA_64x16].luma_vpp = x265_interp_8tap_vert_pp_64x16_avx2;
+ p.pu[LUMA_64x32].luma_vpp = x265_interp_8tap_vert_pp_64x32_avx2;
+ p.pu[LUMA_64x48].luma_vpp = x265_interp_8tap_vert_pp_64x48_avx2;
+ p.pu[LUMA_64x64].luma_vpp = x265_interp_8tap_vert_pp_64x64_avx2;
+#endif
+ p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2;
+
+ p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2;
+ p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
+ p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
+ p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
+
+ p.pu[LUMA_16x4].luma_hpp = x265_interp_8tap_horiz_pp_16x4_avx2;
+ p.pu[LUMA_16x8].luma_hpp = x265_interp_8tap_horiz_pp_16x8_avx2;
+ p.pu[LUMA_16x12].luma_hpp = x265_interp_8tap_horiz_pp_16x12_avx2;
+ p.pu[LUMA_16x16].luma_hpp = x265_interp_8tap_horiz_pp_16x16_avx2;
+ p.pu[LUMA_16x32].luma_hpp = x265_interp_8tap_horiz_pp_16x32_avx2;
+ p.pu[LUMA_16x64].luma_hpp = x265_interp_8tap_horiz_pp_16x64_avx2;
+
+ p.pu[LUMA_32x8].luma_hpp = x265_interp_8tap_horiz_pp_32x8_avx2;
+ p.pu[LUMA_32x16].luma_hpp = x265_interp_8tap_horiz_pp_32x16_avx2;
+ p.pu[LUMA_32x24].luma_hpp = x265_interp_8tap_horiz_pp_32x24_avx2;
+ p.pu[LUMA_32x32].luma_hpp = x265_interp_8tap_horiz_pp_32x32_avx2;
+ p.pu[LUMA_32x64].luma_hpp = x265_interp_8tap_horiz_pp_32x64_avx2;
+
+ p.pu[LUMA_64x64].luma_hpp = x265_interp_8tap_horiz_pp_64x64_avx2;
+ p.pu[LUMA_64x48].luma_hpp = x265_interp_8tap_horiz_pp_64x48_avx2;
+ p.pu[LUMA_64x32].luma_hpp = x265_interp_8tap_horiz_pp_64x32_avx2;
+ p.pu[LUMA_64x16].luma_hpp = x265_interp_8tap_horiz_pp_64x16_avx2;
+
+ p.pu[LUMA_48x64].luma_hpp = x265_interp_8tap_horiz_pp_48x64_avx2;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = x265_interp_4tap_horiz_pp_8x8_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = x265_interp_4tap_horiz_pp_32x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
+
+ p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_avx2;
+ p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_avx2;
+ p.pu[LUMA_8x4].luma_vpp = x265_interp_8tap_vert_pp_8x4_avx2;
+ p.pu[LUMA_8x8].luma_vpp = x265_interp_8tap_vert_pp_8x8_avx2;
+ p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2;
+ p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2;
+
+ // color space i420
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2;
+
+ // color space i422
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
+
+#if X86_64
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2;
#endif
- p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
}
-#endif // if HIGH_BIT_DEPTH
-}
}
+#endif // if HIGH_BIT_DEPTH
+
+} // namespace x265
extern "C" {
#ifdef __INTEL_COMPILER
diff --git a/source/common/x86/blockcopy8.asm b/source/common/x86/blockcopy8.asm
index e892157..f82ff79 100644
--- a/source/common/x86/blockcopy8.asm
+++ b/source/common/x86/blockcopy8.asm
@@ -41,7 +41,7 @@ cextern pb_128
SECTION .text
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0
@@ -59,7 +59,7 @@ cglobal blockcopy_pp_2x4, 4, 7, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0
@@ -97,7 +97,7 @@ cglobal blockcopy_pp_2x8, 4, 7, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
@@ -115,7 +115,7 @@ cglobal blockcopy_pp_2x16, 4, 7, 0
;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x2, 4, 6, 0
@@ -127,7 +127,7 @@ cglobal blockcopy_pp_4x2, 4, 6, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4
@@ -145,7 +145,7 @@ cglobal blockcopy_pp_4x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
@@ -192,7 +192,7 @@ BLOCKCOPY_PP_W4_H8 4, 16
BLOCKCOPY_PP_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x8, 4, 7, 8
@@ -257,7 +257,7 @@ cglobal blockcopy_pp_6x8, 4, 7, 8
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x16, 4, 7, 2
@@ -279,7 +279,7 @@ cglobal blockcopy_pp_6x16, 4, 7, 2
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x2, 4, 4, 2
@@ -291,7 +291,7 @@ cglobal blockcopy_pp_8x2, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x4, 4, 4, 4
@@ -309,7 +309,7 @@ cglobal blockcopy_pp_8x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x6, 4, 7, 6
@@ -333,7 +333,7 @@ cglobal blockcopy_pp_8x6, 4, 7, 6
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x12, 4, 5, 2
@@ -350,7 +350,7 @@ cglobal blockcopy_pp_8x12, 4, 5, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
INIT_XMM sse2
@@ -397,7 +397,7 @@ BLOCKCOPY_PP_W8_H8 8, 32
BLOCKCOPY_PP_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
@@ -439,7 +439,7 @@ BLOCKCOPY_PP_W12_H4 12, 16
BLOCKCOPY_PP_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
INIT_XMM sse2
@@ -471,7 +471,7 @@ BLOCKCOPY_PP_W16_H4 16, 4
BLOCKCOPY_PP_W16_H4 16, 12
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
INIT_XMM sse2
@@ -519,7 +519,7 @@ BLOCKCOPY_PP_W16_H8 16, 64
BLOCKCOPY_PP_W16_H8 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W24_H4 2
INIT_XMM sse2
@@ -560,7 +560,7 @@ BLOCKCOPY_PP_W24_H4 24, 32
BLOCKCOPY_PP_W24_H4 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H4 2
INIT_XMM sse2
@@ -684,7 +684,7 @@ cglobal blockcopy_pp_32x16, 4, 6, 6
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_pp_32x24, 4, 7, 6
@@ -722,7 +722,7 @@ mov r6d, 24/8
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H16_avx 2
INIT_YMM avx
@@ -788,7 +788,7 @@ BLOCKCOPY_PP_W32_H16_avx 32, 48
BLOCKCOPY_PP_W32_H16_avx 32, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H2 2
INIT_XMM sse2
@@ -836,7 +836,7 @@ cglobal blockcopy_pp_%1x%2, 4, 5, 6
BLOCKCOPY_PP_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
INIT_XMM sse2
@@ -897,7 +897,7 @@ BLOCKCOPY_PP_W64_H4 64, 48
BLOCKCOPY_PP_W64_H4 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x4, 4, 5, 2
@@ -926,7 +926,7 @@ RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x8, 4, 5, 2
@@ -974,11 +974,11 @@ pextrw [r0 + r1], m0, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W2_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
add r3, r3
mov r6d, %2/2
.loop:
@@ -1003,10 +1003,10 @@ BLOCKCOPY_SP_W2_H2 2, 8
BLOCKCOPY_SP_W2_H2 2, 16
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
add r3, r3
@@ -1022,10 +1022,10 @@ movd [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
add r3, r3
@@ -1049,10 +1049,10 @@ movd [r0 + r1], m2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
add r3, r3
@@ -1092,11 +1092,11 @@ movd [r0 + r1], m6
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W4_H8 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/8
@@ -1150,7 +1150,7 @@ BLOCKCOPY_SP_W4_H8 4, 16
BLOCKCOPY_SP_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_6x8, 4, 4, 2
@@ -1213,11 +1213,11 @@ cglobal blockcopy_sp_6x8, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W6_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
add r3, r3
mov r6d, %2/2
.loop:
@@ -1247,10 +1247,10 @@ BLOCKCOPY_SP_W6_H2 6, 8
BLOCKCOPY_SP_W6_H2 6, 16
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
add r3, r3
@@ -1265,10 +1265,10 @@ movhps [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
add r3, r3
@@ -1290,10 +1290,10 @@ movhps [r0 + r1], m2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
add r3, r3
@@ -1322,10 +1322,10 @@ movhps [r0 + r1], m4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
add r3, r3
@@ -1361,11 +1361,11 @@ movhps [r0 + r1], m6
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
add r3, r3
mov r4d, %2/4
.loop:
@@ -1391,11 +1391,11 @@ cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
BLOCKCOPY_SP_W8_H4 8, 12
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/8
@@ -1446,11 +1446,11 @@ BLOCKCOPY_SP_W8_H8 8, 32
BLOCKCOPY_SP_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W12_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/4
@@ -1503,11 +1503,11 @@ BLOCKCOPY_SP_W12_H4 12, 16
BLOCKCOPY_SP_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/4
@@ -1554,11 +1554,11 @@ BLOCKCOPY_SP_W16_H4 16, 64
BLOCKCOPY_SP_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W24_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2/2
@@ -1595,11 +1595,11 @@ BLOCKCOPY_SP_W24_H2 24, 32
BLOCKCOPY_SP_W24_H2 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/2
@@ -1643,11 +1643,11 @@ BLOCKCOPY_SP_W32_H2 32, 64
BLOCKCOPY_SP_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W48_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2
@@ -1681,11 +1681,11 @@ RET
BLOCKCOPY_SP_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H1 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2
@@ -1726,10 +1726,10 @@ BLOCKCOPY_SP_W64_H1 64, 48
BLOCKCOPY_SP_W64_H1 64, 64
;-----------------------------------------------------------------------------
-; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
add r1, r1
@@ -1745,10 +1745,10 @@ movh [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
add r1, r1
@@ -1774,11 +1774,11 @@ movu [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W16_H8 2
INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
mov r3d, %2/8
@@ -1855,11 +1855,11 @@ movu [r0 + r3], m0
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W32_H4 2
INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
mov r3d, %2/4
@@ -1983,10 +1983,10 @@ movu [r0 + r3 + 32], m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2013,10 +2013,10 @@ RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2065,10 +2065,10 @@ RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
add r1, r1
mov r4d, 16/2
.loop:
@@ -2086,10 +2086,10 @@ cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2105,10 +2105,10 @@ RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2135,11 +2135,11 @@ RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W4_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2180,11 +2180,11 @@ BLOCKCOPY_PS_W4_H4 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2227,10 +2227,10 @@ BLOCKCOPY_PS_W6_H4 6, 8
BLOCKCOPY_PS_W6_H4 6, 16
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2245,10 +2245,10 @@ movu [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2274,10 +2274,10 @@ movu [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2314,11 +2314,11 @@ movu [r0 + r1], m0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W8_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2361,11 +2361,11 @@ BLOCKCOPY_PS_W8_H4 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W12_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2398,10 +2398,10 @@ BLOCKCOPY_PS_W12_H2 12, 16
BLOCKCOPY_PS_W12_H2 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
add r1, r1
pxor m0, m0
@@ -2436,11 +2436,11 @@ movu [r0 + r1 + 16], m1
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2492,11 +2492,11 @@ BLOCKCOPY_PS_W16_H4 16, 64
BLOCKCOPY_PS_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W24_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2537,11 +2537,11 @@ BLOCKCOPY_PS_W24_H2 24, 32
BLOCKCOPY_PS_W24_H2 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W32_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2590,11 +2590,11 @@ BLOCKCOPY_PS_W32_H2 32, 64
BLOCKCOPY_PS_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W48_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2649,11 +2649,11 @@ RET
BLOCKCOPY_PS_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W64_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2723,7 +2723,7 @@ BLOCKCOPY_PS_W64_H2 64, 48
BLOCKCOPY_PS_W64_H2 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x4, 4, 6, 0
@@ -2746,7 +2746,7 @@ cglobal blockcopy_ss_2x4, 4, 6, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x8, 4, 6, 0
@@ -2785,7 +2785,7 @@ cglobal blockcopy_ss_2x8, 4, 6, 0
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x16, 4, 7, 0
@@ -2805,7 +2805,7 @@ cglobal blockcopy_ss_2x16, 4, 7, 0
;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x2, 4, 4, 2
@@ -2821,7 +2821,7 @@ cglobal blockcopy_ss_4x2, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x4, 4, 4, 4
@@ -2841,7 +2841,7 @@ cglobal blockcopy_ss_4x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W4_H8 2
INIT_XMM sse2
@@ -2889,7 +2889,7 @@ BLOCKCOPY_SS_W4_H8 4, 16
BLOCKCOPY_SS_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x8, 4, 4, 4
@@ -2944,7 +2944,7 @@ cglobal blockcopy_ss_6x8, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x16, 4, 5, 4
@@ -2968,7 +2968,7 @@ cglobal blockcopy_ss_6x16, 4, 5, 4
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x2, 4, 4, 2
@@ -2984,7 +2984,7 @@ cglobal blockcopy_ss_8x2, 4, 4, 2
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x4, 4, 4, 4
@@ -3005,7 +3005,7 @@ cglobal blockcopy_ss_8x4, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4
@@ -3034,7 +3034,7 @@ cglobal blockcopy_ss_8x6, 4, 4, 4
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2
@@ -3054,7 +3054,7 @@ cglobal blockcopy_ss_8x12, 4, 5, 2
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
@@ -3105,7 +3105,7 @@ BLOCKCOPY_SS_W8_H8 8, 32
BLOCKCOPY_SS_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
@@ -3149,7 +3149,7 @@ BLOCKCOPY_SS_W12_H4 12, 16
BLOCKCOPY_SS_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
@@ -3192,7 +3192,7 @@ BLOCKCOPY_SS_W16_H4 16, 4
BLOCKCOPY_SS_W16_H4 16, 12
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4_avx 2
INIT_YMM avx
@@ -3229,7 +3229,7 @@ BLOCKCOPY_SS_W16_H4_avx 16, 32
BLOCKCOPY_SS_W16_H4_avx 16, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H8 2
INIT_XMM sse2
@@ -3302,7 +3302,7 @@ BLOCKCOPY_SS_W16_H8 16, 64
BLOCKCOPY_SS_W16_H8 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W24_H4 2
INIT_XMM sse2
@@ -3354,7 +3354,7 @@ BLOCKCOPY_SS_W24_H4 24, 32
BLOCKCOPY_SS_W24_H4 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W32_H4 2
INIT_XMM sse2
@@ -3422,7 +3422,7 @@ BLOCKCOPY_SS_W32_H4 32, 64
BLOCKCOPY_SS_W32_H4 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W48_H2 2
INIT_XMM sse2
@@ -3500,11 +3500,11 @@ RET
BLOCKCOPY_SS_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4 2
INIT_XMM sse2
-cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2/4
add r1, r1
add r3, r3
@@ -3606,11 +3606,11 @@ BLOCKCOPY_SS_W64_H4 64, 48
BLOCKCOPY_SS_W64_H4 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4_avx 2
INIT_YMM avx
-cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
mov r4d, %2/4
add r1, r1
add r3, r3
@@ -3669,229 +3669,83 @@ BLOCKCOPY_SS_W64_H4_avx 64, 32
BLOCKCOPY_SS_W64_H4_avx 64, 48
BLOCKCOPY_SS_W64_H4_avx 64, 64
-;-----------------------------------------------------------------------------
-; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride
-%define rnd m2
-%define shift m1
-
- ; make shift
- mov r5d, r3m
- movd shift, r5d
-
- ; make round
- dec r5
- xor r6, r6
- bts r6, r5
-
- movd rnd, r6d
- pshufd rnd, rnd, 0
-
- ; register alloc
- ; r0 - dst
- ; r1 - src
- ; r2 - stride * 2 (short*)
- ; r3 - lx
- ; r4 - size
- ; r5 - ly
- ; r6 - diff
- add r2d, r2d
-
- mov r4d, r4m
- mov r5, r4
- mov r6, r2
- sub r6, r4
- add r6, r6
-
- shr r5, 1
-.loop_row:
-
- mov r3, r4
- shr r3, 2
-.loop_col:
- ; row 0
- movu m0, [r1]
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0], m0
-
- ; row 1
- movu m0, [r1 + r4 * 4]
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0 + r2], m0
-
- ; move col pointer
- add r1, 16
- add r0, 8
-
- dec r3
- jg .loop_col
-
- ; update pointer
- lea r1, [r1 + r4 * 4]
- add r0, r6
-
- ; end of loop_row
- dec r5
- jg .loop_row
-
- RET
-
-
-;--------------------------------------------------------------------------------------
-; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
-;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
-%define shift m1
-
- ; make shift
- mov r5d, r3m
- movd shift, r5d
-
- ; register alloc
- ; r0 - dst
- ; r1 - src
- ; r2 - stride
- ; r3 - shift
- ; r4 - size
-
- sub r2d, r4d
- add r2d, r2d
- mov r5d, r4d
- shr r4d, 2
-.loop_row:
- mov r6d, r4d
-
-.loop_col:
- pmovsxwd m0, [r1]
- pslld m0, shift
- movu [r0], m0
-
- add r1, 8
- add r0, 16
-
- dec r6d
- jnz .loop_col
-
- add r1, r2
- dec r5d
- jnz .loop_row
- RET
-
-
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_4, 3,3,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_4, 3, 4, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; m0 - shift
- ; m1 - dword [offset]
-
- ; Row 0
- pmovsxwd m2, [r1]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 0 * mmsize], m2
-
- ; Row 1
- pmovsxwd m2, [r1 + r2]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 1 * mmsize], m2
+ ; m1 - word [-round]
- ; Row 2
+ ; Row 0-3
+ movh m2, [r1]
+ movhps m2, [r1 + r2]
lea r1, [r1 + r2 * 2]
- pmovsxwd m2, [r1]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 2 * mmsize], m2
-
- ; Row 3
- pmovsxwd m2, [r1 + r2]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 3 * mmsize], m2
+ movh m3, [r1]
+ movhps m3, [r1 + r2]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
RET
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_8, 3,5,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_8, 3, 5, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 8/4
lea r4, [r2 * 3]
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; r4 - stride * 3
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
- ; Row 0
- pmovsxwd m2, [r1]
- pmovsxwd m3, [r1 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
+ ; Row 0-1
+ mova m2, [r1]
+ mova m3, [r1 + r2]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
- ; Row 1
- pmovsxwd m2, [r1 + r2]
- pmovsxwd m3, [r1 + r2 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 2 * mmsize], m2
- movu [r0 + 3 * mmsize], m3
-
- ; Row 2
- pmovsxwd m2, [r1 + r2 * 2]
- pmovsxwd m3, [r1 + r2 * 2 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
-
- ; Row 3
- pmovsxwd m2, [r1 + r4]
- pmovsxwd m3, [r1 + r4 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 6 * mmsize], m2
- movu [r0 + 7 * mmsize], m3
-
- add r0, 8 * mmsize
+ ; Row 2-3
+ mova m2, [r1 + r2 * 2]
+ mova m3, [r1 + r4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
lea r1, [r1 + r2 * 4]
dec r3d
jnz .loop
@@ -3899,62 +3753,47 @@ cglobal cvt16to32_shr_8, 3,5,3
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_16, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_16, 3, 4, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 16/2
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
; Row 0
- pmovsxwd m2, [r1 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + 1 * mmsize/2]
- pmovsxwd m4, [r1 + 2 * mmsize/2]
- pmovsxwd m5, [r1 + 3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
; Row 1
- pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + r2 +1 * mmsize/2]
- pmovsxwd m4, [r1 + r2 +2 * mmsize/2]
- pmovsxwd m5, [r1 + r2 +3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
+ mova m2, [r1 + r2 + 0 * mmsize]
+ mova m3, [r1 + r2 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
lea r1, [r1 + r2 * 2]
dec r3d
jnz .loop
@@ -3962,61 +3801,45 @@ cglobal cvt16to32_shr_16, 3,4,6
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_32, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_32, 3, 4, 6
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 32/1
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
; Row 0
- pmovsxwd m2, [r1 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + 1 * mmsize/2]
- pmovsxwd m4, [r1 + 2 * mmsize/2]
- pmovsxwd m5, [r1 + 3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
-
- pmovsxwd m2, [r1 + 4 * mmsize/2]
- pmovsxwd m3, [r1 + 5 * mmsize/2]
- pmovsxwd m4, [r1 + 6 * mmsize/2]
- pmovsxwd m5, [r1 + 7 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ add r0, 4 * mmsize
add r1, r2
dec r3d
jnz .loop
@@ -4024,172 +3847,150 @@ cglobal cvt16to32_shr_32, 3,4,6
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_4, 3,3,5
+cglobal cpy1Dto2D_shl_4, 3, 3, 3
add r2d, r2d
movd m0, r3m
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
psllw m1, m0
- psllw m3, m0
+ psllw m2, m0
movh [r0], m1
movhps [r0 + r2], m1
- movh [r0 + r2 * 2], m3
+ movh [r0 + r2 * 2], m2
lea r2, [r2 * 3]
- movhps [r0 + r2], m3
+ movhps [r0 + r2], m2
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_4, 3,3,3
+cglobal cpy1Dto2D_shl_4, 3, 3, 2
add r2d, r2d
movd xm0, r3m
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- packssdw m1, m2
+ movu m1, [r1]
psllw m1, xm0
vextracti128 xm0, m1, 1
movq [r0], xm1
- movq [r0 + r2], xm0
+ movhps [r0 + r2], xm1
lea r0, [r0 + r2 * 2]
- movhps [r0], xm1
+ movq [r0], xm0
movhps [r0 + r2], xm0
RET
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_8, 3,5,5
+cglobal cpy1Dto2D_shl_8, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 8/4
- lea r4, [r2 * 3]
+ lea r3, [r2 * 3]
-.loop:
- ; Row 0-1
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 0-3
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0], m1
- movu [r0 + r2], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + r2 * 2], m3
+ mova [r0 + r3], m4
+ lea r0, [r0 + r2 * 4]
- ; Row 2-3
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 4-7
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + r2 * 2], m1
- movu [r0 + r4], m3
-
- add r1, 8 * mmsize
- lea r0, [r0 + r2 * 4]
- dec r3d
- jnz .loop
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + r2 * 2], m3
+ mova [r0 + r3], m4
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_8, 3,4,3
+cglobal cpy1Dto2D_shl_8, 3, 4, 3
add r2d, r2d
movd xm0, r3m
lea r3, [r2 * 3]
- ; Row 0-1
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0], xm1
- vextracti128 [r0 + r2], m1, 1
-
- ; Row 2-3
- movu xm1, [r1 + 2 * mmsize]
- vinserti128 m1, m1, [r1 + 3 * mmsize], 1
- movu xm2, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2 * 2], xm1
- vextracti128 [r0 + r3], m1, 1
-
- add r1, 4 * mmsize
- lea r0, [r0 + r2 * 4]
-
- ; Row 4-5
+ ; Row 0-3
movu m1, [r1 + 0 * mmsize]
movu m2, [r1 + 1 * mmsize]
- packssdw m1, m2
- vpermq m1, m1, 11011000b
psllw m1, xm0
+ psllw m2, xm0
movu [r0], xm1
vextracti128 [r0 + r2], m1, 1
+ movu [r0 + r2 * 2], xm2
+ vextracti128 [r0 + r3], m2, 1
- ; Row 6-7
+ ; Row 4-7
movu m1, [r1 + 2 * mmsize]
movu m2, [r1 + 3 * mmsize]
- packssdw m1, m2
- vpermq m1, m1, 11011000b
+ lea r0, [r0 + r2 * 4]
psllw m1, xm0
- movu [r0 + r2 * 2], xm1
- vextracti128 [r0 + r3], m1, 1
+ psllw m2, xm0
+ movu [r0], xm1
+ vextracti128 [r0 + r2], m1, 1
+ movu [r0 + r2 * 2], xm2
+ vextracti128 [r0 + r3], m2, 1
RET
+
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_16, 3,4,5
+cglobal cpy1Dto2D_shl_16, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 16/2
+ mov r3d, 16/4
.loop:
- ; Row 0
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 0-1
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0], m1
- movu [r0 + mmsize], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + 16], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 + 16], m4
- ; Row 1
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ ; Row 2-3
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 2]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + r2], m1
- movu [r0 + r2 + mmsize], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + 16], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 + 16], m4
add r1, 8 * mmsize
lea r0, [r0 + r2 * 2]
@@ -4199,49 +4000,28 @@ cglobal cvt32to16_shl_16, 3,4,5
INIT_YMM avx2
-cglobal cvt32to16_shl_16, 3,5,3
+cglobal cpy1Dto2D_shl_16, 3, 5, 3
add r2d, r2d
movd xm0, r3m
mov r3d, 16/4
lea r4, [r2 * 3]
.loop:
- ; Row 0
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
+ ; Row 0-1
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
psllw m1, xm0
+ psllw m2, xm0
movu [r0], m1
+ movu [r0 + r2], m2
- ; Row 1
- movu xm1, [r1 + 2 * mmsize]
- vinserti128 m1, m1, [r1 + 3 * mmsize], 1
- movu xm2, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2], m1
-
- add r1, 4 * mmsize
-
- ; Row 2
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2 * 2], m1
-
- ; Row 3
+ ; Row 2-3
movu m1, [r1 + 2 * mmsize]
movu m2, [r1 + 3 * mmsize]
- packssdw m1, m2
psllw m1, xm0
- vpermq m1, m1, 11011000b
- movu [r0 + r4], m1
+ psllw m2, xm0
+ movu [r0 + r2 * 2], m1
+ movu [r0 + r4], m2
add r1, 4 * mmsize
lea r0, [r0 + r2 * 4]
@@ -4251,84 +4031,70 @@ cglobal cvt32to16_shl_16, 3,5,3
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_32, 3,4,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 32/1
+ mov r3d, 32/2
.loop:
; Row 0
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + 0 * mmsize], m1
- movu [r0 + 1 * mmsize], m3
-
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ psllw m4, m0
+ mova [r0 + 0 * mmsize], m1
+ mova [r0 + 1 * mmsize], m2
+ mova [r0 + 2 * mmsize], m3
+ mova [r0 + 3 * mmsize], m4
+
+ ; Row 1
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m3
+ psllw m4, m0
+ mova [r0 + r2 + 0 * mmsize], m1
+ mova [r0 + r2 + 1 * mmsize], m2
+ mova [r0 + r2 + 2 * mmsize], m3
+ mova [r0 + r2 + 3 * mmsize], m4
add r1, 8 * mmsize
- add r0, r2
+ lea r0, [r0 + r2 * 2]
dec r3d
jnz .loop
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_32, 3,4,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
add r2d, r2d
movd xm0, r3m
mov r3d, 32/2
.loop:
- ; Row 0
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- movu xm3, [r1 + 2 * mmsize]
- vinserti128 m3, m3, [r1 + 3 * mmsize], 1
- movu xm4, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, xm0
- psllw m3, xm0
- movu [r0], m1
- movu [r0 + mmsize], m3
-
- add r1, 4 * mmsize
-
- ; Row 1
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ ; Row 0-1
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
movu m3, [r1 + 2 * mmsize]
movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
psllw m1, xm0
+ psllw m2, xm0
psllw m3, xm0
- vpermq m3, m3, 11011000b
- movu [r0 + r2], m1
- movu [r0 + r2 + mmsize], m3
+ psllw m4, xm0
+ movu [r0], m1
+ movu [r0 + mmsize], m2
+ movu [r0 + r2], m3
+ movu [r0 + r2 + mmsize], m4
add r1, 4 * mmsize
lea r0, [r0 + r2 * 2]
@@ -4338,7 +4104,7 @@ cglobal cvt32to16_shl_32, 3,4,5
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_4, 3,3,3
@@ -4377,7 +4143,7 @@ cglobal copy_cnt_4, 3,3,3
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_8, 3,3,6
@@ -4481,7 +4247,7 @@ cglobal copy_cnt_8, 3,4,5
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_16, 3,4,6
@@ -4592,7 +4358,7 @@ cglobal copy_cnt_16, 3, 5, 5
RET
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3,4,6
@@ -4699,227 +4465,470 @@ cglobal copy_cnt_32, 3, 5, 5
movd eax, xm4
RET
-;-----------------------------------------------------------------------------
-; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal copy_shr, 4, 7, 4, dst, src, stride
-%define rnd m2
-%define shift m1
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_4, 4, 4, 4
+ add r2d, r2d
+ movd m0, r3d
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; m0 - shift
- ; make shift
- mov r5d, r3m
- movd shift, r5d
+ ; Row 0-3
+ movh m2, [r1]
+ movhps m2, [r1 + r2]
+ lea r1, [r1 + r2 * 2]
+ movh m3, [r1]
+ movhps m3, [r1 + r2]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
- ; make round
- dec r5
- xor r6, r6
- bts r6, r5
+ RET
- movd rnd, r6d
- pshufd rnd, rnd, 0
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_8, 4, 5, 4
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 8/4
+ lea r4, [r2 * 3]
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride * 2 (short*)
- ; r3 - lx
- ; r4 - size
- ; r5 - ly
- ; r6 - diff
- add r2d, r2d
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; r4 - stride * 3
+ ; m0 - shift
+
+.loop:
+ ; Row 0, 1
+ mova m2, [r1]
+ mova m3, [r1 + r2]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+
+ ; Row 2, 3
+ mova m2, [r1 + r2 * 2]
+ mova m3, [r1 + r4]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
- mov r4d, r4m
- mov r5, r4 ; size
- mov r6, r2 ; stride
- sub r6, r4
- add r6, r6
- shr r5, 1
-.loop_row:
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_16, 4, 4, 4
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 16/2
- mov r3, r4
- shr r3, 2
-.loop_col:
- ; row 0
- movh m3, [r1]
- pmovsxwd m0, m3
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0], m0
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; m0 - shift
- ; row 1
- movh m3, [r1 + r4 * 2]
- pmovsxwd m0, m3
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0 + r2], m0
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
- ; move col pointer
- add r1, 8
- add r0, 8
+ ; Row 1
+ mova m2, [r1 + r2 + 0 * mmsize]
+ mova m3, [r1 + r2 + 1 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
- dec r3
- jg .loop_col
- ; update pointer
- lea r1, [r1 + r4 * 2]
- add r0, r6
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_32, 4, 4, 6
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 32/1
- ; end of loop_row
- dec r5
- jg .loop_row
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; m0 - shift
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ psllw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ add r0, 4 * mmsize
+ add r1, r2
+ dec r3d
+ jnz .loop
RET
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_4, 3,3,3
+cglobal cpy1Dto2D_shr_4, 3, 3, 4
add r2d, r2d
movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- psllw m1, m0
- psllw m2, m0
- movh [r0], m1
- movhps [r0 + r2], m1
- movh [r0 + r2 * 2], m2
- lea r2, [r2 * 3]
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ movh [r0], m2
movhps [r0 + r2], m2
+ movh [r0 + r2 * 2], m3
+ lea r2, [r2 * 3]
+ movhps [r0 + r2], m3
RET
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_4, 3, 3, 3
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+
+ ; Row 0-3
+ movu m2, [r1]
+ psubw m2, m1
+ psraw m2, xm0
+ vextracti128 xm1, m2, 1
+ movq [r0], xm2
+ movhps [r0 + r2], xm2
+ lea r0, [r0 + r2 * 2]
+ movq [r0], xm1
+ movhps [r0 + r2], xm1
+ RET
+
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_8, 3,4,5
+cglobal cpy1Dto2D_shr_8, 3, 4, 6
add r2d, r2d
movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ lea r3, [r2 * 3]
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_8, 3, 4, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ lea r3, [r2 * 3]
+
+ ; Row 0-3
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
+
+ ; Row 4-7
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
RET
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_16, 3,4,5
+cglobal cpy1Dto2D_shr_16, 3, 5, 6
add r2d, r2d
movd m0, r3m
- mov r3d, 256/64
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
.loop:
- ; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + 16], m2
- movu [r0 + r2], m3
- movu [r0 + r2 + 16], m4
+ ; Row 0-1
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + mmsize], m3
+ mova [r0 + r2], m4
+ mova [r0 + r2 + mmsize], m5
- ; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- movu [r0 + r2 * 2 + 16], m2
- lea r0, [r0 + r2 * 2]
- movu [r0 + r2], m3
- movu [r0 + r2 + 16], m4
+ ; Row 2-3
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + r2 * 2], m2
+ mova [r0 + r2 * 2 + mmsize], m3
+ mova [r0 + r4], m4
+ mova [r0 + r4 + mmsize], m5
add r1, 8 * mmsize
- lea r0, [r0 + r2 * 2]
+ lea r0, [r0 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
+
+.loop:
+ ; Row 0-1
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], m2
+ movu [r0 + r2], m3
+
+ ; Row 2-3
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0 + r2 * 2], m2
+ movu [r0 + r4], m3
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
dec r3d
jnz .loop
RET
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_32, 3,4,5
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
add r2d, r2d
movd m0, r3m
- mov r3d, 1024/64
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ mov r3d, 32/2
.loop:
- ; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + 16], m2
- movu [r0 + 32], m3
- movu [r0 + 48], m4
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
- ; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2], m1
- movu [r0 + r2 + 16], m2
- movu [r0 + r2 + 32], m3
- movu [r0 + r2 + 48], m4
+ ; Row 1
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + r2 + 0 * mmsize], m2
+ mova [r0 + r2 + 1 * mmsize], m3
+ mova [r0 + r2 + 2 * mmsize], m4
+ mova [r0 + r2 + 3 * mmsize], m5
add r1, 8 * mmsize
lea r0, [r0 + r2 * 2]
dec r3d
jnz .loop
RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ mov r3d, 32/2
+
+.loop:
+ ; Row 0-1
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ movu m4, [r1 + 2 * mmsize]
+ movu m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ psraw m4, xm0
+ psraw m5, xm0
+ movu [r0], m2
+ movu [r0 + mmsize], m3
+ movu [r0 + r2], m4
+ movu [r0 + r2 + mmsize], m5
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
diff --git a/source/common/x86/blockcopy8.h b/source/common/x86/blockcopy8.h
index 115e340..9fbbeea 100644
--- a/source/common/x86/blockcopy8.h
+++ b/source/common/x86/blockcopy8.h
@@ -24,48 +24,53 @@
#ifndef X265_BLOCKCOPY8_H
#define X265_BLOCKCOPY8_H
-void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
-void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_32_sse2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_4_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int);
-void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int);
-void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-void x265_copy_shl_32_sse2(int16_t * dst, int16_t *src, intptr_t, int);
-uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t);
-uint32_t x265_copy_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t);
+void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
- void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
- void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); \
- void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
+ void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \
+ void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#define SETUP_BLOCKCOPY_PS(W, H, cpu) \
- void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride);
+ void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
#define SETUP_BLOCKCOPY_SP(W, H, cpu) \
- void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+ void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
- void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
- void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
+ void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#define BLOCKCOPY_COMMON(cpu) \
SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
@@ -178,31 +183,31 @@ BLOCKCOPY_PS(_sse4);
BLOCKCOPY_SP(_sse2);
-void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockcopy_ss_16x4_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x8_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x12_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x24_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_16x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x48_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-void x265_blockcopy_ss_64x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride);
-
-void x265_blockcopy_pp_32x8_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x16_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x24_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x32_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
-
-void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+
+void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x24_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val);
#undef BLOCKCOPY_COMMON
#undef BLOCKCOPY_SS_PP
diff --git a/source/common/x86/const-a.asm b/source/common/x86/const-a.asm
index 17c3335..02cee0f 100644
--- a/source/common/x86/const-a.asm
+++ b/source/common/x86/const-a.asm
@@ -50,6 +50,7 @@ const pb_unpackwq1, db 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3
const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
const pw_swap, times 2 db 6,7,4,5,2,3,0,1
+const pb_2, times 16 db 2
const pb_4, times 16 db 4
const pb_16, times 16 db 16
const pb_64, times 16 db 64
@@ -62,6 +63,7 @@ const pb_32, times 16 db 32
const pb_128, times 16 db 128
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
const pw_2, times 8 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
diff --git a/source/common/x86/dct8.asm b/source/common/x86/dct8.asm
index 5323a42..03161c7 100644
--- a/source/common/x86/dct8.asm
+++ b/source/common/x86/dct8.asm
@@ -245,7 +245,7 @@ avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64
avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
-const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
@@ -318,7 +318,7 @@ cextern pd_2048
cextern pw_ppppmmmm
;------------------------------------------------------
-;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
+;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM sse2
cglobal dct4, 3, 4, 8
@@ -384,28 +384,28 @@ cglobal dct4, 3, 4, 8
paddd m1, m3
paddd m1, m7
psrad m1, 8
- movu [r1 + 0 * 16], m1
- pmaddwd m1, m2, m5
+ pmaddwd m4, m2, m5
pmaddwd m3, m0, m5
- psubd m1, m3
- paddd m1, m7
- psrad m1, 8
- movu [r1 + 1 * 16], m1
+ psubd m4, m3
+ paddd m4, m7
+ psrad m4, 8
+ packssdw m1, m4
+ movu [r1 + 0 * 16], m1
pmaddwd m1, m2, m6
pmaddwd m3, m0, m6
paddd m1, m3
paddd m1, m7
psrad m1, 8
- movu [r1 + 2 * 16], m1
pmaddwd m2, [r3 + 3 * 16]
pmaddwd m0, [r3 + 3 * 16]
psubd m2, m0
paddd m2, m7
psrad m2, 8
- movu [r1 + 3 * 16], m2
+ packssdw m1, m2
+ movu [r1 + 1 * 16], m1
RET
; DCT 4x4
@@ -470,14 +470,12 @@ cglobal dct4, 3, 4, 8, src, dst, srcStride
paddd m2, m7
psrad m2, 8
- movu [r1], xm3
- movu [r1 + mmsize/2], m2
- vextracti128 [r1 + mmsize], m3, 1
- vextracti128 [r1 + mmsize + mmsize/2], m2, 1
+ packssdw m3, m2
+ movu [r1], m3
RET
;-------------------------------------------------------
-;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal idct4, 3, 4, 7
@@ -497,11 +495,6 @@ cglobal idct4, 3, 4, 7
movu m0, [r0 + 0 * 16]
movu m1, [r0 + 1 * 16]
- packssdw m0, m1
-
- movu m1, [r0 + 2 * 16]
- movu m2, [r0 + 3 * 16]
- packssdw m1, m2
punpcklwd m2, m0, m1
pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
@@ -572,7 +565,7 @@ cglobal idct4, 3, 4, 7
RET
;------------------------------------------------------
-;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
@@ -638,33 +631,33 @@ cglobal dst4, 3, 4, 8
phaddd m0, m1
paddd m0, m5
psrad m0, 8
- movu [r1 + 0 * 16], m0
- pmaddwd m0, m2, coef1
+ pmaddwd m4, m2, coef1
pmaddwd m1, m3, coef1
- phaddd m0, m1
- paddd m0, m5
- psrad m0, 8
- movu [r1 + 1 * 16], m0
+ phaddd m4, m1
+ paddd m4, m5
+ psrad m4, 8
+ packssdw m0, m4
+ movu [r1 + 0 * 16], m0
pmaddwd m0, m2, coef2
pmaddwd m1, m3, coef2
phaddd m0, m1
paddd m0, m5
psrad m0, 8
- movu [r1 + 2 * 16], m0
pmaddwd m2, coef3
pmaddwd m3, coef3
phaddd m2, m3
paddd m2, m5
psrad m2, 8
- movu [r1 + 3 * 16], m2
+ packssdw m0, m2
+ movu [r1 + 1 * 16], m0
RET
;-------------------------------------------------------
-;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
+;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal idst4, 3, 4, 7
@@ -683,11 +676,6 @@ cglobal idst4, 3, 4, 7
movu m0, [r0 + 0 * 16]
movu m1, [r0 + 1 * 16]
- packssdw m0, m1
-
- movu m1, [r0 + 2 * 16]
- movu m2, [r0 + 3 * 16]
- packssdw m1, m2
punpcklwd m2, m0, m1 ; m2 = m128iAC
punpckhwd m0, m1 ; m0 = m128iBD
@@ -762,7 +750,7 @@ cglobal idst4, 3, 4, 7
;-------------------------------------------------------
-; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;-------------------------------------------------------
INIT_XMM sse4
cglobal dct8, 3,6,7,0-16*mmsize
@@ -935,10 +923,16 @@ cglobal dct8, 3,6,7,0-16*mmsize
phsubd m4, m2 ; m4 = [Row6 Row4]
paddd m4, m6
psrad m4, 9
- movh [r1 + 0*2*mmsize], m3
- movhps [r1 + 2*2*mmsize], m3
- movh [r1 + 4*2*mmsize], m4
- movhps [r1 + 6*2*mmsize], m4
+
+ packssdw m3, m3
+ movd [r1 + 0*mmsize], m3
+ pshufd m3, m3, 1
+ movd [r1 + 2*mmsize], m3
+
+ packssdw m4, m4
+ movd [r1 + 4*mmsize], m4
+ pshufd m4, m4, 1
+ movd [r1 + 6*mmsize], m4
; odd
pmulld m2, m0, [r4 + 2*16]
@@ -950,8 +944,11 @@ cglobal dct8, 3,6,7,0-16*mmsize
phaddd m2, m4 ; m2 = [Row3 Row1]
paddd m2, m6
psrad m2, 9
- movh [r1 + 1*2*mmsize], m2
- movhps [r1 + 3*2*mmsize], m2
+
+ packssdw m2, m2
+ movd [r1 + 1*mmsize], m2
+ pshufd m2, m2, 1
+ movd [r1 + 3*mmsize], m2
pmulld m2, m0, [r4 + 4*16]
pmulld m3, m1, [r4 + 4*16]
@@ -962,10 +959,13 @@ cglobal dct8, 3,6,7,0-16*mmsize
phaddd m2, m4 ; m2 = [Row7 Row5]
paddd m2, m6
psrad m2, 9
- movh [r1 + 5*2*mmsize], m2
- movhps [r1 + 7*2*mmsize], m2
- add r1, mmsize/2
+ packssdw m2, m2
+ movd [r1 + 5*mmsize], m2
+ pshufd m2, m2, 1
+ movd [r1 + 7*mmsize], m2
+
+ add r1, mmsize/4
add r0, 2*2*mmsize
%endrep
@@ -974,17 +974,392 @@ cglobal dct8, 3,6,7,0-16*mmsize
RET
;-------------------------------------------------------
-; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
+;-------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse2
+%if BIT_DEPTH == 10
+ %define IDCT_SHIFT 10
+ %define IDCT_ADD pd_512
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+ %define IDCT_ADD pd_2048
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+cglobal idct8, 3, 6, 16, 0-5*mmsize
+ mova m9, [r0 + 1 * mmsize]
+ mova m1, [r0 + 3 * mmsize]
+ mova m7, m9
+ punpcklwd m7, m1
+ punpckhwd m9, m1
+ mova m14, [tab_idct8_3]
+ mova m3, m14
+ pmaddwd m14, m7
+ pmaddwd m3, m9
+ mova m0, [r0 + 5 * mmsize]
+ mova m10, [r0 + 7 * mmsize]
+ mova m2, m0
+ punpcklwd m2, m10
+ punpckhwd m0, m10
+ mova m15, [tab_idct8_3 + 1 * mmsize]
+ mova m11, [tab_idct8_3 + 1 * mmsize]
+ pmaddwd m15, m2
+ mova m4, [tab_idct8_3 + 2 * mmsize]
+ pmaddwd m11, m0
+ mova m1, [tab_idct8_3 + 2 * mmsize]
+ paddd m15, m14
+ mova m5, [tab_idct8_3 + 4 * mmsize]
+ mova m12, [tab_idct8_3 + 4 * mmsize]
+ paddd m11, m3
+ mova [rsp + 0 * mmsize], m11
+ mova [rsp + 1 * mmsize], m15
+ pmaddwd m4, m7
+ pmaddwd m1, m9
+ mova m14, [tab_idct8_3 + 3 * mmsize]
+ mova m3, [tab_idct8_3 + 3 * mmsize]
+ pmaddwd m14, m2
+ pmaddwd m3, m0
+ paddd m14, m4
+ paddd m3, m1
+ mova [rsp + 2 * mmsize], m3
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
+ mova m6, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m12, m7
+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
+ mova m4, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m6, m2
+ paddd m6, m12
+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
+ paddd m7, m2
+ mova [rsp + 3 * mmsize], m6
+ pmaddwd m4, m0
+ pmaddwd m0, [tab_idct8_3 + 7 * mmsize]
+ paddd m9, m0
+ paddd m5, m4
+ mova m6, [r0 + 0 * mmsize]
+ mova m0, [r0 + 4 * mmsize]
+ mova m4, m6
+ punpcklwd m4, m0
+ punpckhwd m6, m0
+ mova m12, [r0 + 2 * mmsize]
+ mova m0, [r0 + 6 * mmsize]
+ mova m13, m12
+ mova m8, [tab_dct4]
+ punpcklwd m13, m0
+ mova m10, [tab_dct4]
+ punpckhwd m12, m0
+ pmaddwd m8, m4
+ mova m3, m8
+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
+ pmaddwd m10, m6
+ mova m2, [tab_dct4 + 1 * mmsize]
+ mova m1, m10
+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
+ mova m0, [tab_dct4 + 1 * mmsize]
+ pmaddwd m2, m13
+ paddd m3, m2
+ psubd m8, m2
+ mova m2, m6
+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
+ pmaddwd m0, m12
+ paddd m1, m0
+ psubd m10, m0
+ mova m0, m4
+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
+ paddd m3, [pd_64]
+ paddd m1, [pd_64]
+ paddd m8, [pd_64]
+ paddd m10, [pd_64]
+ paddd m0, m13
+ paddd m2, m12
+ paddd m0, [pd_64]
+ paddd m2, [pd_64]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [pd_64]
+ paddd m6, [pd_64]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, 7
+ paddd m15, m3
+ psubd m3, [rsp + 1 * mmsize]
+ psrad m15, 7
+ paddd m12, m7
+ psrad m12, 7
+ paddd m11, m1
+ mova m13, m14
+ psrad m11, 7
+ packssdw m15, m11
+ psubd m1, [rsp + 0 * mmsize]
+ psrad m1, 7
+ mova m11, [rsp + 2 * mmsize]
+ paddd m14, m0
+ psrad m14, 7
+ psubd m0, m13
+ psrad m0, 7
+ paddd m11, m2
+ mova m13, [rsp + 3 * mmsize]
+ psrad m11, 7
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, 7
+ psrad m6, 7
+ paddd m11, m5
+ psrad m11, 7
+ packssdw m13, m11
+ mova m11, m10
+ psubd m4, [rsp + 3 * mmsize]
+ psubd m10, m9
+ psrad m4, 7
+ psrad m10, 7
+ packssdw m4, m6
+ packssdw m8, m10
+ paddd m11, m9
+ psrad m11, 7
+ packssdw m12, m11
+ psubd m2, [rsp + 2 * mmsize]
+ mova m5, m15
+ psrad m2, 7
+ packssdw m0, m2
+ mova m2, m14
+ psrad m3, 7
+ packssdw m3, m1
+ mova m6, m13
+ punpcklwd m5, m8
+ punpcklwd m2, m4
+ mova m1, m12
+ punpcklwd m6, m0
+ punpcklwd m1, m3
+ mova m9, m5
+ punpckhwd m13, m0
+ mova m0, m2
+ punpcklwd m9, m6
+ punpckhwd m5, m6
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ punpckhwd m15, m8
+ mova m1, m5
+ punpckhwd m14, m4
+ punpckhwd m12, m3
+ mova m6, m9
+ punpckhwd m9, m0
+ punpcklwd m1, m2
+ mova m4, [tab_idct8_3 + 0 * mmsize]
+ punpckhwd m5, m2
+ punpcklwd m6, m0
+ mova m2, m15
+ mova m0, m14
+ mova m7, m9
+ punpcklwd m2, m13
+ punpcklwd m0, m12
+ punpcklwd m7, m5
+ punpckhwd m14, m12
+ mova m10, m2
+ punpckhwd m15, m13
+ punpckhwd m9, m5
+ pmaddwd m4, m7
+ mova m13, m1
+ punpckhwd m2, m0
+ punpcklwd m10, m0
+ mova m0, m15
+ punpckhwd m15, m14
+ mova m12, m1
+ mova m3, [tab_idct8_3 + 0 * mmsize]
+ punpcklwd m0, m14
+ pmaddwd m3, m9
+ mova m11, m2
+ punpckhwd m2, m15
+ punpcklwd m11, m15
+ mova m8, [tab_idct8_3 + 1 * mmsize]
+ punpcklwd m13, m0
+ punpckhwd m12, m0
+ pmaddwd m8, m11
+ paddd m8, m4
+ mova [rsp + 4 * mmsize], m8
+ mova m4, [tab_idct8_3 + 2 * mmsize]
+ pmaddwd m4, m7
+ mova m15, [tab_idct8_3 + 2 * mmsize]
+ mova m5, [tab_idct8_3 + 1 * mmsize]
+ pmaddwd m15, m9
+ pmaddwd m5, m2
+ paddd m5, m3
+ mova [rsp + 3 * mmsize], m5
+ mova m14, [tab_idct8_3 + 3 * mmsize]
+ mova m5, [tab_idct8_3 + 3 * mmsize]
+ pmaddwd m14, m11
+ paddd m14, m4
+ mova [rsp + 2 * mmsize], m14
+ pmaddwd m5, m2
+ paddd m5, m15
+ mova [rsp + 1 * mmsize], m5
+ mova m15, [tab_idct8_3 + 4 * mmsize]
+ mova m5, [tab_idct8_3 + 4 * mmsize]
+ pmaddwd m15, m7
+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
+ mova m4, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m4, m2
+ paddd m5, m4
+ mova m4, m6
+ mova m8, [tab_idct8_3 + 5 * mmsize]
+ punpckhwd m6, m10
+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
+ punpcklwd m4, m10
+ paddd m9, m2
+ pmaddwd m8, m11
+ mova m10, [tab_dct4]
+ paddd m8, m15
+ pmaddwd m11, [tab_idct8_3 + 7 * mmsize]
+ paddd m7, m11
+ mova [rsp + 0 * mmsize], m8
+ pmaddwd m10, m6
+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
+ mova m1, m10
+ mova m8, [tab_dct4]
+ mova m3, [tab_dct4 + 1 * mmsize]
+ pmaddwd m8, m4
+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
+ mova m0, m8
+ mova m2, [tab_dct4 + 1 * mmsize]
+ pmaddwd m3, m13
+ psubd m8, m3
+ paddd m0, m3
+ mova m3, m6
+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
+ pmaddwd m2, m12
+ paddd m1, m2
+ psubd m10, m2
+ mova m2, m4
+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
+ paddd m0, [IDCT_ADD]
+ paddd m1, [IDCT_ADD]
+ paddd m8, [IDCT_ADD]
+ paddd m10, [IDCT_ADD]
+ paddd m2, m13
+ paddd m3, m12
+ paddd m2, [IDCT_ADD]
+ paddd m3, [IDCT_ADD]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [IDCT_ADD]
+ paddd m6, [IDCT_ADD]
+ mova m15, [rsp + 4 * mmsize]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, IDCT_SHIFT
+ mova m11, [rsp + 3 * mmsize]
+ paddd m15, m0
+ psrad m15, IDCT_SHIFT
+ psubd m0, [rsp + 4 * mmsize]
+ psrad m0, IDCT_SHIFT
+ paddd m12, m7
+ paddd m11, m1
+ mova m14, [rsp + 2 * mmsize]
+ psrad m11, IDCT_SHIFT
+ packssdw m15, m11
+ psubd m1, [rsp + 3 * mmsize]
+ psrad m1, IDCT_SHIFT
+ mova m11, [rsp + 1 * mmsize]
+ paddd m14, m2
+ psrad m14, IDCT_SHIFT
+ packssdw m0, m1
+ psrad m12, IDCT_SHIFT
+ psubd m2, [rsp + 2 * mmsize]
+ paddd m11, m3
+ mova m13, [rsp + 0 * mmsize]
+ psrad m11, IDCT_SHIFT
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, IDCT_SHIFT
+ mova m1, m15
+ paddd m11, m5
+ psrad m11, IDCT_SHIFT
+ packssdw m13, m11
+ mova m11, m10
+ psubd m10, m9
+ psrad m10, IDCT_SHIFT
+ packssdw m8, m10
+ psrad m6, IDCT_SHIFT
+ psubd m4, [rsp + 0 * mmsize]
+ paddd m11, m9
+ psrad m11, IDCT_SHIFT
+ packssdw m12, m11
+ punpcklwd m1, m14
+ mova m5, m13
+ psrad m4, IDCT_SHIFT
+ packssdw m4, m6
+ psubd m3, [rsp + 1 * mmsize]
+ psrad m2, IDCT_SHIFT
+ mova m6, m8
+ psrad m3, IDCT_SHIFT
+ punpcklwd m5, m12
+ packssdw m2, m3
+ punpcklwd m6, m4
+ punpckhwd m8, m4
+ mova m4, m1
+ mova m3, m2
+ punpckhdq m1, m5
+ punpckldq m4, m5
+ punpcklwd m3, m0
+ punpckhwd m2, m0
+ mova m0, m6
+ lea r2, [r2 + r2]
+ lea r4, [r2 + r2]
+ lea r3, [r4 + r2]
+ lea r4, [r4 + r3]
+ lea r0, [r4 + r2 * 2]
+ movq [r1], m4
+ punpckhwd m15, m14
+ movhps [r1 + r2], m4
+ punpckhdq m0, m3
+ movq [r1 + r2 * 2], m1
+ punpckhwd m13, m12
+ movhps [r1 + r3], m1
+ mova m1, m6
+ punpckldq m1, m3
+ movq [r1 + 8], m1
+ movhps [r1 + r2 + 8], m1
+ movq [r1 + r2 * 2 + 8], m0
+ movhps [r1 + r3 + 8], m0
+ mova m0, m15
+ punpckhdq m15, m13
+ punpckldq m0, m13
+ movq [r1 + r2 * 4], m0
+ movhps [r1 + r4], m0
+ mova m0, m8
+ punpckhdq m8, m2
+ movq [r1 + r3 * 2], m15
+ punpckldq m0, m2
+ movhps [r1 + r0], m15
+ movq [r1 + r2 * 4 + 8], m0
+ movhps [r1 + r4 + 8], m0
+ movq [r1 + r3 * 2 + 8], m8
+ movhps [r1 + r0 + 8], m8
+ RET
+
+%undef IDCT_SHIFT
+%undef IDCT_ADD
+%endif
+
+;-------------------------------------------------------
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM ssse3
cglobal patial_butterfly_inverse_internal_pass1
- movu m0, [r0]
- movu m1, [r0 + 4 * 32]
- movu m2, [r0 + 2 * 32]
- movu m3, [r0 + 6 * 32]
- packssdw m0, m2
- packssdw m1, m3
+ movh m0, [r0]
+ movhps m0, [r0 + 2 * 16]
+ movh m1, [r0 + 4 * 16]
+ movhps m1, [r0 + 6 * 16]
+
punpckhwd m2, m0, m1 ; [2 6]
punpcklwd m0, m1 ; [0 4]
pmaddwd m1, m0, [r6] ; EE[0]
@@ -1004,12 +1379,10 @@ cglobal patial_butterfly_inverse_internal_pass1
paddd m3, m5
paddd m4, m5
- movu m2, [r0 + 32]
- movu m5, [r0 + 5 * 32]
- packssdw m2, m5
- movu m5, [r0 + 3 * 32]
- movu m6, [r0 + 7 * 32]
- packssdw m5, m6
+ movh m2, [r0 + 16]
+ movhps m2, [r0 + 5 * 16]
+ movh m5, [r0 + 3 * 16]
+ movhps m5, [r0 + 7 * 16]
punpcklwd m6, m2, m5 ;[1 3]
punpckhwd m2, m5 ;[5 7]
@@ -1136,7 +1509,7 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize
call patial_butterfly_inverse_internal_pass1
- add r0, 16
+ add r0, 8
add r5, 8
call patial_butterfly_inverse_internal_pass1
@@ -1167,27 +1540,35 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize
;-----------------------------------------------------------------------------
-; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
+; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal denoise_dct, 4, 4, 6
pxor m5, m5
- shr r3d, 2
+ shr r3d, 3
.loop:
mova m0, [r0]
- pabsd m1, m0
+ pabsw m1, m0
+
mova m2, [r1]
- paddd m2, m1
+ pmovsxwd m3, m1
+ paddd m2, m3
mova [r1], m2
- pmovzxwd m3, [r2]
- psubd m1, m3
- pcmpgtd m4, m1, m5
+ mova m2, [r1 + 16]
+ psrldq m3, m1, 8
+ pmovsxwd m4, m3
+ paddd m2, m4
+ mova [r1 + 16], m2
+
+ movu m3, [r2]
+ psubusw m1, m3
+ pcmpgtw m4, m1, m5
pand m1, m4
- psignd m1, m0
+ psignw m1, m0
mova [r0], m1
add r0, 16
- add r1, 16
- add r2, 8
+ add r1, 32
+ add r2, 16
dec r3d
jnz .loop
RET
@@ -1195,25 +1576,32 @@ cglobal denoise_dct, 4, 4, 6
INIT_YMM avx2
cglobal denoise_dct, 4, 4, 6
pxor m5, m5
- shr r3d, 3
+ shr r3d, 4
.loop:
movu m0, [r0]
- pabsd m1, m0
+ pabsw m1, m0
movu m2, [r1]
- paddd m2, m1
+ pmovsxwd m4, xm1
+ paddd m2, m4
movu [r1], m2
- pmovzxwd m3, [r2]
- psubd m1, m3
- pcmpgtd m4, m1, m5
+ vextracti128 xm4, m1, 1
+ movu m2, [r1 + 32]
+ pmovsxwd m3, xm4
+ paddd m2, m3
+ movu [r1 + 32], m2
+ movu m3, [r2]
+ psubusw m1, m3
+ pcmpgtw m4, m1, m5
pand m1, m4
- psignd m1, m0
+ psignw m1, m0
movu [r0], m1
add r0, 32
- add r1, 32
- add r2, 16
+ add r1, 64
+ add r2, 32
dec r3d
jnz .loop
RET
+
%if ARCH_X86_64 == 1
%macro DCT8_PASS_1 4
vpbroadcastq m0, [r6 + %1]
@@ -1227,7 +1615,7 @@ cglobal denoise_dct, 4, 4, 6
mova [r5 + %2], xm2
%endmacro
-%macro DCT8_PASS_2 1
+%macro DCT8_PASS_2 2
vbroadcasti128 m4, [r6 + %1]
pmaddwd m6, m0, m4
pmaddwd m7, m1, m4
@@ -1238,10 +1626,25 @@ cglobal denoise_dct, 4, 4, 6
phaddd m6, m8
paddd m6, m5
psrad m6, DCT_SHIFT2
+
+ vbroadcasti128 m4, [r6 + %2]
+ pmaddwd m10, m0, m4
+ pmaddwd m7, m1, m4
+ pmaddwd m8, m2, m4
+ pmaddwd m9, m3, m4
+ phaddd m10, m7
+ phaddd m8, m9
+ phaddd m10, m8
+ paddd m10, m5
+ psrad m10, DCT_SHIFT2
+
+ packssdw m6, m10
+ vpermq m10, m6, 0xD8
+
%endmacro
INIT_YMM avx2
-cglobal dct8, 3, 7, 10, 0-8*16
+cglobal dct8, 3, 7, 11, 0-8*16
%if BIT_DEPTH == 10
%define DCT_SHIFT 4
vbroadcasti128 m5, [pd_8]
@@ -1294,9 +1697,6 @@ cglobal dct8, 3, 7, 10, 0-8*16
DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
;pass2
- mov r2d, 32
- lea r3, [r2 * 3]
- lea r4, [r1 + r2 * 4]
vbroadcasti128 m5, [pd_256]
mova m0, [r5]
@@ -1304,22 +1704,14 @@ cglobal dct8, 3, 7, 10, 0-8*16
mova m2, [r5 + 64]
mova m3, [r5 + 96]
- DCT8_PASS_2 0 * 16
- movu [r1], m6
- DCT8_PASS_2 1 * 16
- movu [r1 + r2], m6
- DCT8_PASS_2 2 * 16
- movu [r1 + r2 * 2], m6
- DCT8_PASS_2 3 * 16
- movu [r1 + r3], m6
- DCT8_PASS_2 4 * 16
- movu [r4], m6
- DCT8_PASS_2 5 * 16
- movu [r4 + r2], m6
- DCT8_PASS_2 6 * 16
- movu [r4 + r2 * 2], m6
- DCT8_PASS_2 7 * 16
- movu [r4 + r3], m6
+ DCT8_PASS_2 0 * 16, 1 * 16
+ movu [r1], m10
+ DCT8_PASS_2 2 * 16, 3 * 16
+ movu [r1 + 32], m10
+ DCT8_PASS_2 4 * 16, 5 * 16
+ movu [r1 + 64], m10
+ DCT8_PASS_2 6 * 16, 7 * 16
+ movu [r1 + 96], m10
RET
%macro DCT16_PASS_1_E 2
@@ -1360,7 +1752,7 @@ cglobal dct8, 3, 7, 10, 0-8*16
mova [r5 + %2], xm10
%endmacro
-%macro DCT16_PASS_2 1
+%macro DCT16_PASS_2 2
vbroadcasti128 m8, [r7 + %1]
vbroadcasti128 m13, [r8 + %1]
@@ -1385,9 +1777,40 @@ cglobal dct8, 3, 7, 10, 0-8*16
phaddd m10, m11
paddd m10, m9
psrad m10, DCT_SHIFT2
+
+
+ vbroadcasti128 m8, [r7 + %2]
+ vbroadcasti128 m13, [r8 + %2]
+
+ pmaddwd m14, m0, m8
+ pmaddwd m11, m1, m13
+ paddd m14, m11
+
+ pmaddwd m11, m2, m8
+ pmaddwd m12, m3, m13
+ paddd m11, m12
+ phaddd m14, m11
+
+ pmaddwd m11, m4, m8
+ pmaddwd m12, m5, m13
+ paddd m11, m12
+
+ pmaddwd m12, m6, m8
+ pmaddwd m13, m7, m13
+ paddd m12, m13
+ phaddd m11, m12
+
+ phaddd m14, m11
+ paddd m14, m9
+ psrad m14, DCT_SHIFT2
+
+ packssdw m10, m14
+ vextracti128 xm14, m10, 1
+ movlhps xm15, xm10, xm14
+ movhlps xm14, xm10
%endmacro
INIT_YMM avx2
-cglobal dct16, 3, 9, 15, 0-16*mmsize
+cglobal dct16, 3, 9, 16, 0-16*mmsize
%if BIT_DEPTH == 10
%define DCT_SHIFT 5
vbroadcasti128 m9, [pd_16]
@@ -1487,7 +1910,7 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize
mov r5, rsp
mov r4d, 2
- mov r2d, 64
+ mov r2d, 32
lea r3, [r2 * 3]
vbroadcasti128 m9, [pd_512]
@@ -1504,46 +1927,42 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize
mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
- DCT16_PASS_2 -8 * 16
- movu [r1], m10
- DCT16_PASS_2 -7 * 16
- movu [r1 + r2], m10
- DCT16_PASS_2 -6 * 16
- movu [r1 + r2 * 2], m10
- DCT16_PASS_2 -5 * 16
- movu [r1 + r3], m10
+ DCT16_PASS_2 -8 * 16, -7 * 16
+ movu [r1], xm15
+ movu [r1 + r2], xm14
+
+ DCT16_PASS_2 -6 * 16, -5 * 16
+ movu [r1 + r2 * 2], xm15
+ movu [r1 + r3], xm14
lea r6, [r1 + r2 * 4]
- DCT16_PASS_2 -4 * 16
- movu [r6], m10
- DCT16_PASS_2 -3 * 16
- movu [r6 + r2], m10
- DCT16_PASS_2 -2 * 16
- movu [r6 + r2 * 2], m10
- DCT16_PASS_2 -1 * 16
- movu [r6 + r3], m10
+ DCT16_PASS_2 -4 * 16, -3 * 16
+ movu [r6], xm15
+ movu [r6 + r2], xm14
+
+ DCT16_PASS_2 -2 * 16, -1 * 16
+ movu [r6 + r2 * 2], xm15
+ movu [r6 + r3], xm14
lea r6, [r6 + r2 * 4]
- DCT16_PASS_2 0 * 16
- movu [r6], m10
- DCT16_PASS_2 1 * 16
- movu [r6 + r2], m10
- DCT16_PASS_2 2 * 16
- movu [r6 + r2 * 2], m10
- DCT16_PASS_2 3 * 16
- movu [r6 + r3], m10
+ DCT16_PASS_2 0 * 16, 1 * 16
+ movu [r6], xm15
+ movu [r6 + r2], xm14
+
+ DCT16_PASS_2 2 * 16, 3 * 16
+ movu [r6 + r2 * 2], xm15
+ movu [r6 + r3], xm14
lea r6, [r6 + r2 * 4]
- DCT16_PASS_2 4 * 16
- movu [r6], m10
- DCT16_PASS_2 5 * 16
- movu [r6 + r2], m10
- DCT16_PASS_2 6 * 16
- movu [r6 + r2 * 2], m10
- DCT16_PASS_2 7 * 16
- movu [r6 + r3], m10
-
- add r1, 32
+ DCT16_PASS_2 4 * 16, 5 * 16
+ movu [r6], xm15
+ movu [r6 + r2], xm14
+
+ DCT16_PASS_2 6 * 16, 7 * 16
+ movu [r6 + r2 * 2], xm15
+ movu [r6 + r3], xm14
+
+ add r1, 16
add r5, 128
dec r4d
@@ -1609,6 +2028,7 @@ cglobal dct16, 3, 9, 15, 0-16*mmsize
paddd xm11, xm9
psrad xm11, DCT_SHIFT2
+ packssdw xm11, xm11
%endmacro
@@ -1704,7 +2124,7 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize
dec r4d
jnz .pass1
- mov r2d, 128
+ mov r2d, 64
lea r3, [r2 * 3]
mov r5, rsp
mov r4d, 8
@@ -1724,86 +2144,86 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize
mova m7, [r5 + 3 * 64 + 32]
DCT32_PASS_2 0 * 32
- movu [r1], xm11
+ movq [r1], xm11
DCT32_PASS_2 1 * 32
- movu [r1 + r2], xm11
+ movq [r1 + r2], xm11
DCT32_PASS_2 2 * 32
- movu [r1 + r2 * 2], xm11
+ movq [r1 + r2 * 2], xm11
DCT32_PASS_2 3 * 32
- movu [r1 + r3], xm11
+ movq [r1 + r3], xm11
lea r6, [r1 + r2 * 4]
DCT32_PASS_2 4 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 5 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 6 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 7 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 8 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 9 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 10 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 11 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 12 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 13 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 14 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 15 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 16 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 17 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 18 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 19 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 20 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 21 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 22 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 23 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 24 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 25 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 26 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 27 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
lea r6, [r6 + r2 * 4]
DCT32_PASS_2 28 * 32
- movu [r6], xm11
+ movq [r6], xm11
DCT32_PASS_2 29 * 32
- movu [r6 + r2], xm11
+ movq [r6 + r2], xm11
DCT32_PASS_2 30 * 32
- movu [r6 + r2 * 2], xm11
+ movq [r6 + r2 * 2], xm11
DCT32_PASS_2 31 * 32
- movu [r6 + r3], xm11
+ movq [r6 + r3], xm11
add r5, 256
- add r1, 16
+ add r1, 8
dec r4d
jnz .pass2
@@ -1926,28 +2346,25 @@ cglobal idct8, 3, 7, 13, 0-8*16
lea r6, [avx2_idct8_2]
;pass1
- mova m0, [r0 + 0 * 32]
- mova m1, [r0 + 4 * 32]
- packssdw m0, m1 ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
- mova m1, [r0 + 2 * 32]
- mova m2, [r0 + 6 * 32]
- packssdw m1, m2 ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
- mova m2, [r0 + 1 * 32]
- mova m3, [r0 + 5 * 32]
- packssdw m2, m3 ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
- mova m3, [r0 + 3 * 32]
- mova m4, [r0 + 7 * 32]
- packssdw m3, m4 ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
+ mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
+ mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
+ vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+ vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+ vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
+ vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3]
+ vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
+
+ mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
+ mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
+ vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+ vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+ vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
+ vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7]
+ vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
mova m5, [idct8_shuf1]
-
- punpcklwd m4, m0, m1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
- punpckhwd m0, m1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
vpermd m4, m5, m4
vpermd m0, m5, m0
-
- punpcklwd m1, m2, m3 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
- punpckhwd m2, m3 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
vpermd m1, m5, m1
vpermd m2, m5, m2
@@ -2065,7 +2482,7 @@ cglobal idct8, 3, 7, 13, 0-8*16
%endmacro
;-------------------------------------------------------
-; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
@@ -2087,37 +2504,53 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize
mov r4d, 2
.pass1:
- movu m0, [r0 + 0 * 64]
- movu m1, [r0 + 8 * 64]
- packssdw m0, m1 ;[0L 8L 0H 8H]
-
- movu m1, [r0 + 1 * 64]
- movu m2, [r0 + 9 * 64]
- packssdw m1, m2 ;[1L 9L 1H 9H]
-
- movu m2, [r0 + 2 * 64]
- movu m3, [r0 + 10 * 64]
- packssdw m2, m3 ;[2L 10L 2H 10H]
-
- movu m3, [r0 + 3 * 64]
- movu m4, [r0 + 11 * 64]
- packssdw m3, m4 ;[3L 11L 3H 11H]
-
- movu m4, [r0 + 4 * 64]
- movu m5, [r0 + 12 * 64]
- packssdw m4, m5 ;[4L 12L 4H 12H]
-
- movu m5, [r0 + 5 * 64]
- movu m6, [r0 + 13 * 64]
- packssdw m5, m6 ;[5L 13L 5H 13H]
-
- movu m6, [r0 + 6 * 64]
- movu m7, [r0 + 14 * 64]
- packssdw m6, m7 ;[6L 14L 6H 14H]
-
- movu m7, [r0 + 7 * 64]
- movu m8, [r0 + 15 * 64]
- packssdw m7, m8 ;[7L 15L 7H 15H]
+ movu xm0, [r0 + 0 * 32]
+ movu xm1, [r0 + 8 * 32]
+ punpckhqdq xm2, xm0, xm1
+ punpcklqdq xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+
+ movu xm1, [r0 + 1 * 32]
+ movu xm2, [r0 + 9 * 32]
+ punpckhqdq xm3, xm1, xm2
+ punpcklqdq xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+
+ movu xm2, [r0 + 2 * 32]
+ movu xm3, [r0 + 10 * 32]
+ punpckhqdq xm4, xm2, xm3
+ punpcklqdq xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+
+ movu xm3, [r0 + 3 * 32]
+ movu xm4, [r0 + 11 * 32]
+ punpckhqdq xm5, xm3, xm4
+ punpcklqdq xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+
+ movu xm4, [r0 + 4 * 32]
+ movu xm5, [r0 + 12 * 32]
+ punpckhqdq xm6, xm4, xm5
+ punpcklqdq xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+
+ movu xm5, [r0 + 5 * 32]
+ movu xm6, [r0 + 13 * 32]
+ punpckhqdq xm7, xm5, xm6
+ punpcklqdq xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+
+ movu xm6, [r0 + 6 * 32]
+ movu xm7, [r0 + 14 * 32]
+ punpckhqdq xm8, xm6, xm7
+ punpcklqdq xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+
+ movu xm7, [r0 + 7 * 32]
+ movu xm8, [r0 + 15 * 32]
+ punpckhqdq xm9, xm7, xm8
+ punpcklqdq xm7, xm8
+ vinserti128 m7, m7, xm9, 1
punpckhwd m8, m0, m2 ;[8 10]
punpcklwd m0, m2 ;[0 2]
@@ -2160,7 +2593,7 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize
IDCT_PASS1 4, 10
IDCT_PASS1 6, 8
- add r0, 32
+ add r0, 16
add r3, 16
dec r4d
jnz .pass1
@@ -2328,7 +2761,7 @@ cglobal idct16, 3, 7, 16, 0-16*mmsize
%endmacro
;-------------------------------------------------------
-; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
; TODO: Reduce PHADDD instruction by PADDD
@@ -2345,54 +2778,69 @@ cglobal idct32, 3, 6, 16, 0-32*64
mov r5d, 8
.pass1:
- movu xm0, [r0 + 2 * 128]
- movu xm1, [r0 + 18 * 128]
- vinserti128 m0, m0, [r0 + 0 * 128], 1
- vinserti128 m1, m1, [r0 + 16 * 128], 1
-
- packssdw m0, m1 ;[2 18 0 16]
-
- movu xm1, [r0 + 1 * 128]
- movu xm2, [r0 + 9 * 128]
- vinserti128 m1, m1, [r0 + 17 * 128], 1
- vinserti128 m2, m2, [r0 + 25 * 128], 1
- packssdw m1, m2 ;[1 9 17 25]
-
- movu xm2, [r0 + 6 * 128]
- movu xm3, [r0 + 22 * 128]
- vinserti128 m2, m2, [r0 + 4 * 128], 1
- vinserti128 m3, m3, [r0 + 20 * 128], 1
- packssdw m2, m3 ;[6 22 4 20]
-
- movu xm3, [r0 + 3 * 128]
- movu xm4, [r0 + 11 * 128]
- vinserti128 m3, m3, [r0 + 19 * 128], 1
- vinserti128 m4, m4, [r0 + 27 * 128], 1
- packssdw m3, m4 ;[3 11 19 27]
-
- movu xm4, [r0 + 10 * 128]
- movu xm5, [r0 + 26 * 128]
- vinserti128 m4, m4, [r0 + 8 * 128], 1
- vinserti128 m5, m5, [r0 + 24 * 128], 1
- packssdw m4, m5 ;[10 26 8 24]
-
- movu xm5, [r0 + 5 * 128]
- movu xm6, [r0 + 13 * 128]
- vinserti128 m5, m5, [r0 + 21 * 128], 1
- vinserti128 m6, m6, [r0 + 29 * 128], 1
- packssdw m5, m6 ;[5 13 21 29]
-
- movu xm6, [r0 + 14 * 128]
- movu xm7, [r0 + 30 * 128]
- vinserti128 m6, m6, [r0 + 12 * 128], 1
- vinserti128 m7, m7, [r0 + 28 * 128], 1
- packssdw m6, m7 ;[14 30 12 28]
-
- movu xm7, [r0 + 7 * 128]
- movu xm8, [r0 + 15 * 128]
- vinserti128 m7, m7, [r0 + 23 * 128], 1
- vinserti128 m8, m8, [r0 + 31 * 128], 1
- packssdw m7, m8 ;[7 15 23 31]
+ movq xm0, [r0 + 2 * 64]
+ movq xm1, [r0 + 18 * 64]
+ punpcklqdq xm0, xm0, xm1
+ movq xm1, [r0 + 0 * 64]
+ movq xm2, [r0 + 16 * 64]
+ punpcklqdq xm1, xm1, xm2
+ vinserti128 m0, m0, xm1, 1 ;[2 18 0 16]
+
+ movq xm1, [r0 + 1 * 64]
+ movq xm2, [r0 + 9 * 64]
+ punpcklqdq xm1, xm1, xm2
+ movq xm2, [r0 + 17 * 64]
+ movq xm3, [r0 + 25 * 64]
+ punpcklqdq xm2, xm2, xm3
+ vinserti128 m1, m1, xm2, 1 ;[1 9 17 25]
+
+ movq xm2, [r0 + 6 * 64]
+ movq xm3, [r0 + 22 * 64]
+ punpcklqdq xm2, xm2, xm3
+ movq xm3, [r0 + 4 * 64]
+ movq xm4, [r0 + 20 * 64]
+ punpcklqdq xm3, xm3, xm4
+ vinserti128 m2, m2, xm3, 1 ;[6 22 4 20]
+
+ movq xm3, [r0 + 3 * 64]
+ movq xm4, [r0 + 11 * 64]
+ punpcklqdq xm3, xm3, xm4
+ movq xm4, [r0 + 19 * 64]
+ movq xm5, [r0 + 27 * 64]
+ punpcklqdq xm4, xm4, xm5
+ vinserti128 m3, m3, xm4, 1 ;[3 11 19 27]
+
+ movq xm4, [r0 + 10 * 64]
+ movq xm5, [r0 + 26 * 64]
+ punpcklqdq xm4, xm4, xm5
+ movq xm5, [r0 + 8 * 64]
+ movq xm6, [r0 + 24 * 64]
+ punpcklqdq xm5, xm5, xm6
+ vinserti128 m4, m4, xm5, 1 ;[10 26 8 24]
+
+ movq xm5, [r0 + 5 * 64]
+ movq xm6, [r0 + 13 * 64]
+ punpcklqdq xm5, xm5, xm6
+ movq xm6, [r0 + 21 * 64]
+ movq xm7, [r0 + 29 * 64]
+ punpcklqdq xm6, xm6, xm7
+ vinserti128 m5, m5, xm6, 1 ;[5 13 21 29]
+
+ movq xm6, [r0 + 14 * 64]
+ movq xm7, [r0 + 30 * 64]
+ punpcklqdq xm6, xm6, xm7
+ movq xm7, [r0 + 12 * 64]
+ movq xm8, [r0 + 28 * 64]
+ punpcklqdq xm7, xm7, xm8
+ vinserti128 m6, m6, xm7, 1 ;[14 30 12 28]
+
+ movq xm7, [r0 + 7 * 64]
+ movq xm8, [r0 + 15 * 64]
+ punpcklqdq xm7, xm7, xm8
+ movq xm8, [r0 + 23 * 64]
+ movq xm9, [r0 + 31 * 64]
+ punpcklqdq xm8, xm8, xm9
+ vinserti128 m7, m7, xm8, 1 ;[7 15 23 31]
punpckhwd m8, m0, m2 ;[18 22 16 20]
punpcklwd m0, m2 ;[2 6 0 4]
@@ -2451,7 +2899,7 @@ cglobal idct32, 3, 6, 16, 0-32*64
IDCT32_PASS1 6
IDCT32_PASS1 7
- add r0, 16
+ add r0, 8
add r3, 4
add r4, 4
dec r5d
@@ -2612,7 +3060,7 @@ cglobal idct32, 3, 6, 16, 0-32*64
RET
;-------------------------------------------------------
-; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct4, 3, 4, 6
@@ -2632,13 +3080,14 @@ cglobal idct4, 3, 4, 6
add r2d, r2d
lea r3, [r2 * 3]
- movu m0, [r0] ;[00 01 02 03 10 11 12 13]
- movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33]
+ movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
- packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
- pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
- vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
- vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
+ pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
+ vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33]
+ punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23]
+ punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33]
+ vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
+ vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
mova m1, [avx2_idct4_1]
mova m3, [avx2_idct4_1 + 32]
diff --git a/source/common/x86/dct8.h b/source/common/x86/dct8.h
index 3b74f2a..f9516d6 100644
--- a/source/common/x86/dct8.h
+++ b/source/common/x86/dct8.h
@@ -23,23 +23,24 @@
#ifndef X265_DCT8_H
#define X265_DCT8_H
-void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
-void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
-void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
-void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
+void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
#endif // ifndef X265_DCT8_H
diff --git a/source/common/x86/intrapred.h b/source/common/x86/intrapred.h
index 9a71457..f919c46 100644
--- a/source/common/x86/intrapred.h
+++ b/source/common/x86/intrapred.h
@@ -26,18 +26,18 @@
#ifndef X265_INTRAPRED_H
#define X265_INTRAPRED_H
-void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
-void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
-void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
-void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
+void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
-void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
-void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
-void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
-void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
+void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
#define DECL_ANG(bsize, mode, cpu) \
- void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel * dst, intptr_t dstStride, pixel * refLeft, pixel * refAbove, int dirMode, int bFilter);
+ void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
DECL_ANG(4, 2, ssse3);
DECL_ANG(4, 3, sse4);
@@ -157,8 +157,8 @@ DECL_ANG(32, 32, sse4);
DECL_ANG(32, 33, sse4);
#undef DECL_ANG
-void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
-void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
-void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
-void x265_all_angs_pred_32x32_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
+void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
+void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
+void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
+void x265_all_angs_pred_32x32_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
#endif // ifndef X265_INTRAPRED_H
diff --git a/source/common/x86/intrapred16.asm b/source/common/x86/intrapred16.asm
index 236be2c..ba17afe 100644
--- a/source/common/x86/intrapred16.asm
+++ b/source/common/x86/intrapred16.asm
@@ -56,10 +56,35 @@ const pw_swap16, db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
+;; (blkSize - 1 - x)
+pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
+pw_planar4_1: dw 3, 3, 3, 3, 3, 3, 3, 3
+pw_planar8_0: dw 7, 6, 5, 4, 3, 2, 1, 0
+pw_planar8_1: dw 7, 7, 7, 7, 7, 7, 7, 7
+pw_planar16_0: dw 15, 14, 13, 12, 11, 10, 9, 8
+pw_planar16_1: dw 15, 15, 15, 15, 15, 15, 15, 15
+pd_planar32_1: dd 31, 31, 31, 31
+
+const planar32_table
+%assign x 31
+%rep 8
+ dd x, x-1, x-2, x-3
+%assign x x-4
+%endrep
+
+const planar32_table1
+%assign x 1
+%rep 8
+ dd x, x+1, x+2, x+3
+%assign x x+4
+%endrep
+
SECTION .text
cextern pw_1
+cextern pw_4
cextern pw_8
+cextern pw_16
cextern pw_1023
cextern pd_16
cextern pd_32
@@ -71,14 +96,13 @@ cextern pw_swap
cextern pb_unpackwq1
cextern pb_unpackwq2
-;-------------------------------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-------------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
+;-----------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc4, 4,6,2
- mov r4d, r5m
+cglobal intra_pred_dc4, 5,6,2
+ lea r3, [r2 + 18]
add r2, 2
- add r3, 2
movh m0, [r3] ; sumAbove
movh m1, [r2] ; sumLeft
@@ -110,22 +134,22 @@ cglobal intra_pred_dc4, 4,6,2
pshuflw m0, m0, 0 ; m0 = pixDCx3
; filter top
- movu m1, [r3]
+ movu m1, [r2]
paddw m1, m0
psraw m1, 2
movh [r0], m1 ; overwrite top-left pixel, we will update it later
; filter top-left
- movzx r3d, word [r3]
- add r5d, r3d
- movzx r3d, word [r2]
- add r3d, r5d
- shr r3d, 2
- mov [r0], r3w
+ movzx r4d, word [r3]
+ add r5d, r4d
+ movzx r4d, word [r2]
+ add r4d, r5d
+ shr r4d, 2
+ mov [r0], r4w
; filter left
lea r0, [r0 + r1 * 2]
- movu m1, [r2 + 2]
+ movu m1, [r3 + 2]
paddw m1, m0
psraw m1, 2
movd r3d, m1
@@ -133,21 +157,16 @@ cglobal intra_pred_dc4, 4,6,2
shr r3d, 16
mov [r0 + r1 * 2], r3w
pextrw [r0 + r1 * 4], m1, 2
-
.end:
-
RET
-
-
-;-------------------------------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-------------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
+;-----------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc8, 4, 7, 2
- mov r4d, r5m
+cglobal intra_pred_dc8, 5, 7, 2
+ lea r3, [r2 + 34]
add r2, 2
- add r3, 2
add r1, r1
movu m0, [r3]
movu m1, [r2]
@@ -190,22 +209,22 @@ cglobal intra_pred_dc8, 4, 7, 2
pshufd m1, m1, 0
; filter top
- movu m0, [r3]
+ movu m0, [r2]
paddw m0, m1
psraw m0, 2
movu [r6], m0
; filter top-left
- movzx r3d, word [r3]
- add r4d, r3d
- movzx r3d, word [r2]
- add r3d, r4d
- shr r3d, 2
- mov [r6], r3w
+ movzx r5d, word [r3]
+ add r4d, r5d
+ movzx r5d, word [r2]
+ add r5d, r4d
+ shr r5d, 2
+ mov [r6], r5w
; filter left
add r6, r1
- movu m0, [r2 + 2]
+ movu m0, [r3 + 2]
paddw m0, m1
psraw m0, 2
pextrw [r6], m0, 0
@@ -217,19 +236,16 @@ cglobal intra_pred_dc8, 4, 7, 2
lea r6, [r6 + r1 * 2]
pextrw [r6 + r1], m0, 5
pextrw [r6 + r1 * 2], m0, 6
-
.end:
RET
-
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc16, 4, 7, 4
- mov r4d, r5m
+cglobal intra_pred_dc16, 5, 7, 4
+ lea r3, [r2 + 66]
add r2, 2
- add r3, 2
add r1, r1
movu m0, [r3]
movu m1, [r3 + 16]
@@ -304,26 +320,26 @@ cglobal intra_pred_dc16, 4, 7, 4
pshufd m1, m1, 0
; filter top
- movu m2, [r3]
+ movu m2, [r2]
paddw m2, m1
psraw m2, 2
movu [r6], m2
- movu m3, [r3 + 16]
+ movu m3, [r2 + 16]
paddw m3, m1
psraw m3, 2
movu [r6 + 16], m3
; filter top-left
- movzx r3d, word [r3]
- add r4d, r3d
- movzx r3d, word [r2]
- add r3d, r4d
- shr r3d, 2
- mov [r6], r3w
+ movzx r5d, word [r3]
+ add r4d, r5d
+ movzx r5d, word [r2]
+ add r5d, r4d
+ shr r5d, 2
+ mov [r6], r5w
; filter left
add r6, r1
- movu m2, [r2 + 2]
+ movu m2, [r3 + 2]
paddw m2, m1
psraw m2, 2
@@ -340,7 +356,7 @@ cglobal intra_pred_dc16, 4, 7, 4
pextrw [r6 + r1], m2, 7
lea r6, [r6 + r1 * 2]
- movu m3, [r2 + 18]
+ movu m3, [r3 + 18]
paddw m3, m1
psraw m3, 2
@@ -354,19 +370,16 @@ cglobal intra_pred_dc16, 4, 7, 4
pextrw [r6 + r1], m3, 5
lea r6, [r6 + r1 * 2]
pextrw [r6], m3, 6
-
.end:
RET
-
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
;-------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc32, 4, 5, 6
- mov r4d, r5m
+cglobal intra_pred_dc32, 3, 5, 6
+ lea r3, [r2 + 130]
add r2, 2
- add r3, 2
add r1, r1
movu m0, [r3]
movu m1, [r3 + 16]
@@ -435,516 +448,427 @@ cglobal intra_pred_dc32, 4, 5, 6
jnz .loop
RET
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_planar4, 4,7,5
- add r2, 2
- add r3, 2
- add r1, r1
- movh m0, [r3] ; topRow[i] = above[i];
- punpcklqdq m0, m0
-
- pxor m1, m1
- movd m2, [r2 + 8] ; bottomLeft = left[4]
- movzx r6d, word [r3 + 8] ; topRight = above[4];
- pshuflw m2, m2, 0
- pshufd m2, m2, 0
-
- psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i]
- psllw m0, 2
- punpcklqdq m3, m2, m1
- psubw m0, m3
- paddw m2, m2
-
-%macro COMP_PRED_PLANAR_2ROW 1
- movzx r4d, word [r2 + %1]
- lea r4d, [r4d * 4 + 4]
- movd m3, r4d
- pshuflw m3, m3, 0
-
- movzx r4d, word [r2 + %1 + 2]
- lea r4d, [r4d * 4 + 4]
- movd m4, r4d
- pshuflw m4, m4, 0
- punpcklqdq m3, m4 ; horPred
-
- movzx r4d, word [r2 + %1]
- mov r5d, r6d
- sub r5d, r4d
- movd m4, r5d
- pshuflw m4, m4, 0
-
- movzx r4d, word [r2 + %1 + 2]
- mov r5d, r6d
- sub r5d, r4d
- movd m1, r5d
- pshuflw m1, m1, 0
- punpcklqdq m4, m1 ; rightColumnN
-
- pmullw m4, [multi_2Row]
- paddw m3, m4
- paddw m0, m2
- paddw m3, m0
- psraw m3, 3
-
- movh [r0], m3
- pshufd m3, m3, 0xAE
- movh [r0 + r1], m3
- lea r0, [r0 + 2 * r1]
-%endmacro
-
- COMP_PRED_PLANAR_2ROW 0
- COMP_PRED_PLANAR_2ROW 4
-%undef COMP_PRED_PLANAR_2ROW
+cglobal intra_pred_planar4, 3,3,5
+ add r1, r1
+ movu m1, [r2 + 2]
+ movu m2, [r2 + 18]
+ pshufhw m3, m1, 0 ; topRight
+ pshufd m3, m3, 0xAA
+ pshufhw m4, m2, 0 ; bottomLeft
+ pshufd m4, m4, 0xAA
+
+ pmullw m3, [multi_2Row] ; (x + 1) * topRight
+ pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x]
+
+ paddw m3, [pw_4]
+ paddw m3, m4
+ paddw m3, m0
+ psubw m4, m1
+ mova m0, [pw_planar4_0]
+
+ pshuflw m1, m2, 0
+ pmullw m1, m0
+ paddw m1, m3
+ paddw m3, m4
+ psraw m1, 3
+ movh [r0], m1
+
+ pshuflw m1, m2, 01010101b
+ pmullw m1, m0
+ paddw m1, m3
+ paddw m3, m4
+ psraw m1, 3
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+
+ pshuflw m1, m2, 10101010b
+ pmullw m1, m0
+ paddw m1, m3
+ paddw m3, m4
+ psraw m1, 3
+ movh [r0], m1
+
+ pshuflw m1, m2, 11111111b
+ pmullw m1, m0
+ paddw m1, m3
+ paddw m3, m4
+ psraw m1, 3
+ movh [r0 + r1], m1
RET
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_planar8, 4,4,7
- add r2, 2
- add r3, 2
- add r1, r1
- movu m1, [r3] ; v_topRow
- movu m2, [r2] ; v_leftColumn
-
- movd m3, [r3 + 16] ; topRight = above[8];
- movd m4, [r2 + 16] ; bottomLeft = left[8];
-
- pshuflw m3, m3, 0
- pshufd m3, m3, 0
- pshuflw m4, m4, 0
- pshufd m4, m4, 0
-
- psubw m4, m1 ; v_bottomRow
- psubw m3, m2 ; v_rightColumn
-
- psllw m1, 3 ; v_topRow
- psllw m2, 3 ; v_leftColumn
-
- paddw m6, m2, [pw_8]
-
-%macro PRED_PLANAR_ROW8 1
- %if (%1 < 4)
- pshuflw m5, m6, 0x55 * %1
- pshufd m5, m5, 0
- pshuflw m2, m3, 0x55 * %1
- pshufd m2, m2, 0
- %else
- pshufhw m5, m6, 0x55 * (%1 - 4)
- pshufd m5, m5, 0xAA
- pshufhw m2, m3, 0x55 * (%1 - 4)
- pshufd m2, m2, 0xAA
- %endif
-
- pmullw m2, [multiL]
- paddw m5, m2
- paddw m1, m4
- paddw m5, m1
- psraw m5, 4
-
- movu [r0], m5
- add r0, r1
-
+cglobal intra_pred_planar8, 3,3,5
+ add r1, r1
+ movu m1, [r2 + 2]
+ movu m2, [r2 + 34]
+
+ movd m3, [r2 + 18] ; topRight = above[8];
+ movd m4, [r2 + 50] ; bottomLeft = left[8];
+
+ pshuflw m3, m3, 0
+ pshuflw m4, m4, 0
+ pshufd m3, m3, 0 ; v_topRight
+ pshufd m4, m4, 0 ; v_bottomLeft
+
+ pmullw m3, [multiL] ; (x + 1) * topRight
+ pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x]
+ paddw m3, [pw_8]
+ paddw m3, m4
+ paddw m3, m0
+ psubw m4, m1
+ mova m0, [pw_planar8_0]
+
+%macro INTRA_PRED_PLANAR8 1
+%if (%1 < 4)
+ pshuflw m1, m2, 0x55 * %1
+ pshufd m1, m1, 0
+%else
+ pshufhw m1, m2, 0x55 * (%1 - 4)
+ pshufd m1, m1, 0xAA
+%endif
+ pmullw m1, m0
+ paddw m1, m3
+ paddw m3, m4
+ psraw m1, 4
+ movu [r0], m1
+ lea r0, [r0 + r1]
%endmacro
- PRED_PLANAR_ROW8 0
- PRED_PLANAR_ROW8 1
- PRED_PLANAR_ROW8 2
- PRED_PLANAR_ROW8 3
- PRED_PLANAR_ROW8 4
- PRED_PLANAR_ROW8 5
- PRED_PLANAR_ROW8 6
- PRED_PLANAR_ROW8 7
-
-%undef PRED_PLANAR_ROW8
+ INTRA_PRED_PLANAR8 0
+ INTRA_PRED_PLANAR8 1
+ INTRA_PRED_PLANAR8 2
+ INTRA_PRED_PLANAR8 3
+ INTRA_PRED_PLANAR8 4
+ INTRA_PRED_PLANAR8 5
+ INTRA_PRED_PLANAR8 6
+ INTRA_PRED_PLANAR8 7
RET
-
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-%if (BIT_DEPTH == 12)
-
-%if (ARCH_X86_64 == 1)
-cglobal intra_pred_planar16, 4,7,8+3
-%define bottomRow0 m7
-%define bottomRow1 m8
-%define bottomRow2 m9
-%define bottomRow3 m10
+cglobal intra_pred_planar16, 3,3,8
+ add r1, r1
+ movu m2, [r2 + 2]
+ movu m7, [r2 + 18]
+
+ movd m3, [r2 + 34] ; topRight = above[16]
+ movd m6, [r2 + 98] ; bottomLeft = left[16]
+
+ pshuflw m3, m3, 0
+ pshuflw m6, m6, 0
+ pshufd m3, m3, 0 ; v_topRight
+ pshufd m6, m6, 0 ; v_bottomLeft
+
+ pmullw m4, m3, [multiH] ; (x + 1) * topRight
+ pmullw m3, [multiL] ; (x + 1) * topRight
+ pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
+ paddw m4, [pw_16]
+ paddw m3, [pw_16]
+ paddw m4, m6
+ paddw m3, m6
+ paddw m4, m5
+ paddw m3, m1
+ psubw m1, m6, m7
+ psubw m6, m2
+
+ movu m2, [r2 + 66]
+ movu m7, [r2 + 82]
+
+%macro INTRA_PRED_PLANAR16 1
+%if (%1 < 4)
+ pshuflw m5, m2, 0x55 * %1
+ pshufd m5, m5, 0
+%else
+%if (%1 < 8)
+ pshufhw m5, m2, 0x55 * (%1 - 4)
+ pshufd m5, m5, 0xAA
%else
-cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize
-%define bottomRow0 [rsp + 0*mmsize]
-%define bottomRow1 [rsp + 1*mmsize]
-%define bottomRow2 [rsp + 2*mmsize]
-%define bottomRow3 m7
+%if (%1 < 12)
+ pshuflw m5, m7, 0x55 * (%1 - 8)
+ pshufd m5, m5, 0
+%else
+ pshufhw m5, m7, 0x55 * (%1 - 12)
+ pshufd m5, m5, 0xAA
%endif
-
- add r2, 2
- add r3, 2
- add r1, r1
-
- pxor m0, m0
-
- ; bottomRow
- movzx r4d, word [r2 + 16*2]
- movd m1, r4d
- pshufd m1, m1, 0 ; m1 = bottomLeft
- movu m2, [r3]
- pmovzxwd m3, m2
- punpckhwd m2, m0
- psubd m4, m1, m3
- mova bottomRow0, m4
- psubd m4, m1, m2
- mova bottomRow1, m4
- movu m2, [r3 + 16]
- pmovzxwd m3, m2
- punpckhwd m2, m0
- psubd m4, m1, m3
- mova bottomRow2, m4
- psubd m1, m2
- mova bottomRow3, m1
-
- ; topRow
- pmovzxwd m0, [r3 + 0*8]
- pslld m0, 4
- pmovzxwd m1, [r3 + 1*8]
- pslld m1, 4
- pmovzxwd m2, [r3 + 2*8]
- pslld m2, 4
- pmovzxwd m3, [r3 + 3*8]
- pslld m3, 4
-
- xor r6, r6
-.loopH:
- movzx r4d, word [r2 + r6*2]
- movzx r5d, word [r3 + 16*2] ; r5 = topRight
- sub r5d, r4d
- movd m5, r5d
- pshuflw m5, m5, 0
- pmullw m5, [multiL]
- pmovsxwd m5, m5 ; m5 = rightCol
- add r4d, r4d
- lea r4d, [r4d * 8 + 16]
- movd m4, r4d
- pshufd m4, m4, 0 ; m4 = horPred
- paddd m4, m5
- pshufd m6, m5, 0xFF ; m6 = [4 4 4 4]
-
- ; 0-3
- paddd m0, bottomRow0
- paddd m5, m0, m4
- psrad m5, 5
- packusdw m5, m5
- movh [r0 + 0*8], m5
-
- ; 4-7
- paddd m4, m6
- paddd m1, bottomRow1
- paddd m5, m1, m4
- psrad m5, 5
- packusdw m5, m5
- movh [r0 + 1*8], m5
-
- ; 8-11
- paddd m4, m6
- paddd m2, bottomRow2
- paddd m5, m2, m4
- psrad m5, 5
- packusdw m5, m5
- movh [r0 + 2*8], m5
-
- ; 12-15
- paddd m4, m6
- paddd m3, bottomRow3
- paddd m5, m3, m4
- psrad m5, 5
- packusdw m5, m5
- movh [r0 + 3*8], m5
-
- add r0, r1
- inc r6d
- cmp r6d, 16
- jnz .loopH
- RET
-
-%else ; BIT_DEPTH == 10
-INIT_XMM sse4
-cglobal intra_pred_planar16, 4,6,7
- add r2, 2
- add r3, 2
- add r1, r1
-
- movu m1, [r3] ; topRow[0-7]
- movu m2, [r3 + 16] ; topRow[8-15]
-
- movd m3, [r2 + 32]
- pshuflw m3, m3, 0
- pshufd m3, m3, 0
- movzx r4d, word [r3 + 32] ; topRight = above[16]
-
- psubw m4, m3, m1 ; v_bottomRow[0]
- psubw m3, m2 ; v_bottomRow[1]
-
- psllw m1, 4
- psllw m2, 4
-
-%macro PRED_PLANAR_ROW16 1
- movzx r5d, word [r2 + %1 * 2]
- add r5d, r5d
- lea r5d, [r5d * 8 + 16]
- movd m5, r5d
- pshuflw m5, m5, 0
- pshufd m5, m5, 0 ; horPred
-
- movzx r5d, word [r2 + %1 * 2]
- mov r3d, r4d
- sub r3d, r5d
- movd m0, r3d
- pshuflw m0, m0, 0
- pshufd m0, m0, 0
-
- pmullw m6, m0, [multiL]
- paddw m6, m5
- paddw m1, m4
- paddw m6, m1
- psraw m6, 5
-
- pmullw m0, m0, [multiH]
- paddw m5, m0
- paddw m2, m3
- paddw m5, m2
- psraw m5, 5
-
- movu [r0], m6
- movu [r0 + 16], m5
- add r0, r1
+%endif
+%endif
+ pmullw m0, m5, [pw_planar8_0]
+ pmullw m5, [pw_planar16_0]
+ paddw m0, m4
+ paddw m5, m3
+ paddw m3, m6
+ paddw m4, m1
+ psraw m5, 5
+ psraw m0, 5
+ movu [r0], m5
+ movu [r0 + 16], m0
+ lea r0, [r0 + r1]
%endmacro
- PRED_PLANAR_ROW16 0
- PRED_PLANAR_ROW16 1
- PRED_PLANAR_ROW16 2
- PRED_PLANAR_ROW16 3
- PRED_PLANAR_ROW16 4
- PRED_PLANAR_ROW16 5
- PRED_PLANAR_ROW16 6
- PRED_PLANAR_ROW16 7
- PRED_PLANAR_ROW16 8
- PRED_PLANAR_ROW16 9
- PRED_PLANAR_ROW16 10
- PRED_PLANAR_ROW16 11
- PRED_PLANAR_ROW16 12
- PRED_PLANAR_ROW16 13
- PRED_PLANAR_ROW16 14
- PRED_PLANAR_ROW16 15
-%undef PRED_PLANAR_ROW16
+ INTRA_PRED_PLANAR16 0
+ INTRA_PRED_PLANAR16 1
+ INTRA_PRED_PLANAR16 2
+ INTRA_PRED_PLANAR16 3
+ INTRA_PRED_PLANAR16 4
+ INTRA_PRED_PLANAR16 5
+ INTRA_PRED_PLANAR16 6
+ INTRA_PRED_PLANAR16 7
+ INTRA_PRED_PLANAR16 8
+ INTRA_PRED_PLANAR16 9
+ INTRA_PRED_PLANAR16 10
+ INTRA_PRED_PLANAR16 11
+ INTRA_PRED_PLANAR16 12
+ INTRA_PRED_PLANAR16 13
+ INTRA_PRED_PLANAR16 14
+ INTRA_PRED_PLANAR16 15
RET
-%endif
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-%if (ARCH_X86_64 == 1)
-cglobal intra_pred_planar32, 4,7,8+8, 0-4*mmsize
- %define bottomRow0 m8
- %define bottomRow1 m9
- %define bottomRow2 m10
- %define bottomRow3 m11
- %define bottomRow4 m12
- %define bottomRow5 m13
- %define bottomRow6 m14
- %define bottomRow7 m15
- %define tmp0 [rsp + 0*mmsize]
- %define tmp1 [rsp + 1*mmsize]
- %define tmp2 [rsp + 2*mmsize]
- %define tmp3 [rsp + 3*mmsize]
+%if ARCH_X86_64 == 1
+cglobal intra_pred_planar32, 3,7,16
+ ; NOTE: align stack to 64 bytes, so all of local data in same cache line
+ mov r6, rsp
+ sub rsp, 4*mmsize
+ and rsp, ~63
+ %define m16 [rsp + 0 * mmsize]
+ %define m17 [rsp + 1 * mmsize]
+ %define m18 [rsp + 2 * mmsize]
+ %define m19 [rsp + 3 * mmsize]
%else
-cglobal intra_pred_planar32, 4,7,8, 0-12*mmsize
- %define bottomRow0 [rsp + 0*mmsize]
- %define bottomRow1 [rsp + 1*mmsize]
- %define bottomRow2 [rsp + 2*mmsize]
- %define bottomRow3 [rsp + 3*mmsize]
- %define bottomRow4 [rsp + 4*mmsize]
- %define bottomRow5 [rsp + 5*mmsize]
- %define bottomRow6 [rsp + 6*mmsize]
- %define bottomRow7 [rsp + 7*mmsize]
- %define tmp0 [rsp + 8*mmsize]
- %define tmp1 [rsp + 9*mmsize]
- %define tmp2 [rsp + 10*mmsize]
- %define tmp3 [rsp + 11*mmsize]
+cglobal intra_pred_planar32, 3,7,8
+ ; NOTE: align stack to 64 bytes, so all of local data in same cache line
+ mov r6, rsp
+ sub rsp, 12*mmsize
+ and rsp, ~63
+ %define m8 [rsp + 0 * mmsize]
+ %define m9 [rsp + 1 * mmsize]
+ %define m10 [rsp + 2 * mmsize]
+ %define m11 [rsp + 3 * mmsize]
+ %define m12 [rsp + 4 * mmsize]
+ %define m13 [rsp + 5 * mmsize]
+ %define m14 [rsp + 6 * mmsize]
+ %define m15 [rsp + 7 * mmsize]
+ %define m16 [rsp + 8 * mmsize]
+ %define m17 [rsp + 9 * mmsize]
+ %define m18 [rsp + 10 * mmsize]
+ %define m19 [rsp + 11 * mmsize]
%endif
-
- add r2, 2
- add r3, 2
add r1, r1
-
- pxor m0, m0
-
- ; bottomRow
- movzx r4d, word [r2 + 32*2]
- movd m1, r4d
- pshufd m1, m1, 0 ; m1 = bottomLeft
- movu m2, [r3]
- pmovzxwd m3, m2
- punpckhwd m2, m0
- psubd m4, m1, m3
- mova bottomRow0, m4
- psubd m4, m1, m2
- mova bottomRow1, m4
- movu m2, [r3 + 16]
- pmovzxwd m3, m2
- punpckhwd m2, m0
- psubd m4, m1, m3
- mova bottomRow2, m4
- psubd m4, m1, m2
- mova bottomRow3, m4
-
- movu m2, [r3 + 32]
- pmovzxwd m3, m2
- punpckhwd m2, m0
- psubd m4, m1, m3
- mova bottomRow4, m4
- psubd m4, m1, m2
- mova bottomRow5, m4
- movu m2, [r3 + 48]
- pmovzxwd m3, m2
- punpckhwd m2, m0
- psubd m4, m1, m3
- mova bottomRow6, m4
- psubd m1, m2
- mova bottomRow7, m1
-
- ; topRow
- pmovzxwd m0, [r3 + 0*8]
- pslld m0, 5
- pmovzxwd m1, [r3 + 1*8]
- pslld m1, 5
- pmovzxwd m2, [r3 + 2*8]
- pslld m2, 5
- pmovzxwd m3, [r3 + 3*8]
- pslld m3, 5
-
- pmovzxwd m4, [r3 + 4*8]
- pslld m4, 5
- mova tmp0, m4
- pmovzxwd m4, [r3 + 5*8]
- pslld m4, 5
- mova tmp1, m4
- pmovzxwd m4, [r3 + 6*8]
- pslld m4, 5
- mova tmp2, m4
- pmovzxwd m4, [r3 + 7*8]
- pslld m4, 5
- mova tmp3, m4
-
- xor r6, r6
-.loopH:
- movzx r4d, word [r2 + r6*2]
- movzx r5d, word [r3 + 32*2] ; r5 = topRight
- sub r5d, r4d
- movd m5, r5d
- pshuflw m5, m5, 0
- pmullw m5, [multiL]
- pmovsxwd m5, m5 ; m5 = rightCol
- shl r4d, 5
- add r4d, 32
- movd m4, r4d
- pshufd m4, m4, 0 ; m4 = horPred
- paddd m4, m5
- pshufd m6, m5, 0xFF ; m6 = [4 4 4 4]
-
- ; 0-3
- paddd m0, bottomRow0
- paddd m5, m0, m4
- psrad m5, 6
- packusdw m5, m5
- movh [r0 + 0*8], m5
-
- ; 4-7
+ lea r5, [planar32_table1]
+
+ movzx r3d, word [r2 + 66] ; topRight = above[32]
+ movd m7, r3d
+ pshufd m7, m7, 0 ; v_topRight
+
+ pmulld m0, m7, [r5 + 0 ] ; (x + 1) * topRight
+ pmulld m1, m7, [r5 + 16 ]
+ pmulld m2, m7, [r5 + 32 ]
+ pmulld m3, m7, [r5 + 48 ]
+ pmulld m4, m7, [r5 + 64 ]
+ pmulld m5, m7, [r5 + 80 ]
+ pmulld m6, m7, [r5 + 96 ]
+ pmulld m7, m7, [r5 + 112]
+
+ mova m12, m4
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+
+ movzx r3d, word [r2 + 194] ; bottomLeft = left[32]
+ movd m6, r3d
+ pshufd m6, m6, 0 ; v_bottomLeft
+
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+ paddd m0, [pd_32]
+ paddd m1, [pd_32]
+ paddd m2, [pd_32]
+ paddd m3, [pd_32]
+
+ mova m4, m12
+ mova m5, m13
paddd m4, m6
- paddd m1, bottomRow1
- paddd m5, m1, m4
- psrad m5, 6
- packusdw m5, m5
- movh [r0 + 1*8], m5
-
- ; 8-11
+ paddd m5, m6
+ paddd m4, [pd_32]
+ paddd m5, [pd_32]
+ mova m12, m4
+ mova m13, m5
+
+ mova m4, m14
+ mova m5, m15
paddd m4, m6
- paddd m2, bottomRow2
- paddd m5, m2, m4
+ paddd m5, m6
+ paddd m4, [pd_32]
+ paddd m5, [pd_32]
+ mova m14, m4
+ mova m15, m5
+
+ ; above[0-3] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 2]
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m0, m5
+ psubd m5, m6, m4
+ mova m8, m5
+
+ ; above[4-7] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 10]
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m1, m5
+ psubd m5, m6, m4
+ mova m9, m5
+
+ ; above[8-11] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 18]
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m2, m5
+ psubd m5, m6, m4
+ mova m10, m5
+
+ ; above[12-15] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 26]
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m3, m5
+ psubd m5, m6, m4
+ mova m11, m5
+
+ ; above[16-19] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 34]
+ mova m7, m12
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m7, m5
+ mova m12, m7
+ psubd m5, m6, m4
+ mova m16, m5
+
+ ; above[20-23] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 42]
+ mova m7, m13
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m7, m5
+ mova m13, m7
+ psubd m5, m6, m4
+ mova m17, m5
+
+ ; above[24-27] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 50]
+ mova m7, m14
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m7, m5
+ mova m14, m7
+ psubd m5, m6, m4
+ mova m18, m5
+
+ ; above[28-31] * (blkSize - 1 - y)
+ pmovzxwd m4, [r2 + 58]
+ mova m7, m15
+ pmulld m5, m4, [pd_planar32_1]
+ paddd m7, m5
+ mova m15, m7
+ psubd m5, m6, m4
+ mova m19, m5
+
+ add r2, 130 ; (2 * blkSize + 1)
+ lea r5, [planar32_table]
+
+%macro INTRA_PRED_PLANAR32 0
+ movzx r3d, word [r2]
+ movd m4, r3d
+ pshufd m4, m4, 0
+
+ pmulld m5, m4, [r5]
+ pmulld m6, m4, [r5 + 16]
+ paddd m5, m0
+ paddd m6, m1
+ paddd m0, m8
+ paddd m1, m9
psrad m5, 6
- packusdw m5, m5
- movh [r0 + 2*8], m5
+ psrad m6, 6
+ packusdw m5, m6
+ movu [r0], m5
- ; 12-15
- paddd m4, m6
- paddd m3, bottomRow3
- paddd m5, m3, m4
+ pmulld m5, m4, [r5 + 32]
+ pmulld m6, m4, [r5 + 48]
+ paddd m5, m2
+ paddd m6, m3
+ paddd m2, m10
+ paddd m3, m11
psrad m5, 6
- packusdw m5, m5
- movh [r0 + 3*8], m5
+ psrad m6, 6
+ packusdw m5, m6
+ movu [r0 + 16], m5
- ; 16-19
- paddd m4, m6
- mova m7, tmp0
- paddd m7, bottomRow4
- mova tmp0, m7
- paddd m7, m4
- psrad m7, 6
- packusdw m7, m7
- movh [r0 + 4*8], m7
-
- ; 20-23
- paddd m4, m6
- mova m7, tmp1
- paddd m7, bottomRow5
- mova tmp1, m7
- paddd m7, m4
- psrad m7, 6
- packusdw m7, m7
- movh [r0 + 5*8], m7
-
- ; 24-27
- paddd m4, m6
- mova m7, tmp2
- paddd m7, bottomRow6
- mova tmp2, m7
- paddd m7, m4
- psrad m7, 6
- packusdw m7, m7
- movh [r0 + 6*8], m7
-
- ; 28-31
- paddd m4, m6
- mova m7, tmp3
- paddd m7, bottomRow7
- mova tmp3, m7
- paddd m7, m4
- psrad m7, 6
- packusdw m7, m7
- movh [r0 + 7*8], m7
-
- add r0, r1
- inc r6d
- cmp r6d, 32
- jnz .loopH
+ pmulld m5, m4, [r5 + 64]
+ pmulld m6, m4, [r5 + 80]
+ paddd m5, m12
+ paddd m6, m13
+ psrad m5, 6
+ psrad m6, 6
+ packusdw m5, m6
+ movu [r0 + 32], m5
+ mova m5, m12
+ mova m6, m13
+ paddd m5, m16
+ paddd m6, m17
+ mova m12, m5
+ mova m13, m6
+
+ pmulld m5, m4, [r5 + 96]
+ pmulld m4, [r5 + 112]
+ paddd m5, m14
+ paddd m4, m15
+ psrad m5, 6
+ psrad m4, 6
+ packusdw m5, m4
+ movu [r0 + 48], m5
+ mova m4, m14
+ mova m5, m15
+ paddd m4, m18
+ paddd m5, m19
+ mova m14, m4
+ mova m15, m5
+
+ lea r0, [r0 + r1]
+ add r2, 2
+%endmacro
+ mov r4, 8
+.loop:
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ dec r4
+ jnz .loop
+ mov rsp, r6
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------
+; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,3,4
- cmp r4m, byte 34
- cmove r2, r3mp
+cglobal intra_pred_ang4_2, 3,5,4
+ lea r4, [r2 + 4]
+ add r2, 20
+ cmp r3m, byte 34
+ cmove r2, r4
+
add r1, r1
- movu m0, [r2 + 4]
+ movu m0, [r2]
movh [r0], m0
palignr m1, m0, 2
movh [r0 + r1], m1
@@ -956,11 +880,13 @@ cglobal intra_pred_ang4_2, 3,3,4
RET
INIT_XMM sse4
-cglobal intra_pred_ang4_3, 3,4,8
- cmp r4m, byte 33
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_3, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 33
+ mov r3, 18
+ cmove r3, r4
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
palignr m5, m0, 4 ; [x x 8 7 6 5 4 3]
@@ -970,6 +896,7 @@ cglobal intra_pred_ang4_3, 3,4,8
movhlps m0, m0 ; [x x x x 8 7 6 5]
punpcklwd m5, m1, m0 ; [8 7 7 6 6 5 5 4]
+ lea r3, [ang_table + 20 * 16]
mova m0, [r3 + 6 * 16] ; [26]
mova m1, [r3] ; [20]
mova m6, [r3 - 6 * 16] ; [14]
@@ -1013,11 +940,13 @@ ALIGN 16
movhps [r0 + r1], m4
RET
-cglobal intra_pred_ang4_4, 3,4,8
- cmp r4m, byte 32
- cmove r2, r3mp
- lea r3, [ang_table + 18 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_4, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 32
+ mov r3, 18
+ cmove r3, r4
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
@@ -1026,17 +955,20 @@ cglobal intra_pred_ang4_4, 3,4,8
palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
+ lea r3, [ang_table + 18 * 16]
mova m0, [r3 + 3 * 16] ; [21]
mova m1, [r3 - 8 * 16] ; [10]
mova m6, [r3 + 13 * 16] ; [31]
mova m7, [r3 + 2 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-cglobal intra_pred_ang4_5, 3,4,8
- cmp r4m, byte 31
- cmove r2, r3mp
- lea r3, [ang_table + 10 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_5, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 31
+ mov r3, 18
+ cmove r3, r4
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
@@ -1045,17 +977,20 @@ cglobal intra_pred_ang4_5, 3,4,8
palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
+ lea r3, [ang_table + 10 * 16]
mova m0, [r3 + 7 * 16] ; [17]
mova m1, [r3 - 8 * 16] ; [ 2]
mova m6, [r3 + 9 * 16] ; [19]
mova m7, [r3 - 6 * 16] ; [ 4]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-cglobal intra_pred_ang4_6, 3,4,8
- cmp r4m, byte 30
- cmove r2, r3mp
- lea r3, [ang_table + 19 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_6, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 30
+ mov r3, 18
+ cmove r3, r4
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
mova m3, m2
@@ -1063,17 +998,20 @@ cglobal intra_pred_ang4_6, 3,4,8
punpcklwd m4, m1, m6 ; [6 5 5 4 4 3 3 2]
mova m5, m4
+ lea r3, [ang_table + 19 * 16]
mova m0, [r3 - 6 * 16] ; [13]
mova m1, [r3 + 7 * 16] ; [26]
mova m6, [r3 - 12 * 16] ; [ 7]
mova m7, [r3 + 1 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-cglobal intra_pred_ang4_7, 3,4,8
- cmp r4m, byte 29
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_7, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 29
+ mov r3, 18
+ cmove r3, r4
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
mova m3, m2
@@ -1081,41 +1019,47 @@ cglobal intra_pred_ang4_7, 3,4,8
palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
punpcklwd m5, m1, m6 ; [6 5 5 4 4 3 3 2]
+ lea r3, [ang_table + 20 * 16]
mova m0, [r3 - 11 * 16] ; [ 9]
mova m1, [r3 - 2 * 16] ; [18]
mova m6, [r3 + 7 * 16] ; [27]
mova m7, [r3 - 16 * 16] ; [ 4]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-cglobal intra_pred_ang4_8, 3,4,8
- cmp r4m, byte 28
- cmove r2, r3mp
- lea r3, [ang_table + 13 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_8, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 28
+ mov r3, 18
+ cmove r3, r4
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
mova m3, m2
mova m4, m2
mova m5, m2
+ lea r3, [ang_table + 13 * 16]
mova m0, [r3 - 8 * 16] ; [ 5]
mova m1, [r3 - 3 * 16] ; [10]
mova m6, [r3 + 2 * 16] ; [15]
mova m7, [r3 + 7 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_9, 3,5,8
+ mov r4, 2
+ cmp r3m, byte 27
+ mov r3, 18
+ cmove r3, r4
-cglobal intra_pred_ang4_9, 3,4,8
- cmp r4m, byte 27
- cmove r2, r3mp
- lea r3, [ang_table + 4 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
mova m3, m2
mova m4, m2
mova m5, m2
+ lea r3, [ang_table + 4 * 16]
mova m0, [r3 - 2 * 16] ; [ 2]
mova m1, [r3 - 0 * 16] ; [ 4]
mova m6, [r3 + 2 * 16] ; [ 6]
@@ -1123,7 +1067,7 @@ cglobal intra_pred_ang4_9, 3,4,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
cglobal intra_pred_ang4_10, 3,3,4
- movh m0, [r2 + 2] ; [4 3 2 1]
+ movh m0, [r2 + 18] ; [4 3 2 1]
pshufb m2, m0, [pb_unpackwq2] ; [4 4 4 4 3 3 3 3]
pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
add r1, r1
@@ -1134,11 +1078,10 @@ cglobal intra_pred_ang4_10, 3,3,4
lea r1, [r1 * 3]
movh [r0 + r1], m3
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
; filter
- mov r2, r3mp
movu m1, [r2] ; [7 6 5 4 3 2 1 0]
pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
palignr m1, m1, 2 ; [4 3 2 1]
@@ -1148,13 +1091,12 @@ cglobal intra_pred_ang4_10, 3,3,4
pxor m1, m1
pmaxsw m0, m1
pminsw m0, [pw_1023]
-
.quit:
movh [r0], m0
RET
-cglobal intra_pred_ang4_26, 4,4,3
- movh m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_26, 3,4,3
+ movh m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
add r1, r1
; store
movh [r0], m0
@@ -1164,11 +1106,12 @@ cglobal intra_pred_ang4_26, 4,4,3
movh [r0 + r3], m0
; filter
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
- movu m1, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 16]
+ pinsrw m1, [r2], 0 ; [7 6 5 4 3 2 1 0]
pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
palignr m1, m1, 2 ; [4 3 2 1]
psubw m1, m2
@@ -1182,55 +1125,63 @@ cglobal intra_pred_ang4_26, 4,4,3
pextrw [r0 + r1], m0, 1
pextrw [r0 + r1 * 2], m0, 2
pextrw [r0 + r3], m0, 3
-
.quit:
RET
-cglobal intra_pred_ang4_11, 3,4,8
- cmp r4m, byte 25
- cmove r2, r3mp
- lea r3, [ang_table + 24 * 16]
- movu m2, [r2] ; [x x x 4 3 2 1 0]
+cglobal intra_pred_ang4_11, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 25
+ mov r3, 16
+ cmove r3, r4
+
+ movu m2, [r2 + r3] ; [x x x 4 3 2 1 0]
+ pinsrw m2, [r2], 0
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
mova m3, m2
mova m4, m2
mova m5, m2
+ lea r3, [ang_table + 24 * 16]
mova m0, [r3 + 6 * 16] ; [24]
mova m1, [r3 + 4 * 16] ; [26]
mova m6, [r3 + 2 * 16] ; [28]
mova m7, [r3 + 0 * 16] ; [30]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_12, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 24
+ mov r3, 16
+ cmove r3, r4
-cglobal intra_pred_ang4_12, 3,4,8
- cmp r4m, byte 24
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movu m2, [r2] ; [x x x 4 3 2 1 0]
+ movu m2, [r2 + r3] ; [x x x 4 3 2 1 0]
+ pinsrw m2, [r2], 0
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
mova m3, m2
mova m4, m2
mova m5, m2
+ lea r3, [ang_table + 20 * 16]
mova m0, [r3 + 7 * 16] ; [27]
mova m1, [r3 + 2 * 16] ; [22]
mova m6, [r3 - 3 * 16] ; [17]
mova m7, [r3 - 8 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_13, 4,4,8
- cmp r4m, byte 23
- jnz .load
- xchg r2, r3
-.load:
- movu m5, [r2 - 2] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_13, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 23
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+ movu m5, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
+ pinsrw m5, [r2], 1
palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
palignr m0, m5, 4 ; [x x x x 4 3 2 1]
- pinsrw m5, [r3 + 8], 0
+ pinsrw m5, [r2 + r3 + 8], 0
punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
mova m3, m2
@@ -1243,15 +1194,18 @@ cglobal intra_pred_ang4_13, 4,4,8
mova m7, [r3 + 7 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-cglobal intra_pred_ang4_14, 4,4,8
- cmp r4m, byte 22
- jnz .load
- xchg r2, r3
-.load:
- movu m5, [r2 - 2] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_14, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 22
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+ movu m5, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
+ pinsrw m5, [r2], 1
palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
palignr m0, m5, 4 ; [x x x x 4 3 2 1]
- pinsrw m5, [r3 + 4], 0
+ pinsrw m5, [r2 + r3 + 4], 0
punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
mova m3, m2
@@ -1264,18 +1218,20 @@ cglobal intra_pred_ang4_14, 4,4,8
mova m7, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_15, 4,4,8
- cmp r4m, byte 21
- jnz .load
- xchg r2, r3
-.load:
- movu m3, [r2 - 2] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_15, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 21
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+ movu m3, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
+ pinsrw m3, [r2], 1
palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
palignr m0, m3, 4 ; [x x x x 4 3 2 1]
- pinsrw m3, [r3 + 4], 0
+ pinsrw m3, [r2 + r3 + 4], 0
pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
- pinsrw m5, [r3 + 8], 0
+ pinsrw m5, [r2 + r3 + 8], 0
punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
@@ -1288,18 +1244,20 @@ cglobal intra_pred_ang4_15, 4,4,8
mova m7, [r3 + 5 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_16, 4,4,8
- cmp r4m, byte 20
- jnz .load
- xchg r2, r3
-.load:
- movu m3, [r2 - 2] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_16, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 20
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+ movu m3, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
+ pinsrw m3, [r2], 1
palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
palignr m0, m3, 4 ; [x x x x 4 3 2 1]
- pinsrw m3, [r3 + 4], 0
+ pinsrw m3, [r2 + r3 + 4], 0
pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
- pinsrw m5, [r3 + 6], 0
+ pinsrw m5, [r2 + r3 + 6], 0
punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
@@ -1312,24 +1270,27 @@ cglobal intra_pred_ang4_16, 4,4,8
mova m7, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-cglobal intra_pred_ang4_17, 4,4,8
- cmp r4m, byte 19
- jnz .load
- xchg r2, r3
-.load:
- movu m6, [r2 - 2] ; [- - 4 3 2 1 0 x]
+cglobal intra_pred_ang4_17, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 19
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+ movu m6, [r2 + r4 - 2] ; [- - 4 3 2 1 0 x]
+ pinsrw m6, [r2], 1
palignr m2, m6, 2 ; [- - - 4 3 2 1 0]
palignr m1, m6, 4 ; [- - - - 4 3 2 1]
mova m4, m2
punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
- pinsrw m6, [r3 + 2], 0
+ pinsrw m6, [r2 + r3 + 2], 0
punpcklwd m3, m6, m4 ; [3 2 2 1 1 0 0 x]
pslldq m4, m6, 2 ; [- 4 3 2 1 0 x y]
- pinsrw m4, [r3 + 4], 0
+ pinsrw m4, [r2 + r3 + 4], 0
pslldq m5, m4, 2 ; [4 3 2 1 0 x y z]
- pinsrw m5, [r3 + 8], 0
+ pinsrw m5, [r2 + r3 + 8], 0
punpcklwd m5, m4 ; [1 0 0 x x y y z]
punpcklwd m4, m6 ; [2 1 1 0 0 x x y]
@@ -1340,11 +1301,11 @@ cglobal intra_pred_ang4_17, 4,4,8
mova m7, [r3 + 10 * 16] ; [24]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_18, 4,4,1
- movh m0, [r2]
+cglobal intra_pred_ang4_18, 3,3,1
+ movh m0, [r2 + 16]
+ pinsrw m0, [r2], 0
pshufb m0, [pw_swap]
- movhps m0, [r3 + 2]
+ movhps m0, [r2 + 2]
add r1, r1
lea r2, [r1 * 3]
movh [r0 + r2], m0
@@ -1356,13 +1317,15 @@ cglobal intra_pred_ang4_18, 4,4,1
movh [r0], m0
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------
+; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
-cglobal intra_pred_ang8_2, 3,4,3
- cmp r4m, byte 34
- cmove r2, r3mp
+cglobal intra_pred_ang8_2, 3,5,3
+ lea r4, [r2]
+ add r2, 32
+ cmp r3m, byte 34
+ cmove r2, r4
add r1, r1
lea r3, [r1 * 3]
movu m0, [r2 + 4]
@@ -1387,6 +1350,7 @@ cglobal intra_pred_ang8_2, 3,4,3
INIT_XMM sse4
cglobal intra_pred_ang8_3, 3,5,8
+ add r2, 32
lea r3, [ang_table + 14 * 16]
add r1, r1
@@ -1520,10 +1484,10 @@ cglobal intra_pred_ang8_3, 3,5,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m3
movhps [r0 + r4 + 8], m3
-
RET
cglobal intra_pred_ang8_4, 3,6,8
+ add r2, 32
lea r3, [ang_table + 19 * 16]
add r1, r1
@@ -1657,10 +1621,10 @@ cglobal intra_pred_ang8_4, 3,6,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m3
movhps [r0 + r4 + 8], m3
-
RET
cglobal intra_pred_ang8_5, 3,5,8
+ add r2, 32
lea r3, [ang_table + 13 * 16]
add r1, r1
@@ -1793,10 +1757,10 @@ cglobal intra_pred_ang8_5, 3,5,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m3
movhps [r0 + r4 + 8], m3
-
RET
cglobal intra_pred_ang8_6, 3,5,8
+ add r2, 32
lea r3, [ang_table + 14 * 16]
add r1, r1
@@ -1927,10 +1891,10 @@ cglobal intra_pred_ang8_6, 3,5,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m3
movhps [r0 + r4 + 8], m3
-
RET
cglobal intra_pred_ang8_7, 3,5,8
+ add r2, 32
lea r3, [ang_table + 18 * 16]
add r1, r1
@@ -2061,10 +2025,10 @@ cglobal intra_pred_ang8_7, 3,5,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m3
movhps [r0 + r4 + 8], m3
-
RET
cglobal intra_pred_ang8_8, 3,6,7
+ add r2, 32
lea r3, [ang_table + 17 * 16]
add r1, r1
@@ -2196,10 +2160,10 @@ cglobal intra_pred_ang8_8, 3,6,7
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m3
movhps [r0 + r4 + 8], m3
-
RET
cglobal intra_pred_ang8_9, 3,5,7
+ add r2, 32
lea r3, [ang_table + 9 * 16]
add r1, r1
@@ -2327,14 +2291,13 @@ cglobal intra_pred_ang8_9, 3,5,7
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_10, 4,5,3
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang8_10, 3,6,3
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
add r1, r1
- lea r4, [r1 * 3]
+ lea r3, [r1 * 3]
psrldq m1, 2
pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
@@ -2344,37 +2307,36 @@ cglobal intra_pred_ang8_10, 4,5,3
movu [r0 + r1 * 2], m2
psrldq m1, 2
pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
- movu [r0 + r4], m2
+ movu [r0 + r3], m2
- lea r2, [r0 + r1 *4]
+ lea r5, [r0 + r1 *4]
psrldq m1, 2
pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
- movu [r2], m2
+ movu [r5], m2
psrldq m1, 2
pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
- movu [r2 + r1], m2
+ movu [r5 + r1], m2
psrldq m1, 2
pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
- movu [r2 + r1 * 2], m2
+ movu [r5 + r1 * 2], m2
psrldq m1, 2
pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
- movu [r2 + r4], m2
+ movu [r5 + r3], m2
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
; filter
- movh m1, [r3] ; [3 2 1 0]
+ movh m1, [r2] ; [3 2 1 0]
pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
psubw m1, m2
psraw m1, 1
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
pminsw m0, [pw_1023]
-
.quit:
movu [r0], m0
RET
@@ -2383,8 +2345,9 @@ cglobal intra_pred_ang8_11, 3,5,7
lea r3, [ang_table + 23 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -2507,15 +2470,15 @@ cglobal intra_pred_ang8_11, 3,5,7
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_12, 4,6,7
+cglobal intra_pred_ang8_12, 3,6,7
lea r5, [ang_table + 16 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -2575,11 +2538,11 @@ cglobal intra_pred_ang8_12, 4,6,7
movhps [r0 + r1], m5
movh [r0 + r1 * 2], m4
movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
+ lea r3, [r0 + r1 * 4]
+ movh [r3], m6
+ movhps [r3 + r1], m6
+ movh [r3 + r1 * 2], m1
+ movhps [r3 + r4], m1
mova m4, m3
pmaddwd m4, [r5 - 9 * 16] ; [7]
@@ -2602,7 +2565,7 @@ cglobal intra_pred_ang8_12, 4,6,7
packusdw m2, m1
palignr m0, m3, 12
- movu m1, [r3]
+ movu m1, [r2]
pshufb m1, [pw_ang8_12]
palignr m3, m1, 12
@@ -2643,15 +2606,15 @@ cglobal intra_pred_ang8_12, 4,6,7
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_13, 4,6,8
+cglobal intra_pred_ang8_13, 3,6,8
lea r5, [ang_table + 14 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -2687,7 +2650,7 @@ cglobal intra_pred_ang8_13, 4,6,8
packusdw m6, m1
palignr m0, m3, 12
- movu m1, [r3]
+ movu m1, [r2]
pshufb m1, [pw_ang8_13]
palignr m3, m1, 12
@@ -2783,15 +2746,15 @@ cglobal intra_pred_ang8_13, 4,6,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_14, 4,6,8
+cglobal intra_pred_ang8_14, 3,6,8
lea r5, [ang_table + 18 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -2817,7 +2780,7 @@ cglobal intra_pred_ang8_14, 4,6,8
packusdw m2, m1
palignr m0, m3, 12
- movu m1, [r3]
+ movu m1, [r2]
pshufb m1, [pw_ang8_14]
palignr m3, m1, 12
@@ -2927,15 +2890,15 @@ cglobal intra_pred_ang8_14, 4,6,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_15, 4,6,8
+cglobal intra_pred_ang8_15, 3,6,8
lea r5, [ang_table + 20 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -2951,7 +2914,7 @@ cglobal intra_pred_ang8_15, 4,6,8
packusdw m4, m2
palignr m0, m3, 12
- movu m1, [r3]
+ movu m1, [r2]
pshufb m1, [pw_ang8_15]
palignr m3, m1, 12
@@ -3004,11 +2967,11 @@ cglobal intra_pred_ang8_15, 4,6,8
movhps [r0 + r1], m5
movh [r0 + r1 * 2], m4
movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
+ lea r3, [r0 + r1 * 4]
+ movh [r3], m6
+ movhps [r3 + r1], m6
+ movh [r3 + r1 * 2], m7
+ movhps [r3 + r4], m7
mova m4, m3
pmaddwd m4, [r5 - 9 * 16] ; [11]
@@ -3047,7 +3010,7 @@ cglobal intra_pred_ang8_15, 4,6,8
pslldq m1, 2
palignr m0, m3, 12
palignr m3, m1, 12
- pinsrw m3, [r3 + 16], 0
+ pinsrw m3, [r2 + 16], 0
pmaddwd m3, [r5 + 4 * 16] ; [24]
paddd m3, [pd_16]
@@ -3076,15 +3039,15 @@ cglobal intra_pred_ang8_15, 4,6,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_16, 4,6,8
+cglobal intra_pred_ang8_16, 3,6,8
lea r5, [ang_table + 13 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -3100,7 +3063,7 @@ cglobal intra_pred_ang8_16, 4,6,8
packusdw m4, m2
palignr m0, m3, 12
- movu m1, [r3]
+ movu m1, [r2]
pshufb m1, [pw_ang8_16]
palignr m3, m1, 12
@@ -3153,11 +3116,11 @@ cglobal intra_pred_ang8_16, 4,6,8
movhps [r0 + r1], m5
movh [r0 + r1 * 2], m4
movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
+ lea r3, [r0 + r1 * 4]
+ movh [r3], m6
+ movhps [r3 + r1], m6
+ movh [r3 + r1 * 2], m7
+ movhps [r3 + r4], m7
pslldq m1, 2
palignr m0, m3, 12
@@ -3200,7 +3163,7 @@ cglobal intra_pred_ang8_16, 4,6,8
pslldq m1, 2
palignr m0, m3, 12
palignr m3, m1, 12
- pinsrw m3, [r3 + 16], 0
+ pinsrw m3, [r2 + 16], 0
pmaddwd m3, [r5 + 11 * 16] ; [24]
paddd m3, [pd_16]
@@ -3229,15 +3192,15 @@ cglobal intra_pred_ang8_16, 4,6,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_17, 4,6,8
+cglobal intra_pred_ang8_17, 3,6,8
lea r5, [ang_table + 17 * 16]
add r1, r1
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pinsrw m0, [r2], 0
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
@@ -3253,7 +3216,7 @@ cglobal intra_pred_ang8_17, 4,6,8
packusdw m4, m2
palignr m0, m3, 12
- movu m1, [r3]
+ movu m1, [r2]
pshufb m1, [pw_ang8_17]
palignr m3, m1, 12
@@ -3310,11 +3273,11 @@ cglobal intra_pred_ang8_17, 4,6,8
movhps [r0 + r1], m5
movh [r0 + r1 * 2], m4
movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
+ lea r3, [r0 + r1 * 4]
+ movh [r3], m6
+ movhps [r3 + r1], m6
+ movh [r3 + r1 * 2], m7
+ movhps [r3 + r4], m7
pslldq m1, 2
palignr m0, m3, 12
@@ -3385,14 +3348,13 @@ cglobal intra_pred_ang8_17, 4,6,8
movhps [r0 + r1 + 8], m6
movh [r0 + r1 * 2 + 8], m5
movhps [r0 + r4 + 8], m5
-
RET
-cglobal intra_pred_ang8_18, 4,5,3
+cglobal intra_pred_ang8_18, 3,4,3
add r1, r1
- lea r4, [r1 * 3]
- movu m1, [r3]
- movu m0, [r2 + 2]
+ lea r3, [r1 * 3]
+ movu m1, [r2]
+ movu m0, [r2 + 34]
pshufb m0, [pw_swap16]
movu [r0], m1
palignr m2, m1, m0, 14
@@ -3400,7 +3362,7 @@ cglobal intra_pred_ang8_18, 4,5,3
palignr m2, m1, m0, 12
movu [r0 + r1 * 2], m2
palignr m2, m1, m0, 10
- movu [r0 + r4], m2
+ movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 8
movu [r0], m2
@@ -3409,40 +3371,41 @@ cglobal intra_pred_ang8_18, 4,5,3
palignr m2, m1, m0, 4
movu [r0 + r1 * 2], m2
palignr m1, m0, 2
- movu [r0 + r4], m1
+ movu [r0 + r3], m1
RET
-cglobal intra_pred_ang8_19, 4,6,8
- lea r5, [ang_table + 17 * 16]
+cglobal intra_pred_ang8_19, 3,5,8
+ lea r3, [ang_table + 17 * 16]
add r1, r1
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
mova m4, m3
- pmaddwd m4, [r5 - 11 * 16] ; [6]
+ pmaddwd m4, [r3 - 11 * 16] ; [6]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 - 11 * 16]
+ pmaddwd m2, [r3 - 11 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
palignr m0, m3, 12
- movu m1, [r2]
+ movu m1, [r2 + 32]
+ pinsrw m1, [r2], 0
pshufb m1, [pw_ang8_17]
palignr m3, m1, 12
mova m2, m3
- pmaddwd m2, [r5 - 5 * 16] ; [12]
+ pmaddwd m2, [r3 - 5 * 16] ; [12]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 - 5 * 16]
+ pmaddwd m5, [r3 - 5 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
@@ -3452,11 +3415,11 @@ cglobal intra_pred_ang8_19, 4,6,8
palignr m3, m1, 12
mova m6, m3
- pmaddwd m6, [r5 + 1 * 16] ; [18]
+ pmaddwd m6, [r3 + 1 * 16] ; [18]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 + 1 * 16]
+ pmaddwd m5, [r3 + 1 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3466,11 +3429,11 @@ cglobal intra_pred_ang8_19, 4,6,8
palignr m3, m1, 12
mova m5, m3
- pmaddwd m5, [r5 + 7 * 16] ; [24]
+ pmaddwd m5, [r3 + 7 * 16] ; [24]
paddd m5, [pd_16]
psrld m5, 5
mova m7, m0
- pmaddwd m7, [r5 + 7 * 16]
+ pmaddwd m7, [r3 + 7 * 16]
paddd m7, [pd_16]
psrld m7, 5
packusdw m5, m7
@@ -3486,21 +3449,21 @@ cglobal intra_pred_ang8_19, 4,6,8
palignr m3, m1, 12
mova m4, m3
- pmaddwd m4, [r5 + 13 * 16] ; [30]
+ pmaddwd m4, [r3 + 13 * 16] ; [30]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 13 * 16]
+ pmaddwd m2, [r3 + 13 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5 - 13 * 16] ; [4]
+ pmaddwd m2, [r3 - 13 * 16] ; [4]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
+ pmaddwd m5, [r3 - 13 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
@@ -3510,11 +3473,11 @@ cglobal intra_pred_ang8_19, 4,6,8
palignr m3, m1, 12
mova m6, m3
- pmaddwd m6, [r5 - 7 * 16] ; [10]
+ pmaddwd m6, [r3 - 7 * 16] ; [10]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 - 7 * 16]
+ pmaddwd m5, [r3 - 7 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3523,10 +3486,10 @@ cglobal intra_pred_ang8_19, 4,6,8
palignr m0, m3, 12
palignr m3, m1, 12
- pmaddwd m3, [r5 - 1 * 16] ; [16]
+ pmaddwd m3, [r3 - 1 * 16] ; [16]
paddd m3, [pd_16]
psrld m3, 5
- pmaddwd m0, [r5 - 1 * 16]
+ pmaddwd m0, [r3 - 1 * 16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m3, m0
@@ -3536,50 +3499,50 @@ cglobal intra_pred_ang8_19, 4,6,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
-cglobal intra_pred_ang8_20, 4,6,8
- lea r5, [ang_table + 13 * 16]
+cglobal intra_pred_ang8_20, 3,5,8
+ lea r3, [ang_table + 13 * 16]
add r1, r1
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
mova m4, m3
- pmaddwd m4, [r5 - 2 * 16] ; [11]
+ pmaddwd m4, [r3 - 2 * 16] ; [11]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 - 2 * 16]
+ pmaddwd m2, [r3 - 2 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
palignr m0, m3, 12
- movu m1, [r2]
+ movu m1, [r2 + 32]
+ pinsrw m1, [r2], 0
pshufb m1, [pw_ang8_16]
palignr m3, m1, 12
mova m2, m3
- pmaddwd m2, [r5 + 9 * 16] ; [22]
+ pmaddwd m2, [r3 + 9 * 16] ; [22]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 + 9 * 16]
+ pmaddwd m5, [r3 + 9 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
mova m6, m3
- pmaddwd m6, [r5 - 12 * 16] ; [1]
+ pmaddwd m6, [r3 - 12 * 16] ; [1]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 - 12 * 16]
+ pmaddwd m5, [r3 - 12 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3589,11 +3552,11 @@ cglobal intra_pred_ang8_20, 4,6,8
palignr m3, m1, 12
mova m5, m3
- pmaddwd m5, [r5 - 1 * 16] ; [12]
+ pmaddwd m5, [r3 - 1 * 16] ; [12]
paddd m5, [pd_16]
psrld m5, 5
mova m7, m0
- pmaddwd m7, [r5 - 1 * 16]
+ pmaddwd m7, [r3 - 1 * 16]
paddd m7, [pd_16]
psrld m7, 5
packusdw m5, m7
@@ -3609,21 +3572,21 @@ cglobal intra_pred_ang8_20, 4,6,8
palignr m3, m1, 12
mova m4, m3
- pmaddwd m4, [r5 + 10 * 16] ; [23]
+ pmaddwd m4, [r3 + 10 * 16] ; [23]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 10 * 16]
+ pmaddwd m2, [r3 + 10 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5 - 11 * 16] ; [2]
+ pmaddwd m2, [r3 - 11 * 16] ; [2]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 - 11 * 16]
+ pmaddwd m5, [r3 - 11 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
@@ -3633,11 +3596,11 @@ cglobal intra_pred_ang8_20, 4,6,8
palignr m3, m1, 12
mova m6, m3
- pmaddwd m6, [r5] ; [13]
+ pmaddwd m6, [r3] ; [13]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5]
+ pmaddwd m5, [r3]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3645,12 +3608,12 @@ cglobal intra_pred_ang8_20, 4,6,8
pslldq m1, 2
palignr m0, m3, 12
palignr m3, m1, 12
- pinsrw m3, [r2 + 16], 0
+ pinsrw m3, [r2 + 16 + 32], 0
- pmaddwd m3, [r5 + 11 * 16] ; [24]
+ pmaddwd m3, [r3 + 11 * 16] ; [24]
paddd m3, [pd_16]
psrld m3, 5
- pmaddwd m0, [r5 + 11 * 16]
+ pmaddwd m0, [r3 + 11 * 16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m3, m0
@@ -3660,50 +3623,50 @@ cglobal intra_pred_ang8_20, 4,6,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
-cglobal intra_pred_ang8_21, 4,6,8
- lea r5, [ang_table + 20 * 16]
+cglobal intra_pred_ang8_21, 3,5,8
+ lea r3, [ang_table + 20 * 16]
add r1, r1
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
mova m4, m3
- pmaddwd m4, [r5 - 5 * 16] ; [15]
+ pmaddwd m4, [r3 - 5 * 16] ; [15]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 - 5 * 16]
+ pmaddwd m2, [r3 - 5 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
palignr m0, m3, 12
- movu m1, [r2]
+ movu m1, [r2 + 32]
+ pinsrw m1, [r2], 0
pshufb m1, [pw_ang8_15]
palignr m3, m1, 12
mova m2, m3
- pmaddwd m2, [r5 + 10 * 16] ; [30]
+ pmaddwd m2, [r3 + 10 * 16] ; [30]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 + 10 * 16]
+ pmaddwd m5, [r3 + 10 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
mova m6, m3
- pmaddwd m6, [r5 - 7 * 16] ; [13]
+ pmaddwd m6, [r3 - 7 * 16] ; [13]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 - 7 * 16]
+ pmaddwd m5, [r3 - 7 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3713,11 +3676,11 @@ cglobal intra_pred_ang8_21, 4,6,8
palignr m3, m1, 12
mova m5, m3
- pmaddwd m5, [r5 + 8 * 16] ; [28]
+ pmaddwd m5, [r3 + 8 * 16] ; [28]
paddd m5, [pd_16]
psrld m5, 5
mova m7, m0
- pmaddwd m7, [r5 + 8 * 16]
+ pmaddwd m7, [r3 + 8 * 16]
paddd m7, [pd_16]
psrld m7, 5
packusdw m5, m7
@@ -3729,11 +3692,11 @@ cglobal intra_pred_ang8_21, 4,6,8
movu [r0 + r4], m5
mova m4, m3
- pmaddwd m4, [r5 - 9 * 16] ; [11]
+ pmaddwd m4, [r3 - 9 * 16] ; [11]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 - 9 * 16]
+ pmaddwd m2, [r3 - 9 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
@@ -3743,21 +3706,21 @@ cglobal intra_pred_ang8_21, 4,6,8
palignr m3, m1, 12
mova m2, m3
- pmaddwd m2, [r5 + 6 * 16] ; [26]
+ pmaddwd m2, [r3 + 6 * 16] ; [26]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 + 6 * 16]
+ pmaddwd m5, [r3 + 6 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
mova m6, m3
- pmaddwd m6, [r5 - 11 * 16] ; [9]
+ pmaddwd m6, [r3 - 11 * 16] ; [9]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 - 11 * 16]
+ pmaddwd m5, [r3 - 11 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3765,12 +3728,12 @@ cglobal intra_pred_ang8_21, 4,6,8
pslldq m1, 2
palignr m0, m3, 12
palignr m3, m1, 12
- pinsrw m3, [r2 + 16], 0
+ pinsrw m3, [r2 + 16 + 32], 0
- pmaddwd m3, [r5 + 4 * 16] ; [24]
+ pmaddwd m3, [r3 + 4 * 16] ; [24]
paddd m3, [pd_16]
psrld m3, 5
- pmaddwd m0, [r5 + 4 * 16]
+ pmaddwd m0, [r3 + 4 * 16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m3, m0
@@ -3780,60 +3743,60 @@ cglobal intra_pred_ang8_21, 4,6,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
-cglobal intra_pred_ang8_22, 4,6,8
- lea r5, [ang_table + 18 * 16]
+cglobal intra_pred_ang8_22, 3,5,8
+ lea r3, [ang_table + 18 * 16]
add r1, r1
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
mova m4, m3
- pmaddwd m4, [r5 + 1 * 16] ; [19]
+ pmaddwd m4, [r3 + 1 * 16] ; [19]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 1 * 16]
+ pmaddwd m2, [r3 + 1 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5 - 12 * 16] ; [6]
+ pmaddwd m2, [r3 - 12 * 16] ; [6]
paddd m2, [pd_16]
psrld m2, 5
mova m1, m0
- pmaddwd m1, [r5 - 12 * 16]
+ pmaddwd m1, [r3 - 12 * 16]
paddd m1, [pd_16]
psrld m1, 5
packusdw m2, m1
palignr m0, m3, 12
- movu m1, [r2]
+ movu m1, [r2 + 32]
+ pinsrw m1, [r2], 0
pshufb m1, [pw_ang8_14]
palignr m3, m1, 12
mova m6, m3
- pmaddwd m6, [r5 + 7 * 16] ; [25]
+ pmaddwd m6, [r3 + 7 * 16] ; [25]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 + 7 * 16]
+ pmaddwd m5, [r3 + 7 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
mova m5, m3
- pmaddwd m5, [r5 - 6 * 16] ; [12]
+ pmaddwd m5, [r3 - 6 * 16] ; [12]
paddd m5, [pd_16]
psrld m5, 5
mova m7, m0
- pmaddwd m7, [r5 - 6 * 16]
+ pmaddwd m7, [r3 - 6 * 16]
paddd m7, [pd_16]
psrld m7, 5
packusdw m5, m7
@@ -3849,31 +3812,31 @@ cglobal intra_pred_ang8_22, 4,6,8
palignr m3, m1, 12
mova m4, m3
- pmaddwd m4, [r5 + 13 * 16] ; [31]
+ pmaddwd m4, [r3 + 13 * 16] ; [31]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 13 * 16]
+ pmaddwd m2, [r3 + 13 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5] ; [18]
+ pmaddwd m2, [r3] ; [18]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5]
+ pmaddwd m5, [r3]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
mova m6, m3
- pmaddwd m6, [r5 - 13 * 16] ; [5]
+ pmaddwd m6, [r3 - 13 * 16] ; [5]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
+ pmaddwd m5, [r3 - 13 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3882,10 +3845,10 @@ cglobal intra_pred_ang8_22, 4,6,8
palignr m0, m3, 12
palignr m3, m1, 12
- pmaddwd m3, [r5 + 6 * 16] ; [24]
+ pmaddwd m3, [r3 + 6 * 16] ; [24]
paddd m3, [pd_16]
psrld m3, 5
- pmaddwd m0, [r5 + 6 * 16]
+ pmaddwd m0, [r3 + 6 * 16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m3, m0
@@ -3895,60 +3858,60 @@ cglobal intra_pred_ang8_22, 4,6,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
-cglobal intra_pred_ang8_23, 4,6,8
- lea r5, [ang_table + 14 * 16]
+cglobal intra_pred_ang8_23, 3,5,8
+ lea r3, [ang_table + 14 * 16]
add r1, r1
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
mova m4, m3
- pmaddwd m4, [r5 + 9 * 16] ; [23]
+ pmaddwd m4, [r3 + 9 * 16] ; [23]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 9 * 16]
+ pmaddwd m2, [r3 + 9 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5] ; [14]
+ pmaddwd m2, [r3] ; [14]
paddd m2, [pd_16]
psrld m2, 5
mova m1, m0
- pmaddwd m1, [r5]
+ pmaddwd m1, [r3]
paddd m1, [pd_16]
psrld m1, 5
packusdw m2, m1
mova m6, m3
- pmaddwd m6, [r5 - 9 * 16] ; [5]
+ pmaddwd m6, [r3 - 9 * 16] ; [5]
paddd m6, [pd_16]
psrld m6, 5
mova m1, m0
- pmaddwd m1, [r5 - 9 * 16]
+ pmaddwd m1, [r3 - 9 * 16]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
palignr m0, m3, 12
- movu m1, [r2]
+ movu m1, [r2 + 32]
+ pinsrw m1, [r2], 0
pshufb m1, [pw_ang8_13]
palignr m3, m1, 12
mova m5, m3
- pmaddwd m5, [r5 + 14 * 16] ; [28]
+ pmaddwd m5, [r3 + 14 * 16] ; [28]
paddd m5, [pd_16]
psrld m5, 5
mova m7, m0
- pmaddwd m7, [r5 + 14 * 16]
+ pmaddwd m7, [r3 + 14 * 16]
paddd m7, [pd_16]
psrld m7, 5
packusdw m5, m7
@@ -3960,31 +3923,31 @@ cglobal intra_pred_ang8_23, 4,6,8
movu [r0 + r4], m5
mova m4, m3
- pmaddwd m4, [r5 + 5 * 16] ; [19]
+ pmaddwd m4, [r3 + 5 * 16] ; [19]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 5 * 16]
+ pmaddwd m2, [r3 + 5 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5 - 4 * 16] ; [10]
+ pmaddwd m2, [r3 - 4 * 16] ; [10]
paddd m2, [pd_16]
psrld m2, 5
mova m5, m0
- pmaddwd m5, [r5 - 4 * 16]
+ pmaddwd m5, [r3 - 4 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m2, m5
mova m6, m3
- pmaddwd m6, [r5 - 13 * 16] ; [1]
+ pmaddwd m6, [r3 - 13 * 16] ; [1]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
+ pmaddwd m5, [r3 - 13 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
@@ -3993,10 +3956,10 @@ cglobal intra_pred_ang8_23, 4,6,8
palignr m0, m3, 12
palignr m3, m1, 12
- pmaddwd m3, [r5 + 10 * 16] ; [24]
+ pmaddwd m3, [r3 + 10 * 16] ; [24]
paddd m3, [pd_16]
psrld m3, 5
- pmaddwd m0, [r5 + 10 * 16]
+ pmaddwd m0, [r3 + 10 * 16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m3, m0
@@ -4006,55 +3969,54 @@ cglobal intra_pred_ang8_23, 4,6,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
-cglobal intra_pred_ang8_24, 4,6,7
- lea r5, [ang_table + 16 * 16]
+cglobal intra_pred_ang8_24, 3,5,7
+ lea r3, [ang_table + 16 * 16]
add r1, r1
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
mova m4, m3
- pmaddwd m4, [r5 + 11 * 16] ; [27]
+ pmaddwd m4, [r3 + 11 * 16] ; [27]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 + 11 * 16]
+ pmaddwd m2, [r3 + 11 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5 + 6 * 16] ; [22]
+ pmaddwd m2, [r3 + 6 * 16] ; [22]
paddd m2, [pd_16]
psrld m2, 5
mova m1, m0
- pmaddwd m1, [r5 + 6 * 16]
+ pmaddwd m1, [r3 + 6 * 16]
paddd m1, [pd_16]
psrld m1, 5
packusdw m2, m1
mova m6, m3
- pmaddwd m6, [r5 + 1 * 16] ; [17]
+ pmaddwd m6, [r3 + 1 * 16] ; [17]
paddd m6, [pd_16]
psrld m6, 5
mova m1, m0
- pmaddwd m1, [r5 + 1 * 16]
+ pmaddwd m1, [r3 + 1 * 16]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
mova m5, m3
- pmaddwd m5, [r5 - 4 * 16] ; [12]
+ pmaddwd m5, [r3 - 4 * 16] ; [12]
paddd m5, [pd_16]
psrld m5, 5
mova m1, m0
- pmaddwd m1, [r5 - 4 * 16]
+ pmaddwd m1, [r3 - 4 * 16]
paddd m1, [pd_16]
psrld m1, 5
packusdw m5, m1
@@ -4066,44 +4028,45 @@ cglobal intra_pred_ang8_24, 4,6,7
movu [r0 + r4], m5
mova m4, m3
- pmaddwd m4, [r5 - 9 * 16] ; [7]
+ pmaddwd m4, [r3 - 9 * 16] ; [7]
paddd m4, [pd_16]
psrld m4, 5
mova m2, m0
- pmaddwd m2, [r5 - 9 * 16]
+ pmaddwd m2, [r3 - 9 * 16]
paddd m2, [pd_16]
psrld m2, 5
packusdw m4, m2
mova m2, m3
- pmaddwd m2, [r5 - 14 * 16] ; [2]
+ pmaddwd m2, [r3 - 14 * 16] ; [2]
paddd m2, [pd_16]
psrld m2, 5
mova m1, m0
- pmaddwd m1, [r5 - 14 * 16]
+ pmaddwd m1, [r3 - 14 * 16]
paddd m1, [pd_16]
psrld m1, 5
packusdw m2, m1
palignr m0, m3, 12
- movu m1, [r2]
+ movu m1, [r2 + 32]
+ pinsrw m1, [r2], 0
pshufb m1, [pw_ang8_12]
palignr m3, m1, 12
mova m6, m3
- pmaddwd m6, [r5 + 13 * 16] ; [29]
+ pmaddwd m6, [r3 + 13 * 16] ; [29]
paddd m6, [pd_16]
psrld m6, 5
mova m5, m0
- pmaddwd m5, [r5 + 13 * 16]
+ pmaddwd m5, [r3 + 13 * 16]
paddd m5, [pd_16]
psrld m5, 5
packusdw m6, m5
- pmaddwd m3, [r5 + 8 * 16] ; [24]
+ pmaddwd m3, [r3 + 8 * 16] ; [24]
paddd m3, [pd_16]
psrld m3, 5
- pmaddwd m0, [r5 + 8 * 16]
+ pmaddwd m0, [r3 + 8 * 16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m3, m0
@@ -4113,11 +4076,9 @@ cglobal intra_pred_ang8_24, 4,6,7
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
cglobal intra_pred_ang8_25, 3,5,7
- mov r2, r3mp
lea r3, [ang_table + 23 * 16]
add r1, r1
@@ -4216,34 +4177,32 @@ cglobal intra_pred_ang8_25, 3,5,7
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
-cglobal intra_pred_ang8_26, 4,5,3
- movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang8_26, 3,6,3
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
add r1, r1
- lea r4, [r1 * 3]
+ lea r5, [r1 * 3]
movu [r0], m0
movu [r0 + r1], m0
movu [r0 + r1 * 2], m0
- movu [r0 + r4], m0
+ movu [r0 + r5], m0
lea r3, [r0 + r1 *4]
movu [r3], m0
movu [r3 + r1], m0
movu [r3 + r1 * 2], m0
- movu [r3 + r4], m0
+ movu [r3 + r5], m0
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
; filter
-
pshufb m0, [pw_unpackwdq]
- movh m1, [r2] ; [3 2 1 0]
+ pinsrw m1, [r2], 0 ; [3 2 1 0]
pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 2 + 32] ; [8 7 6 5 4 3 2 1]
psubw m1, m2
psraw m1, 1
paddw m0, m1
@@ -4253,17 +4212,15 @@ cglobal intra_pred_ang8_26, 4,5,3
pextrw [r0], m0, 0
pextrw [r0 + r1], m0, 1
pextrw [r0 + r1 * 2], m0, 2
- pextrw [r0 + r4], m0, 3
+ pextrw [r0 + r5], m0, 3
pextrw [r3], m0, 4
pextrw [r3 + r1], m0, 5
pextrw [r3 + r1 * 2], m0, 6
- pextrw [r3 + r4], m0, 7
-
+ pextrw [r3 + r5], m0, 7
.quit:
RET
cglobal intra_pred_ang8_27, 3,5,7
- mov r2, r3mp
lea r3, [ang_table + 9 * 16]
add r1, r1
@@ -4362,11 +4319,9 @@ cglobal intra_pred_ang8_27, 3,5,7
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m3
-
RET
cglobal intra_pred_ang8_28, 3,5,7
- mov r2, r3mp
lea r3, [ang_table + 17 * 16]
add r1, r1
@@ -4469,11 +4424,9 @@ cglobal intra_pred_ang8_28, 3,5,7
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m5
-
RET
cglobal intra_pred_ang8_29, 3,5,8
- mov r2, r3mp
lea r3, [ang_table + 18 * 16]
add r1, r1
@@ -4575,11 +4528,9 @@ cglobal intra_pred_ang8_29, 3,5,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m7
-
RET
cglobal intra_pred_ang8_30, 3,5,8
- mov r2, r3mp
lea r3, [ang_table + 14 * 16]
add r1, r1
@@ -4681,11 +4632,9 @@ cglobal intra_pred_ang8_30, 3,5,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m7
-
RET
cglobal intra_pred_ang8_31, 3,5,8
- mov r2, r3mp
lea r3, [ang_table + 13 * 16]
add r1, r1
@@ -4789,11 +4738,9 @@ cglobal intra_pred_ang8_31, 3,5,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m7
-
RET
-cglobal intra_pred_ang8_32, 3,6,8
- mov r2, r3mp
+cglobal intra_pred_ang8_32, 3,5,8
lea r3, [ang_table + 19 * 16]
add r1, r1
@@ -4898,11 +4845,9 @@ cglobal intra_pred_ang8_32, 3,6,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m7
-
RET
cglobal intra_pred_ang8_33, 3,5,8
- mov r2, r3mp
lea r3, [ang_table + 14 * 16]
add r1, r1
@@ -5007,90 +4952,6 @@ cglobal intra_pred_ang8_33, 3,5,8
movu [r0 + r1], m2
movu [r0 + r1 * 2], m6
movu [r0 + r4], m7
-
- RET
-
-;-----------------------------------------------------------------------------
-; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang16_2, 3,4,5
- cmp r4m, byte 34
- cmove r2, r3mp
- add r1, r1
- lea r3, [r1 * 3]
- movu m0, [r2 + 4]
- movu m1, [r2 + 20]
- movu m2, [r2 + 36]
-
- movu [r0], m0
- movu [r0 + 16], m1
- palignr m3, m1, m0, 2
- palignr m4, m2, m1, 2
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m1, m0, 4
- palignr m4, m2, m1, 4
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m1, m0, 6
- palignr m4, m2, m1, 6
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- lea r0, [r0 + r1 * 4]
- palignr m3, m1, m0, 8
- palignr m4, m2, m1, 8
- movu [r0], m3
- movu [r0 + 16], m4
- palignr m3, m1, m0, 10
- palignr m4, m2, m1, 10
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m1, m0, 12
- palignr m4, m2, m1, 12
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m1, m0, 14
- palignr m4, m2, m1, 14
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- movu m0, [r2 + 52]
- lea r0, [r0 + r1 * 4]
- movu [r0], m1
- movu [r0 + 16], m2
- palignr m3, m2, m1, 2
- palignr m4, m0, m2, 2
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m2, m1, 4
- palignr m4, m0, m2, 4
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m2, m1, 6
- palignr m4, m0, m2, 6
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- lea r0, [r0 + r1 * 4]
- palignr m3, m2, m1, 8
- palignr m4, m0, m2, 8
- movu [r0], m3
- movu [r0 + 16], m4
- palignr m3, m2, m1, 10
- palignr m4, m0, m2, 10
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m2, m1, 12
- palignr m4, m0, m2, 12
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m2, m1, 14
- palignr m4, m0, m2, 14
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
RET
%macro TRANSPOSE_STORE 6
@@ -5312,38 +5173,6 @@ cglobal ang16_mode_3_33
ret
-cglobal intra_pred_ang16_3, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_3_33
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_3_33
-
- RET
-
-cglobal intra_pred_ang16_33, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_3_33
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_3_33
-
- RET
-
cglobal ang16_mode_4_32
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -5540,38 +5369,6 @@ cglobal ang16_mode_4_32
ret
-cglobal intra_pred_ang16_4, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 18 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_4_32
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_4_32
-
- RET
-
-cglobal intra_pred_ang16_32, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 18 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_4_32
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_4_32
-
- RET
-
cglobal ang16_mode_5_31
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -5764,38 +5561,6 @@ cglobal ang16_mode_5_31
ret
-cglobal intra_pred_ang16_5, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_5_31
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_5_31
-
- RET
-
-cglobal intra_pred_ang16_31, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_5_31
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_5_31
-
- RET
-
cglobal ang16_mode_6_30
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -5989,38 +5754,6 @@ cglobal ang16_mode_6_30
ret
-cglobal intra_pred_ang16_6, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_6_30
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_6_30
-
- RET
-
-cglobal intra_pred_ang16_30, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_6_30
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_6_30
-
- RET
-
cglobal ang16_mode_7_29
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -6208,38 +5941,6 @@ cglobal ang16_mode_7_29
ret
-cglobal intra_pred_ang16_7, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 17 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_7_29
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_7_29
-
- RET
-
-cglobal intra_pred_ang16_29, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 17 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_7_29
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_7_29
-
- RET
-
cglobal ang16_mode_8_28
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -6429,38 +6130,6 @@ cglobal ang16_mode_8_28
ret
-cglobal intra_pred_ang16_8, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_8_28
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_8_28
-
- RET
-
-cglobal intra_pred_ang16_28, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_8_28
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_8_28
-
- RET
-
cglobal ang16_mode_9_27
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -6637,38 +6306,6 @@ cglobal ang16_mode_9_27
ret
-cglobal intra_pred_ang16_9, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_9_27
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_9_27
-
- RET
-
-cglobal intra_pred_ang16_27, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_9_27
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_9_27
-
- RET
-
cglobal ang16_mode_11_25
test r6d, r6d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -6847,38 +6484,6 @@ cglobal ang16_mode_11_25
ret
-cglobal intra_pred_ang16_11, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_11_25
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_11_25
-
- RET
-
-cglobal intra_pred_ang16_25, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_11_25
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_11_25
-
- RET
-
cglobal ang16_mode_12_24
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -7070,46 +6675,6 @@ cglobal ang16_mode_12_24
ret
-cglobal intra_pred_ang16_12, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 16 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang8_12]
- pinsrw m5, [r3 + 26], 5
- xor r3d, r3d
-
- call ang16_mode_12_24
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_12_24
-
- RET
-
-cglobal intra_pred_ang16_24, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 16 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang8_12]
- pinsrw m5, [r3 + 26], 5
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_12_24
-
- lea r0, [r0 + 16]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_12_24
-
- RET
-
cglobal ang16_mode_13_23
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -7309,52 +6874,6 @@ cglobal ang16_mode_13_23
ret
-cglobal intra_pred_ang16_13, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang16_13]
- movu m6, [r3 + 14]
- pshufb m6, [pw_ang8_13]
- pslldq m6, 2
- palignr m5, m6, 6
- xor r3d, r3d
-
- call ang16_mode_13_23
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_13_23
-
- RET
-
-cglobal intra_pred_ang16_23, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang16_13]
- movu m6, [r3 + 14]
- pshufb m6, [pw_ang8_13]
- pslldq m6, 2
- palignr m5, m6, 6
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_13_23
-
- lea r0, [r0 + 16]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_13_23
-
- RET
-
cglobal ang16_mode_14_22
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -7562,72 +7081,28 @@ cglobal ang16_mode_14_22
ret
-cglobal intra_pred_ang16_14, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 18 * 16]
- movu m6, [r3]
- pshufb m6, [pw_ang8_14]
- movu m5, [r3 + 20]
- pshufb m5, [pw_ang8_14]
- punpckhqdq m5, m6
- xor r3d, r3d
+cglobal ang16_mode_15_21
+ test r3d, r3d
+ movu m0, [r2] ; [7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- call ang16_mode_14_22
+ palignr m6, m0, m5, 2
- lea r0, [r0 + r1 * 8]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
+ punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
+ punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
- call ang16_mode_14_22
+ mova m4, m3
+ pmaddwd m4, [r6] ; [15]
+ paddd m4, [pd_16]
+ psrld m4, 5
+ mova m2, m0
+ pmaddwd m2, [r6]
+ paddd m2, [pd_16]
+ psrld m2, 5
+ packusdw m4, m2
- RET
-
-cglobal intra_pred_ang16_22, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 18 * 16]
- movu m6, [r3]
- pshufb m6, [pw_ang8_14]
- movu m5, [r3 + 20]
- pshufb m5, [pw_ang8_14]
- punpckhqdq m5, m6
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_14_22
-
- lea r0, [r0 + 16]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_14_22
-
- RET
-
-cglobal ang16_mode_15_21
- test r3d, r3d
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- palignr m6, m0, m5, 2
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r6] ; [15]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r6]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- palignr m3, m6, 12
+ palignr m0, m3, 12
+ palignr m3, m6, 12
mova m2, m3
pmaddwd m2, [r6 + 15 * 16] ; [30]
@@ -7822,50 +7297,6 @@ cglobal ang16_mode_15_21
ret
-cglobal intra_pred_ang16_15, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang8_15]
- movu m5, [r3 + 18]
- pshufb m5, [pw_ang8_15]
- punpckhqdq m5, m6
- xor r3d, r3d
-
- call ang16_mode_15_21
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2]
- lea r2, [r2 + 16]
-
- call ang16_mode_15_21
-
- RET
-
-cglobal intra_pred_ang16_21, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang8_15]
- movu m5, [r3 + 18]
- pshufb m5, [pw_ang8_15]
- punpckhqdq m5, m6
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_15_21
-
- lea r0, [r0 + 16]
- movu m5, [r2]
- lea r2, [r2 + 16]
-
- call ang16_mode_15_21
-
- RET
-
cglobal ang16_mode_16_20
test r4d, r4d
lea r4, [r1 * 3]
@@ -8093,57 +7524,6 @@ cglobal ang16_mode_16_20
ret
-cglobal intra_pred_ang16_16, 4,7,8,0-(1*mmsize)
- add r1, r1
- lea r6, [ang_table + 13 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 16]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 24]
- xor r4, r4
-
- call ang16_mode_16_20
-
- lea r0, [r0 + r1 * 8]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
-
- call ang16_mode_16_20
-
- RET
-
-cglobal intra_pred_ang16_20, 4,7,8,0-(1*mmsize)
- xchg r2, r3
- add r1, r1
- lea r6, [ang_table + 13 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 16]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 24]
- xor r4, r4
- inc r4
-
- call ang16_mode_16_20
-
- lea r0, [r0 + 16]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
- inc r4
-
- call ang16_mode_16_20
-
- RET
-
cglobal ang16_mode_17_19
test r4d, r4d
lea r4, [r1 * 3]
@@ -8379,324 +7759,965 @@ cglobal ang16_mode_17_19
ret
-cglobal intra_pred_ang16_17, 4,7,8,0-(1*mmsize)
- add r1, r1
- lea r6, [ang_table + 16 * 16]
- movu m6, [r3 + 2]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 12]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 20]
- xor r4, r4
+;------------------------------------------------------------------------------------------
+; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang16_2, 3,5,5
+ lea r4, [r2]
+ add r2, 64
+ cmp r3m, byte 34
+ cmove r2, r4
+ add r1, r1
+ lea r3, [r1 * 3]
+ movu m0, [r2 + 4]
+ movu m1, [r2 + 20]
+ movu m2, [r2 + 36]
- call ang16_mode_17_19
+ movu [r0], m0
+ movu [r0 + 16], m1
+ palignr m3, m1, m0, 2
+ palignr m4, m2, m1, 2
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ palignr m3, m1, m0, 4
+ palignr m4, m2, m1, 4
+ movu [r0 + r1 * 2], m3
+ movu [r0 + r1 * 2 + 16], m4
+ palignr m3, m1, m0, 6
+ palignr m4, m2, m1, 6
+ movu [r0 + r3], m3
+ movu [r0 + r3 + 16], m4
- lea r0, [r0 + r1 * 8]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
+ lea r0, [r0 + r1 * 4]
+ palignr m3, m1, m0, 8
+ palignr m4, m2, m1, 8
+ movu [r0], m3
+ movu [r0 + 16], m4
+ palignr m3, m1, m0, 10
+ palignr m4, m2, m1, 10
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ palignr m3, m1, m0, 12
+ palignr m4, m2, m1, 12
+ movu [r0 + r1 * 2], m3
+ movu [r0 + r1 * 2 + 16], m4
+ palignr m3, m1, m0, 14
+ palignr m4, m2, m1, 14
+ movu [r0 + r3], m3
+ movu [r0 + r3 + 16], m4
- call ang16_mode_17_19
+ movu m0, [r2 + 52]
+ lea r0, [r0 + r1 * 4]
+ movu [r0], m1
+ movu [r0 + 16], m2
+ palignr m3, m2, m1, 2
+ palignr m4, m0, m2, 2
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ palignr m3, m2, m1, 4
+ palignr m4, m0, m2, 4
+ movu [r0 + r1 * 2], m3
+ movu [r0 + r1 * 2 + 16], m4
+ palignr m3, m2, m1, 6
+ palignr m4, m0, m2, 6
+ movu [r0 + r3], m3
+ movu [r0 + r3 + 16], m4
+ lea r0, [r0 + r1 * 4]
+ palignr m3, m2, m1, 8
+ palignr m4, m0, m2, 8
+ movu [r0], m3
+ movu [r0 + 16], m4
+ palignr m3, m2, m1, 10
+ palignr m4, m0, m2, 10
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ palignr m3, m2, m1, 12
+ palignr m4, m0, m2, 12
+ movu [r0 + r1 * 2], m3
+ movu [r0 + r1 * 2 + 16], m4
+ palignr m3, m2, m1, 14
+ palignr m4, m0, m2, 14
+ movu [r0 + r3], m3
+ movu [r0 + r3 + 16], m4
RET
-cglobal intra_pred_ang16_19, 4,7,8,0-(1*mmsize)
- xchg r2, r3
+INIT_XMM sse4
+cglobal intra_pred_ang16_3, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 16 * 16]
add r1, r1
- lea r6, [ang_table + 16 * 16]
- movu m6, [r3 + 2]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 12]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 20]
- xor r4, r4
- inc r4
+ lea r4, [r1 * 3]
- call ang16_mode_17_19
+ call ang16_mode_3_33
- lea r0, [r0 + 16]
- mov r3, [rsp]
- movu m5, [r2]
lea r2, [r2 + 16]
- xor r4, r4
- inc r4
+ lea r0, [r0 + r1 * 8]
- call ang16_mode_17_19
+ call ang16_mode_3_33
+ RET
+cglobal intra_pred_ang16_33, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_3_33
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
+
+ call ang16_mode_3_33
RET
-cglobal intra_pred_ang16_18, 4,5,4
- add r1, r1
- lea r4, [r1 * 3]
- movu m1, [r3]
- movu m3, [r3 + 16]
- movu m0, [r2 + 2]
- pshufb m0, [pw_swap16]
- movu [r0], m1
- movu [r0 + 16], m3
- palignr m2, m1, m0, 14
- movu [r0 + r1], m2
- palignr m2, m3, m1, 14
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 12
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 12
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 10
- movu [r0 + r4], m2
- palignr m2, m3, m1, 10
- movu [r0 + r4 + 16], m2
-
- lea r0, [r0 + r1 * 4]
- palignr m2, m1, m0, 8
- movu [r0], m2
- palignr m2, m3, m1, 8
- movu [r0 + 16], m2
- palignr m2, m1, m0, 6
- movu [r0 + r1], m2
- palignr m2, m3, m1, 6
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 4
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 4
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 2
- movu [r0 + r4], m2
- palignr m3, m1, 2
- movu [r0 + r4 + 16], m3
+cglobal intra_pred_ang16_4, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 18 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
- lea r0, [r0 + r1 * 4]
- movu [r0], m0
- movu [r0 + 16], m1
- movu m3, [r2 + 18]
- pshufb m3, [pw_swap16]
- palignr m2, m0, m3, 14
- movu [r0 + r1], m2
- palignr m2, m1, m0, 14
- movu [r0 + r1 + 16], m2
- palignr m2, m0, m3, 12
- movu [r0 + r1 * 2], m2
- palignr m2, m1, m0, 12
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m0, m3, 10
- movu [r0 + r4], m2
- palignr m2, m1, m0, 10
- movu [r0 + r4 + 16], m2
+ call ang16_mode_4_32
- lea r0, [r0 + r1 * 4]
- palignr m2, m0, m3, 8
- movu [r0], m2
- palignr m2, m1, m0, 8
- movu [r0 + 16], m2
- palignr m2, m0, m3, 6
- movu [r0 + r1], m2
- palignr m2, m1, m0, 6
- movu [r0 + r1 + 16], m2
- palignr m2, m0, m3, 4
- movu [r0 + r1 * 2], m2
- palignr m2, m1, m0, 4
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m0, m3, 2
- movu [r0 + r4], m2
- palignr m1, m0, 2
- movu [r0 + r4 + 16], m1
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
+ call ang16_mode_4_32
RET
-cglobal intra_pred_ang16_10, 4,5,4
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
- add r1, r1
- lea r4, [r1 * 3]
-
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
- movu [r0 + r1 * 2], m2
- movu [r0 + r1 * 2 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
- movu [r0 + r4], m2
- movu [r0 + r4 + 16], m2
+cglobal intra_pred_ang16_32, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 18 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
- lea r2, [r0 + r1 *4]
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
- movu [r2], m2
- movu [r2 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
- movu [r2 + r1], m2
- movu [r2 + r1 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
- movu [r2 + r1 * 2], m2
- movu [r2 + r1 * 2 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
- movu [r2 + r4], m2
- movu [r2 + r4 + 16], m2
+ call ang16_mode_4_32
- lea r2, [r2 + r1 *4]
- pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9]
- movu [r2], m2
- movu [r2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10]
- movu [r2 + r1], m2
- movu [r2 + r1 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11]
- movu [r2 + r1 * 2], m2
- movu [r2 + r1 * 2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12]
- movu [r2 + r4], m2
- movu [r2 + r4 + 16], m2
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
- lea r2, [r2 + r1 *4]
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13]
- movu [r2], m2
- movu [r2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14]
- movu [r2 + r1], m2
- movu [r2 + r1 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15]
- movu [r2 + r1 * 2], m2
- movu [r2 + r1 * 2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16]
- movu [r2 + r4], m2
- movu [r2 + r4 + 16], m2
- mova m3, m0
+ call ang16_mode_4_32
+ RET
- cmp r5m, byte 0
- jz .quit
+cglobal intra_pred_ang16_5, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
- ; filter
+ call ang16_mode_5_31
- movh m1, [r3] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
- psubw m1, m2
- psubw m3, m2
- psraw m1, 1
- psraw m3, 1
- paddw m3, m0
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
- pmaxsw m3, m1
- pminsw m3, [pw_1023]
-.quit:
- movu [r0], m0
- movu [r0 + 16], m3
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
+ call ang16_mode_5_31
RET
-cglobal intra_pred_ang16_26, 4,5,4
- movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
- add r1, r1
- lea r4, [r1 * 3]
+cglobal intra_pred_ang16_31, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
- movu [r0], m0
- movu [r0 + 16], m3
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m3
- movu [r0 + r1 * 2], m0
- movu [r0 + r1 * 2 + 16], m3
- movu [r0 + r4], m0
- movu [r0 + r4 + 16], m3
+ call ang16_mode_5_31
- lea r3, [r0 + r1 *4]
- movu [r3], m0
- movu [r3 + 16], m3
- movu [r3 + r1], m0
- movu [r3 + r1 + 16], m3
- movu [r3 + r1 * 2], m0
- movu [r3 + r1 * 2 + 16], m3
- movu [r3 + r4], m0
- movu [r3 + r4 + 16], m3
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
- lea r3, [r3 + r1 *4]
- movu [r3], m0
- movu [r3 + 16], m3
- movu [r3 + r1], m0
- movu [r3 + r1 + 16], m3
- movu [r3 + r1 * 2], m0
- movu [r3 + r1 * 2 + 16], m3
- movu [r3 + r4], m0
- movu [r3 + r4 + 16], m3
+ call ang16_mode_5_31
+ RET
- lea r3, [r3 + r1 *4]
- movu [r3], m0
- movu [r3 + 16], m3
- movu [r3 + r1], m0
- movu [r3 + r1 + 16], m3
- movu [r3 + r1 * 2], m0
- movu [r3 + r1 * 2 + 16], m3
- movu [r3 + r4], m0
- movu [r3 + r4 + 16], m3
+cglobal intra_pred_ang16_6, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 15 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
- cmp r5m, byte 0
- jz .quit
+ call ang16_mode_6_30
- ; filter
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
- pshufb m0, [pw_unpackwdq]
- movh m1, [r2] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- psubw m1, m2
- psubw m3, m2
- psraw m1, 1
- psraw m3, 1
- paddw m3, m0
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
- pmaxsw m3, m1
- pminsw m3, [pw_1023]
- pextrw [r0], m0, 0
- pextrw [r0 + r1], m0, 1
- pextrw [r0 + r1 * 2], m0, 2
- pextrw [r0 + r4], m0, 3
- lea r0, [r0 + r1 * 4]
- pextrw [r0], m0, 4
- pextrw [r0 + r1], m0, 5
- pextrw [r0 + r1 * 2], m0, 6
- pextrw [r0 + r4], m0, 7
- lea r0, [r0 + r1 * 4]
- pextrw [r0], m3, 0
- pextrw [r0 + r1], m3, 1
- pextrw [r0 + r1 * 2], m3, 2
- pextrw [r0 + r4], m3, 3
- pextrw [r3], m3, 4
- pextrw [r3 + r1], m3, 5
- pextrw [r3 + r1 * 2], m3, 6
- pextrw [r3 + r4], m3, 7
+ call ang16_mode_6_30
+ RET
-.quit:
+cglobal intra_pred_ang16_30, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 15 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_6_30
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
+
+ call ang16_mode_6_30
RET
-%macro MODE_2_34 0
+cglobal intra_pred_ang16_7, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 17 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_7_29
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
+
+ call ang16_mode_7_29
+ RET
+
+cglobal intra_pred_ang16_29, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 17 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_7_29
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
+
+ call ang16_mode_7_29
+ RET
+
+cglobal intra_pred_ang16_8, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 15 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_8_28
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
+
+ call ang16_mode_8_28
+ RET
+
+cglobal intra_pred_ang16_28, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 15 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_8_28
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
+
+ call ang16_mode_8_28
+ RET
+
+cglobal intra_pred_ang16_9, 3,7,8
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_9_27
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
+
+ call ang16_mode_9_27
+ RET
+
+cglobal intra_pred_ang16_27, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_9_27
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
+
+ call ang16_mode_9_27
+ RET
+
+cglobal intra_pred_ang16_11, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r2, 64
+ xor r6d, r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_11_25
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + r1 * 8]
+
+ call ang16_mode_11_25
+
+ mov r6d, [rsp]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_25, 3,7,8
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table + 16 * 16]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ call ang16_mode_11_25
+
+ lea r2, [r2 + 16]
+ lea r0, [r0 + 16]
+
+ call ang16_mode_11_25
+ RET
+
+cglobal intra_pred_ang16_12, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 16 * 16]
+ movu m5, [r2]
+ pshufb m5, [pw_ang8_12]
+ pinsrw m5, [r2 + 26], 5
+ xor r3d, r3d
+ add r2, 64
+
+ call ang16_mode_12_24
+
+ lea r0, [r0 + r1 * 8]
+ movu m5, [r2 + 2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_12_24
+
+ mov r6d, [rsp]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_24, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 16 * 16]
+ movu m5, [r2 + 64]
+ pshufb m5, [pw_ang8_12]
+ pinsrw m5, [r2 + 26 + 64], 5
+ xor r3d, r3d
+ inc r3d
+
+ call ang16_mode_12_24
+
+ lea r0, [r0 + 16]
+ movu m5, [r2 + 2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_12_24
+
+ mov r6d, [rsp]
+ mov [r2 + 48], r6w
+ RET
+
+cglobal intra_pred_ang16_13, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 15 * 16]
+ movu m5, [r2]
+ pshufb m5, [pw_ang16_13]
+ movu m6, [r2 + 14]
+ pshufb m6, [pw_ang8_13]
+ pslldq m6, 2
+ palignr m5, m6, 6
+ xor r3d, r3d
+ add r2, 64
+
+ call ang16_mode_13_23
+
+ lea r0, [r0 + r1 * 8]
+ movu m5, [r2 + 2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_13_23
+
+ mov r6d, [rsp]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_23, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 15 * 16]
+ movu m5, [r2 + 64]
+ pshufb m5, [pw_ang16_13]
+ movu m6, [r2 + 14 + 64]
+ pshufb m6, [pw_ang8_13]
+ pslldq m6, 2
+ palignr m5, m6, 6
+ xor r3d, r3d
+ inc r3d
+
+ call ang16_mode_13_23
+
+ lea r0, [r0 + 16]
+ movu m5, [r2 + 2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_13_23
+
+ mov r6d, [rsp]
+ mov [r2 + 48], r6w
+ RET
+
+cglobal intra_pred_ang16_14, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 18 * 16]
+ movu m6, [r2]
+ pshufb m6, [pw_ang8_14]
+ movu m5, [r2 + 20]
+ pshufb m5, [pw_ang8_14]
+ punpckhqdq m5, m6
+ xor r3d, r3d
+ add r2, 64
+
+ call ang16_mode_14_22
+
+ lea r0, [r0 + r1 * 8]
+ movu m5, [r2 + 2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_14_22
+
+ mov r6d, [rsp]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_22, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 18 * 16]
+ movu m6, [r2 + 64]
+ pshufb m6, [pw_ang8_14]
+ movu m5, [r2 + 20 + 64]
+ pshufb m5, [pw_ang8_14]
+ punpckhqdq m5, m6
+ xor r3d, r3d
+ inc r3d
+
+ call ang16_mode_14_22
+
+ lea r0, [r0 + 16]
+ movu m5, [r2 + 2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_14_22
+
+ mov r6d, [rsp]
+ mov [r2 + 48], r6w
+ RET
+
+cglobal intra_pred_ang16_15, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 15 * 16]
+ movu m6, [r2 + 4]
+ pshufb m6, [pw_ang8_15]
+ movu m5, [r2 + 18]
+ pshufb m5, [pw_ang8_15]
+ punpckhqdq m5, m6
+ xor r3d, r3d
+ add r2, 64
+
+ call ang16_mode_15_21
+
+ lea r0, [r0 + r1 * 8]
+ movu m5, [r2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_15_21
+
+ mov r6d, [rsp]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_21, 3,7,8, 0-4
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r4, [r1 * 3]
+ lea r6, [ang_table + 15 * 16]
+ movu m6, [r2 + 4 + 64]
+ pshufb m6, [pw_ang8_15]
+ movu m5, [r2 + 18 + 64]
+ pshufb m5, [pw_ang8_15]
+ punpckhqdq m5, m6
+ xor r3d, r3d
+ inc r3d
+
+ call ang16_mode_15_21
+
+ lea r0, [r0 + 16]
+ movu m5, [r2]
+ lea r2, [r2 + 16]
+
+ call ang16_mode_15_21
+
+ mov r6d, [rsp]
+ mov [r2 + 48], r6w
+ RET
+
+cglobal intra_pred_ang16_16, 3,7,8,0-(1*mmsize+4)
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp + 16], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r6, [ang_table + 13 * 16]
+ movu m6, [r2 + 4]
+ pshufb m6, [pw_ang16_16]
+ movu m5, [r2 + 16]
+ pshufb m5, [pw_ang16_16]
+ punpckhqdq m5, m6
+ mov [rsp], r2
+ lea r3, [r2 + 24]
+ add r2, 64
+ xor r4, r4
+
+ call ang16_mode_16_20
+
+ lea r0, [r0 + r1 * 8]
+ mov r3, [rsp]
+ movu m5, [r2]
+ lea r2, [r2 + 16]
+ xor r4, r4
+
+ call ang16_mode_16_20
+
+ mov r6d, [rsp + 16]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_20, 3,7,8,0-(1*mmsize+4)
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp + 16], r5w
+ mov [r2 + 64], r6w
+
+ lea r3, [r2 + 64]
+ add r1, r1
+ lea r6, [ang_table + 13 * 16]
+ movu m6, [r3 + 4]
+ pshufb m6, [pw_ang16_16]
+ movu m5, [r3 + 16]
+ pshufb m5, [pw_ang16_16]
+ punpckhqdq m5, m6
+ mov [rsp], r3
+ lea r3, [r3 + 24]
+ xor r4, r4
+ inc r4
+
+ call ang16_mode_16_20
+
+ lea r0, [r0 + 16]
+ mov r3, [rsp]
+ movu m5, [r2]
+ lea r2, [r2 + 16]
+ xor r4, r4
+ inc r4
+
+ call ang16_mode_16_20
+ mov r6d, [rsp + 16]
+ mov [r3], r6w
+ RET
+
+cglobal intra_pred_ang16_17, 3,7,8,0-(1*mmsize+4)
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp + 16], r5w
+ mov [r2 + 64], r6w
+
+ add r1, r1
+ lea r6, [ang_table + 16 * 16]
+ movu m6, [r2 + 2]
+ pshufb m6, [pw_ang16_16]
+ movu m5, [r2 + 12]
+ pshufb m5, [pw_ang16_16]
+ punpckhqdq m5, m6
+ mov [rsp], r2
+ lea r3, [r2 + 20]
+ add r2, 64
+ xor r4, r4
+
+ call ang16_mode_17_19
+
+ lea r0, [r0 + r1 * 8]
+ mov r3, [rsp]
+ movu m5, [r2]
+ lea r2, [r2 + 16]
+ xor r4, r4
+
+ call ang16_mode_17_19
+
+ mov r6d, [rsp + 16]
+ mov [r2 - 16], r6w
+ RET
+
+cglobal intra_pred_ang16_19, 3,7,8,0-(1*mmsize+4)
+ movzx r5d, word [r2 + 64]
+ movzx r6d, word [r2]
+ mov [rsp + 16], r5w
+ mov [r2 + 64], r6w
+
+ lea r3, [r2 + 64]
+ add r1, r1
+ lea r6, [ang_table + 16 * 16]
+ movu m6, [r3 + 2]
+ pshufb m6, [pw_ang16_16]
+ movu m5, [r3 + 12]
+ pshufb m5, [pw_ang16_16]
+ punpckhqdq m5, m6
+ mov [rsp], r3
+ lea r3, [r3 + 20]
+ xor r4, r4
+ inc r4
+
+ call ang16_mode_17_19
+
+ lea r0, [r0 + 16]
+ mov r3, [rsp]
+ movu m5, [r2]
+ lea r2, [r2 + 16]
+ xor r4, r4
+ inc r4
+
+ call ang16_mode_17_19
+
+ mov r6d, [rsp + 16]
+ mov [r3], r6w
+ RET
+
+cglobal intra_pred_ang16_18, 3,5,4
+ add r1, r1
+ lea r4, [r1 * 3]
+ movu m1, [r2]
+ movu m3, [r2 + 16]
+ movu m0, [r2 + 2 + 64]
+ pshufb m0, [pw_swap16]
+ movu [r0], m1
+ movu [r0 + 16], m3
+ palignr m2, m1, m0, 14
+ movu [r0 + r1], m2
+ palignr m2, m3, m1, 14
+ movu [r0 + r1 + 16], m2
+ palignr m2, m1, m0, 12
+ movu [r0 + r1 * 2], m2
+ palignr m2, m3, m1, 12
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m1, m0, 10
+ movu [r0 + r4], m2
+ palignr m2, m3, m1, 10
+ movu [r0 + r4 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+ palignr m2, m1, m0, 8
+ movu [r0], m2
+ palignr m2, m3, m1, 8
+ movu [r0 + 16], m2
+ palignr m2, m1, m0, 6
+ movu [r0 + r1], m2
+ palignr m2, m3, m1, 6
+ movu [r0 + r1 + 16], m2
+ palignr m2, m1, m0, 4
+ movu [r0 + r1 * 2], m2
+ palignr m2, m3, m1, 4
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m1, m0, 2
+ movu [r0 + r4], m2
+ palignr m3, m1, 2
+ movu [r0 + r4 + 16], m3
+
+ lea r0, [r0 + r1 * 4]
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu m3, [r2 + 18 + 64]
+ pshufb m3, [pw_swap16]
+ palignr m2, m0, m3, 14
+ movu [r0 + r1], m2
+ palignr m2, m1, m0, 14
+ movu [r0 + r1 + 16], m2
+ palignr m2, m0, m3, 12
+ movu [r0 + r1 * 2], m2
+ palignr m2, m1, m0, 12
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m0, m3, 10
+ movu [r0 + r4], m2
+ palignr m2, m1, m0, 10
+ movu [r0 + r4 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+ palignr m2, m0, m3, 8
+ movu [r0], m2
+ palignr m2, m1, m0, 8
+ movu [r0 + 16], m2
+ palignr m2, m0, m3, 6
+ movu [r0 + r1], m2
+ palignr m2, m1, m0, 6
+ movu [r0 + r1 + 16], m2
+ palignr m2, m0, m3, 4
+ movu [r0 + r1 * 2], m2
+ palignr m2, m1, m0, 4
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m0, m3, 2
+ movu [r0 + r4], m2
+ palignr m1, m0, 2
+ movu [r0 + r4 + 16], m1
+ RET
+
+cglobal intra_pred_ang16_10, 3,6,4
+ mov r5d, r4m
+ movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9]
+ pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m2
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 16], m2
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
+ movu [r0 + r4], m2
+ movu [r0 + r4 + 16], m2
+
+ lea r3, [r0 + r1 *4]
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
+ movu [r3], m2
+ movu [r3 + 16], m2
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
+ movu [r3 + r1], m2
+ movu [r3 + r1 + 16], m2
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
+ movu [r3 + r1 * 2], m2
+ movu [r3 + r1 * 2 + 16], m2
+ psrldq m1, 2
+ pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
+ movu [r3 + r4], m2
+ movu [r3 + r4 + 16], m2
+
+ lea r3, [r3 + r1 *4]
+ pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9]
+ movu [r3], m2
+ movu [r3 + 16], m2
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10]
+ movu [r3 + r1], m2
+ movu [r3 + r1 + 16], m2
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11]
+ movu [r3 + r1 * 2], m2
+ movu [r3 + r1 * 2 + 16], m2
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12]
+ movu [r3 + r4], m2
+ movu [r3 + r4 + 16], m2
+
+ lea r3, [r3 + r1 *4]
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13]
+ movu [r3], m2
+ movu [r3 + 16], m2
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14]
+ movu [r3 + r1], m2
+ movu [r3 + r1 + 16], m2
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15]
+ movu [r3 + r1 * 2], m2
+ movu [r3 + r1 * 2 + 16], m2
+ psrldq m3, 2
+ pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16]
+ movu [r3 + r4], m2
+ movu [r3 + r4 + 16], m2
+ mova m3, m0
+
+ cmp r5d, byte 0
+ jz .quit
+
+ ; filter
+ pinsrw m1, [r2], 0 ; [3 2 1 0]
+ pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ psubw m1, m2
+ psubw m3, m2
+ psraw m1, 1
+ psraw m3, 1
+ paddw m3, m0
+ paddw m0, m1
+ pxor m1, m1
+ pmaxsw m0, m1
+ pminsw m0, [pw_1023]
+ pmaxsw m3, m1
+ pminsw m3, [pw_1023]
+.quit:
+ movu [r0], m0
+ movu [r0 + 16], m3
+ RET
+
+cglobal intra_pred_ang16_26, 3,6,4
+ mov r5d, r4m
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ add r1, r1
+ lea r4, [r1 * 3]
+
+ movu [r0], m0
+ movu [r0 + 16], m3
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m3
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r1 * 2 + 16], m3
+ movu [r0 + r4], m0
+ movu [r0 + r4 + 16], m3
+
+ lea r3, [r0 + r1 *4]
+ movu [r3], m0
+ movu [r3 + 16], m3
+ movu [r3 + r1], m0
+ movu [r3 + r1 + 16], m3
+ movu [r3 + r1 * 2], m0
+ movu [r3 + r1 * 2 + 16], m3
+ movu [r3 + r4], m0
+ movu [r3 + r4 + 16], m3
+
+ lea r3, [r3 + r1 *4]
+ movu [r3], m0
+ movu [r3 + 16], m3
+ movu [r3 + r1], m0
+ movu [r3 + r1 + 16], m3
+ movu [r3 + r1 * 2], m0
+ movu [r3 + r1 * 2 + 16], m3
+ movu [r3 + r4], m0
+ movu [r3 + r4 + 16], m3
+
+ lea r3, [r3 + r1 *4]
+ movu [r3], m0
+ movu [r3 + 16], m3
+ movu [r3 + r1], m0
+ movu [r3 + r1 + 16], m3
+ movu [r3 + r1 * 2], m0
+ movu [r3 + r1 * 2 + 16], m3
+ movu [r3 + r4], m0
+ movu [r3 + r4 + 16], m3
+
+ cmp r5d, byte 0
+ jz .quit
+
+ ; filter
+
+ pshufb m0, [pw_unpackwdq]
+ pinsrw m1, [r2], 0 ; [3 2 1 0]
+ pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
+ movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9]
+ psubw m1, m2
+ psubw m3, m2
+ psraw m1, 1
+ psraw m3, 1
+ paddw m3, m0
+ paddw m0, m1
+ pxor m1, m1
+ pmaxsw m0, m1
+ pminsw m0, [pw_1023]
+ pmaxsw m3, m1
+ pminsw m3, [pw_1023]
+ pextrw [r0], m0, 0
+ pextrw [r0 + r1], m0, 1
+ pextrw [r0 + r1 * 2], m0, 2
+ pextrw [r0 + r4], m0, 3
+ lea r0, [r0 + r1 * 4]
+ pextrw [r0], m0, 4
+ pextrw [r0 + r1], m0, 5
+ pextrw [r0 + r1 * 2], m0, 6
+ pextrw [r0 + r4], m0, 7
+ lea r0, [r0 + r1 * 4]
+ pextrw [r0], m3, 0
+ pextrw [r0 + r1], m3, 1
+ pextrw [r0 + r1 * 2], m3, 2
+ pextrw [r0 + r4], m3, 3
+ pextrw [r3], m3, 4
+ pextrw [r3 + r1], m3, 5
+ pextrw [r3 + r1 * 2], m3, 6
+ pextrw [r3 + r4], m3, 7
+.quit:
+ RET
+
+%macro MODE_2_34 0
movu m0, [r2 + 4]
movu m1, [r2 + 20]
movu m2, [r2 + 36]
@@ -8828,322 +8849,658 @@ cglobal intra_pred_ang16_26, 4,5,4
movu [r0 + r4 + 48], m5
lea r0, [r0 + r1 * 4]
%endmacro
-;--------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_2_34(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;--------------------------------------------------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang32_2, 3,6,6
- cmp r4m, byte 34
- cmove r2, r3mp
- add r1, r1
- lea r3, [r1 * 2]
- lea r4, [r1 * 3]
- mov r5, 2
+%macro TRANSPOSE_STORE_8x8 6
+ %if %2 == 1
+ ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
+ punpckhwd m0, %3, %4
+ punpcklwd %3, %4
+ punpckhwd %4, %3, m0
+ punpcklwd %3, m0
+
+ punpckhwd m0, %5, %6
+ punpcklwd %5, %6
+ punpckhwd %6, %5, m0
+ punpcklwd %5, m0
+
+ punpckhqdq m0, %3, %5
+ punpcklqdq %3, %5
+ punpcklqdq %5, %4, %6
+ punpckhqdq %4, %6
+
+ movu [r0 + %1], %3
+ movu [r0 + r1 + %1], m0
+ movu [r0 + r1 * 2 + %1], %5
+ movu [r0 + r5 + %1], %4
+ %else
+ ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32
+ movh [r0], %3
+ movhps [r0 + r1], %3
+ movh [r0 + r1 * 2], %4
+ movhps [r0 + r5], %4
+ lea r0, [r0 + r1 * 4]
+ movh [r0], %5
+ movhps [r0 + r1], %5
+ movh [r0 + r1 * 2], %6
+ movhps [r0 + r5], %6
+ lea r0, [r0 + r1 * 4]
+ %endif
+%endmacro
+
+%macro MODE_3_33 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ mova m7, m0
+
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0
+
+ palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1
+ pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m5, m1, [r3 + 4 * 16] ; [20]
+ paddd m5, [pd_16]
+ psrld m5, 5
+ packusdw m4, m5
+
+ palignr m5, m2, m0, 8
+ pmaddwd m5, [r3 - 2 * 16] ; [14]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m2, m0, 12
+ pmaddwd m6, [r3 - 8 * 16] ; [ 8]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m2, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
+ pmaddwd m1, m0, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10]
+ palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7]
+
+ pmaddwd m2, [r3] ; [16]
+ paddd m2, [pd_16]
+ psrld m2, 5
+ packusdw m1, m2
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8]
+ movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16]
+ palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9]
+ punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12]
+ punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8]
+
+ palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9]
+ pmaddwd m4, m0, [r3 - 6 * 16] ; [10]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 14 * 16] ; [30]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10]
+ pmaddwd m6, [r3 + 8 * 16] ; [24]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11]
+ pmaddwd m6, m1, [r3 + 2 * 16] ; [18]
+ paddd m6, [pd_16]
+ psrld m6, 5
-.loop:
- MODE_2_34
- add r2, 32
- dec r5
- jnz .loop
- RET
+ pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
-%macro TRANSPOSE_STORE_8x8 6
- %if %2 == 1
- ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
- punpckhwd m0, %3, %4
- punpcklwd %3, %4
- punpckhwd %4, %3, m0
- punpcklwd %3, m0
+ palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13]
+ pmaddwd m1, m2, [r3 - 10 * 16] ; [6]
+ paddd m1, [pd_16]
+ psrld m1, 5
- punpckhwd m0, %5, %6
- punpcklwd %5, %6
- punpckhwd %6, %5, m0
- punpcklwd %5, m0
+ packusdw m1, m1
+ movhps m1, [r2 + 28] ; [00]
- punpckhqdq m0, %3, %5
- punpcklqdq %3, %5
- punpcklqdq %5, %4, %6
- punpckhqdq %4, %6
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- movu [r0 + %1], %3
- movu [r0 + r1 + %1], m0
- movu [r0 + r1 * 2 + %1], %5
- movu [r0 + r5 + %1], %4
- %else
- ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32
- movh [r0], %3
- movhps [r0 + r1], %3
- movh [r0 + r1 * 2], %4
- movhps [r0 + r5], %4
- lea r0, [r0 + r1 * 4]
- movh [r0], %5
- movhps [r0 + r1], %5
- movh [r0 + r1 * 2], %6
- movhps [r0 + r5], %6
- lea r0, [r0 + r1 * 4]
- %endif
+ movu m0, [r2 + 28] ; [35 34 33 32 31 30 29 28]
+ palignr m1, m0, 2 ; [ x 35 34 33 32 31 30 29]
+ punpckhwd m2, m0, m1 ; [ x 35 35 34 34 33 33 32]
+ punpcklwd m0, m1 ; [32 31 31 30 30 29 29 28]
+
+ pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m1, m2, m0, 4 ; [33 32 32 31 31 30 30 29]
+ pmaddwd m1, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m5, m2, m0, 8 ; [34 33 33 32 32 31 31 30]
+ pmaddwd m5, [r3 - 2 * 16] ; [14]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m2, m0, 12 ; [35 34 34 33 33 32 32 31]
+ pmaddwd m6, [r3 - 8 * 16] ; [ 8]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pinsrw m2, [r2 + 44], 7 ; [35 34 34 33 33 32 32 31]
+ pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m2, [r3 + 12 * 16] ; [28]
+ paddd m2, [pd_16]
+ psrld m2, 5
+ packusdw m6, m2
+
+ movu m3, [r2 + 38] ; [45 44 43 42 41 40 39 38]
+ palignr m1, m3, 2 ; [ x 45 44 43 42 41 40 39]
+ punpckhwd m2, m3, m1 ; [ x 35 35 34 34 33 33 32]
+ punpcklwd m3, m1 ; [32 31 31 30 30 29 29 28]
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m0, m2, m3, 4
+ pmaddwd m0, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ palignr m5, m2, m3, 8
+ pmaddwd m4, m5, [r3 - 6 * 16] ; [10]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m5, m2, m3, 12
+ pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 14 * 16] ; [30]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 46]
+ palignr m1, m3, 2
+ punpckhwd m2, m3, m1
+ punpcklwd m3, m1
+
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m6, m2, m3, 4
+ pmaddwd m6, [r3 + 2 * 16] ; [18]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ palignr m1, m2, m3, 8
+ pmaddwd m1, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m1, m2, m3, 12
+ pmaddwd m1, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2 + 54] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-%macro MODE_3_33 1
+%macro MODE_4_32 1
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- mova m7, m0
-
palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
+ pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 15 * 16] ; [31]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m2, m0, 8
+ pmaddwd m6, [r3 + 4 * 16] ; [ 20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m1, m2, m0, 12
+ pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4]
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
+
+ palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
+ palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
+ punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
+ punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
+
+ palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 7 6]
+ pmaddwd m1, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m5, m2, m7, 8
+ mova m6, m5
+ pmaddwd m5, [r3 - 9 * 16] ; [07]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m6, m2, m7, 12
+ pmaddwd m6, [r3 + 16] ; [17]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m7, m3, m2, 4
+ pmaddwd m7, [r3] ; [16]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m1, m7
+ mova m7, m0
- palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1
- pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ palignr m0, m3, m2, 8
+ pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m5, m1, [r3 + 4 * 16] ; [20]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m4, m5
+ pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
- palignr m5, m2, m0, 8
- pmaddwd m5, [r3 - 2 * 16] ; [14]
+ palignr m5, m3, m2, 12
+ pmaddwd m5, [r3 - 16] ; [15]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m2, m0, 12
- pmaddwd m6, [r3 - 8 * 16] ; [ 8]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
- pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
+ pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m2, [r3 + 12 * 16] ; [28]
+ movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
+ palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
+ palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
+ punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
+ punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
+
+ palignr m1, m2, m3, 4
+ pmaddwd m1, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, m0, [r3 + 6 * 16] ; [22]
+ palignr m1, m2, m3, 8
+ mova m0, m1
+ pmaddwd m1, [r3 - 13 * 16] ; [3]
paddd m1, [pd_16]
psrld m1, 5
- psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10]
- palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7]
-
- pmaddwd m2, [r3] ; [16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m1, m2
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+ pmaddwd m0, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
- palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8]
- movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16]
- palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9]
- punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12]
- punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8]
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9]
- pmaddwd m4, m0, [r3 - 6 * 16] ; [10]
+ palignr m4, m2, m3, 12
+ pmaddwd m4, [r3 - 3 * 16] ; [13]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
+ pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, [r3 + 14 * 16] ; [30]
+ pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10]
- pmaddwd m6, [r3 + 8 * 16] ; [24]
+ palignr m6, m7, m2, 4
+ pmaddwd m6, [r3 - 4 * 16] ; [12]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11]
- pmaddwd m6, m1, [r3 + 2 * 16] ; [18]
+ palignr m1, m7, m2, 8
+ pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
+ pmaddwd m1, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13]
- pmaddwd m1, m2, [r3 - 10 * 16] ; [6]
+ palignr m1, m7, m2, 12
+ pmaddwd m1, [r3 - 5 * 16] ; [11]
paddd m1, [pd_16]
psrld m1, 5
-
packusdw m1, m1
- movhps m1, [r2 + 28] ; [00]
+ movhps m1, [r2 + 44] ; [00]
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
- movu m0, [r2 + 28] ; [35 34 33 32 31 30 29 28]
- palignr m1, m0, 2 ; [ x 35 34 33 32 31 30 29]
- punpckhwd m2, m0, m1 ; [ x 35 35 34 34 33 33 32]
- punpcklwd m0, m1 ; [32 31 31 30 30 29 29 28]
+%macro MODE_5_31 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
- pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
+ pmaddwd m4, m0, [r3 + 16] ; [17]
paddd m4, [pd_16]
psrld m4, 5
- palignr m1, m2, m0, 4 ; [33 32 32 31 31 30 30 29]
- pmaddwd m1, [r3 + 4 * 16] ; [20]
+ palignr m1, m2, m0, 4
+ mova m5, m1
+ pmaddwd m1, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- palignr m5, m2, m0, 8 ; [34 33 33 32 32 31 31 30]
- pmaddwd m5, [r3 - 2 * 16] ; [14]
+ pmaddwd m5, [r3 + 3 * 16] ; [19]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m2, m0, 12 ; [35 34 34 33 33 32 32 31]
- pmaddwd m6, [r3 - 8 * 16] ; [ 8]
+ palignr m6, m2, m0, 8
+ mova m1, m6
+ pmaddwd m6, [r3 - 12 * 16] ; [4]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pinsrw m2, [r2 + 44], 7 ; [35 34 34 33 33 32 32 31]
- pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
+ pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m2, [r3 + 12 * 16] ; [28]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m6, m2
+ palignr m1, m2, m0, 12
+ mova m7, m1
+ pmaddwd m7, [r3 - 10 * 16] ; [6]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m6, m7
- movu m3, [r2 + 38] ; [45 44 43 42 41 40 39 38]
- palignr m1, m3, 2 ; [ x 45 44 43 42 41 40 39]
- punpckhwd m2, m3, m1 ; [ x 35 35 34 34 33 33 32]
- punpcklwd m3, m1 ; [32 31 31 30 30 29 29 28]
+ pmaddwd m1, [r3 + 7 * 16] ; [23]
+ paddd m1, [pd_16]
+ psrld m1, 5
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m1, m7
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4]
+ pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
+ packusdw m4, m1
- palignr m0, m2, m3, 4
+ pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
+ palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
+ palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
+ punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
+ punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
+
+ palignr m6, m2, m7, 4
+ pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ pmaddwd m6, [r3 + 13 * 16] ; [29]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ palignr m1, m2, m7, 8
+ mova m0, m1
+ pmaddwd m1, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m0, m2, m7, 12
pmaddwd m0, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- palignr m5, m2, m3, 8
- pmaddwd m4, m5, [r3 - 6 * 16] ; [10]
+ pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
paddd m4, [pd_16]
psrld m4, 5
- palignr m5, m2, m3, 12
- pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
+ pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, [r3 + 14 * 16] ; [30]
+ palignr m1, m3, m2, 4
+ pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 46]
- palignr m1, m3, 2
- punpckhwd m2, m3, m1
- punpcklwd m3, m1
+ pmaddwd m1, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ palignr m1, m3, m2, 8
+ pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
paddd m6, [pd_16]
psrld m6, 5
- packusdw m5, m6
- palignr m6, m2, m3, 4
- pmaddwd m6, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m7, m3, m2, 12
+ pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m7, [r3 + 8 * 16] ; [24]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m1, m7
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
+ palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
+ punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
+
+ palignr m1, m0, m3, 4
+ pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m1, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ palignr m1, m0, m3, 8
+ pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
paddd m6, [pd_16]
psrld m6, 5
- palignr m1, m2, m3, 8
- pmaddwd m1, [r3 - 4 * 16] ; [12]
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
-
- palignr m1, m2, m3, 12
- pmaddwd m1, [r3 - 10 * 16] ; [06]
+
+ palignr m1, m0, m3, 12
+ pmaddwd m1, [r3 - 16] ; [15]
paddd m1, [pd_16]
psrld m1, 5
-
packusdw m1, m1
- movhps m1, [r2 + 54] ; [00]
+ movhps m1, [r2 + 36] ; [00]
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_3, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_3_33 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-%macro MODE_4_32 1
+%macro MODE_6_30 1
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
- pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
+ pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
paddd m4, [pd_16]
psrld m4, 5
- palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
- pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
+ pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, [r3 + 15 * 16] ; [31]
+ palignr m1, m2, m0, 4
+ pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m2, m0, 8
- pmaddwd m6, [r3 + 4 * 16] ; [ 20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
+ pmaddwd m1, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
- palignr m1, m2, m0, 12
- pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
+ palignr m1, m2, m0, 8
+ pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
+ pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m6, m7
- pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
+ pmaddwd m1, [r3 + 11 * 16] ; [27]
paddd m1, [pd_16]
psrld m1, 5
- palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4]
+ palignr m7, m2, m0, 12
pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
paddd m0, [pd_16]
psrld m0, 5
@@ -9151,405 +9508,340 @@ cglobal intra_pred_ang32_3, 3,6,8
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
+ pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
paddd m4, [pd_16]
psrld m4, 5
- movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
-
- palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
- palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
- punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
- punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
-
- palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 7 6]
- pmaddwd m1, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- palignr m5, m2, m7, 8
- mova m6, m5
- pmaddwd m5, [r3 - 9 * 16] ; [07]
+ pmaddwd m5, m2, [r3 - 16] ; [15]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, [r3 + 12 * 16] ; [28]
+ pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- palignr m6, m2, m7, 12
- pmaddwd m6, [r3 + 16] ; [17]
+ palignr m7, m3, m2, 4
+ pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
+ pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
+ movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
+ palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
+ palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
+ punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
+ punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
+
+ palignr m0, m2, m7, 4
+ pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
paddd m1, [pd_16]
psrld m1, 5
- palignr m7, m3, m2, 4
- pmaddwd m7, [r3] ; [16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m1, m7
- mova m7, m0
+ pmaddwd m0, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- palignr m0, m3, m2, 8
- pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
+ palignr m4, m2, m7, 4
+ pmaddwd m4, [r3 + 13 * 16] ; [29]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
+ palignr m5, m2, m7, 8
+ pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- palignr m5, m3, m2, 12
- pmaddwd m5, [r3 - 16] ; [15]
+ pmaddwd m5, [r3 + 7 * 16] ; [23]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
+ palignr m1, m2, m7, 12
+ pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
paddd m6, [pd_16]
psrld m6, 5
+ packusdw m5, m6
- movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
- palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
- palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
- punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
- punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
+ pmaddwd m6, m1, [r3 + 16] ; [17]
+ paddd m6, [pd_16]
+ psrld m6, 5
- palignr m1, m2, m3, 4
- pmaddwd m1, [r3 - 2 * 16] ; [14]
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- palignr m1, m2, m3, 8
- mova m0, m1
- pmaddwd m1, [r3 - 13 * 16] ; [3]
+ pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- palignr m4, m2, m3, 12
- pmaddwd m4, [r3 - 3 * 16] ; [13]
+ palignr m5, m3, m2, 4
+ pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
+ pmaddwd m5, [r3 + 15 * 16] ; [31]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m7, m2, 4
- pmaddwd m6, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
+ palignr m6, m3, m2, 8
+ pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
- palignr m1, m7, m2, 8
- pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
+ pmaddwd m6, [r3 + 9 * 16] ; [25]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
+ palignr m1, m3, m2, 12
+ pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m6, m0
- palignr m1, m7, m2, 12
- pmaddwd m1, [r3 - 5 * 16] ; [11]
+ pmaddwd m1, [r3 + 3 * 16] ; [19]
paddd m1, [pd_16]
psrld m1, 5
packusdw m1, m1
- movhps m1, [r2 + 44] ; [00]
+ movhps m1, [r2 + 28] ; [00]
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_4, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_4_32 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-%macro MODE_5_31 1
+%macro MODE_7_29 1
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
- pmaddwd m4, m0, [r3 + 16] ; [17]
+ pmaddwd m4, m0, [r3 - 7 * 16] ; [9]
paddd m4, [pd_16]
psrld m4, 5
- palignr m1, m2, m0, 4
- mova m5, m1
- pmaddwd m1, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, [r3 + 3 * 16] ; [19]
+ pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m2, m0, 8
- mova m1, m6
- pmaddwd m6, [r3 - 12 * 16] ; [4]
+ palignr m1, m2, m0, 4
+ pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
+ pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
paddd m6, [pd_16]
psrld m6, 5
- palignr m1, m2, m0, 12
- mova m7, m1
- pmaddwd m7, [r3 - 10 * 16] ; [6]
+ pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
paddd m7, [pd_16]
psrld m7, 5
packusdw m6, m7
- pmaddwd m1, [r3 + 7 * 16] ; [23]
+ pmaddwd m1, [r3 + 15 * 16] ; [31]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m1, m7
+ mova m3, m0
+ palignr m7, m2, m0, 8
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
+ pmaddwd m4, m7, [r3 + 16] ; [17]
paddd m4, [pd_16]
psrld m4, 5
- palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4]
- pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
+ pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
+ palignr m1, m2, m3, 12
+ pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
paddd m5, [pd_16]
psrld m5, 5
- movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
- palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
- palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
- punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
- punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
-
- palignr m6, m2, m7, 4
- pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
+ pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
- pmaddwd m6, [r3 + 13 * 16] ; [29]
+ pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
paddd m6, [pd_16]
psrld m6, 5
- palignr m1, m2, m7, 8
- mova m0, m1
- pmaddwd m1, [r3 - 2 * 16] ; [14]
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
+ pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
paddd m1, [pd_16]
psrld m1, 5
- palignr m0, m2, m7, 12
- pmaddwd m0, [r3] ; [16]
+ pmaddwd m0, m2, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
+ pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
+ movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
+ punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
+
+ palignr m6, m7, m2, 4
+ pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- palignr m1, m3, m2, 4
- pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
+ pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m1, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
+ pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
- palignr m1, m3, m2, 8
- pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
+ pmaddwd m6, [r3 + 13 * 16] ; [29]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, [r3 + 6 * 16] ; [22]
+ palignr m0, m7, m2, 8
+ pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- palignr m7, m3, m2, 12
- pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
+ pmaddwd m1, m0, [r3 - 16] ; [15]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m7, [r3 + 8 * 16] ; [24]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m1, m7
+ pmaddwd m0, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
+ palignr m0, m7, m2, 12
+ pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
- palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
- punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
-
- palignr m1, m0, m3, 4
- pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
+ pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m1, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
+ pmaddwd m0, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
- palignr m1, m0, m3, 8
- pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
+ pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
+ pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m6, m0
- palignr m1, m0, m3, 12
- pmaddwd m1, [r3 - 16] ; [15]
+ pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
paddd m1, [pd_16]
psrld m1, 5
packusdw m1, m1
- movhps m1, [r2 + 36] ; [00]
+ movhps m1, [r2 + 20] ; [00]
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_5, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_5_31 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-%macro MODE_6_30 1
+%macro MODE_8_28 1
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
- pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
+ pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- palignr m1, m2, m0, 4
- pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
+ pmaddwd m5, m0, [r3 - 16] ; [15]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m1, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
+ pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
- palignr m1, m2, m0, 8
- pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
+ pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
+ pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
- pmaddwd m1, [r3 + 11 * 16] ; [27]
+ palignr m7, m2, m0, 4
+ pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
paddd m1, [pd_16]
psrld m1, 5
- palignr m7, m2, m0, 12
+ mova m3, m0
pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
paddd m0, [pd_16]
psrld m0, 5
@@ -9557,962 +9849,794 @@ cglobal intra_pred_ang32_5, 3,6,8
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
+ pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m2, [r3 - 16] ; [15]
+ pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
+ pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- palignr m7, m3, m2, 4
- pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
+ palignr m7, m2, m3, 8
+ pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
+ pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
- palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
- palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
- punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
- punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
-
- palignr m0, m2, m7, 4
- pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
+ pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, [r3] ; [16]
+ pmaddwd m0, m7, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- palignr m4, m2, m7, 4
- pmaddwd m4, [r3 + 13 * 16] ; [29]
+ pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
paddd m4, [pd_16]
psrld m4, 5
- palignr m5, m2, m7, 8
- pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
+ pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, [r3 + 7 * 16] ; [23]
+ pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
paddd m5, [pd_16]
psrld m5, 5
- palignr m1, m2, m7, 12
- pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
+ palignr m7, m2, m3, 12
+ pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
- pmaddwd m6, m1, [r3 + 16] ; [17]
+ pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
+ pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- palignr m5, m3, m2, 4
- pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
+ pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, [r3 + 15 * 16] ; [31]
+ pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
paddd m5, [pd_16]
psrld m5, 5
- palignr m6, m3, m2, 8
- pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
+ pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
- pmaddwd m6, [r3 + 9 * 16] ; [25]
+ pmaddwd m6, m2, [r3 + 16] ; [17]
paddd m6, [pd_16]
psrld m6, 5
- palignr m1, m3, m2, 12
- pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
+ pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
paddd m0, [pd_16]
psrld m0, 5
packusdw m6, m0
- pmaddwd m1, [r3 + 3 * 16] ; [19]
+ pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
paddd m1, [pd_16]
psrld m1, 5
packusdw m1, m1
- movhps m1, [r2 + 28] ; [00]
+ movhps m1, [r2 + 12] ; [00]
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_6, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_6_30 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-%macro MODE_7_29 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+%macro MODE_9_27 1
+ movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ palignr m1, m3, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m3, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
- pmaddwd m4, m0, [r3 - 7 * 16] ; [9]
+ pmaddwd m4, m3, [r3 - 14 * 16] ; [2]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
+ pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
paddd m5, [pd_16]
psrld m5, 5
- palignr m1, m2, m0, 4
- pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
+ pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
- pmaddwd m1, [r3 + 15 * 16] ; [31]
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
- mova m3, m0
- palignr m7, m2, m0, 8
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ pmaddwd m0, m3, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 + 16] ; [17]
+ pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- palignr m1, m2, m3, 12
- pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
+ pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
+ pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m2, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
+ packusdw m1, m1
+ movhps m1, [r2 + 4] ; [00]
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
+ palignr m7, m2, m3, 4
+ pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
paddd m4, [pd_16]
psrld m4, 5
- movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
- punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
-
- palignr m6, m7, m2, 4
- pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
+ pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
paddd m0, [pd_16]
psrld m0, 5
packusdw m5, m0
- pmaddwd m6, [r3 + 13 * 16] ; [29]
+ pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
paddd m6, [pd_16]
psrld m6, 5
- palignr m0, m7, m2, 8
- pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
+ pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m0, [r3 - 16] ; [15]
+ pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m7, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- palignr m0, m7, m2, 12
- pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
+ pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
+ pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
+ pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, [r3 + 12 * 16] ; [28]
+ pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m5, m0
- pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
+ pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
+ pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
paddd m0, [pd_16]
psrld m0, 5
packusdw m6, m0
- pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 20] ; [00]
+ pmaddwd m7, [r3 + 14 * 16] ; [30]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m7, m7
+ movhps m7, [r2 + 6] ; [00]
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_7, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_7_29 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-%macro MODE_8_28 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+%macro MODE_11_25 1
+ movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
- pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
+ pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m0, [r3 - 16] ; [15]
+ pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
+ pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- palignr m7, m2, m0, 4
- pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
- mova m3, m0
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ pmaddwd m0, m3, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
+ pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
+ pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- palignr m7, m2, m3, 8
- pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
+ pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m7, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
+ packusdw m1, m1
+ movhps m1, [r2 + 2] ; [00]
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
+ movu m3, [r2] ; [6 5 4 3 2 1 0 16]
+ pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
+
+ pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
+ pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
paddd m5, [pd_16]
psrld m5, 5
- palignr m7, m2, m3, 12
- pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m5, m0
- pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
+ pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m3, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
+ pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
+ pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
- pmaddwd m6, m2, [r3 + 16] ; [17]
+ pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m6, m0
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
- pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
+
packusdw m1, m1
- movhps m1, [r2 + 12] ; [00]
+ movhps m1, [r2] ; [00]
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_8, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_8_28 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-%macro MODE_9_27 1
- movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m3, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m3, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
+%macro MODE_12_24 1
+ movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
- pmaddwd m4, m3, [r3 - 14 * 16] ; [2]
+ pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
+ pmaddwd m5, m3, [r3 + 16] ; [17]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
+ pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3] ; [16]
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
+ pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
paddd m1, [pd_16]
psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 4] ; [00]
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- palignr m7, m2, m3, 4
- pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
+ pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
+ pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
paddd m0, [pd_16]
psrld m0, 5
packusdw m5, m0
- pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
+ pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
+ pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m7, [r3] ; [16]
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
+ pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
+ pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
paddd m6, [pd_16]
psrld m6, 5
+ packusdw m5, m6
- pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m6, m0
-
- pmaddwd m7, [r3 + 14 * 16] ; [30]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m7, m7
- movhps m7, [r2 + 6] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_9, 3,6,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_9_27 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_10, 4,7,8
- mov r6d, 4
- add r1, r1
- lea r5, [r1 * 3]
- lea r4, [r1 * 2]
- lea r3, [r1 * 4]
- mova m7, [c_mode32_10_0]
-
-.loop:
- movu m0, [r2 + 2]
- pshufb m1, m0, m7
- movu [r0], m1
- movu [r0 + 16], m1
- movu [r0 + 32], m1
- movu [r0 + 48], m1
-
- palignr m1, m0, 2
- pshufb m1, m7
- movu [r0 + r1], m1
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m1
- movu [r0 + r1 + 48], m1
-
- palignr m1, m0, 4
- pshufb m1, m7
- movu [r0 + r4], m1
- movu [r0 + r4 + 16], m1
- movu [r0 + r4 + 32], m1
- movu [r0 + r4 + 48], m1
-
- palignr m1, m0, 6
- pshufb m1, m7
- movu [r0 + r5], m1
- movu [r0 + r5 + 16], m1
- movu [r0 + r5 + 32], m1
- movu [r0 + r5 + 48], m1
-
- add r0, r3
-
- palignr m1, m0, 8
- pshufb m1, m7
- movu [r0], m1
- movu [r0 + 16], m1
- movu [r0 + 32], m1
- movu [r0 + 48], m1
+ pmaddwd m6, m3, [r3 - 16] ; [15]
+ paddd m6, [pd_16]
+ psrld m6, 5
- palignr m1, m0, 10
- pshufb m1, m7
- movu [r0 + r1], m1
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m1
- movu [r0 + r1 + 48], m1
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
- palignr m1, m0, 12
- pshufb m1, m7
- movu [r0 + r4], m1
- movu [r0 + r4 + 16], m1
- movu [r0 + r4 + 32], m1
- movu [r0 + r4 + 48], m1
+ pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
+ paddd m1, [pd_16]
+ psrld m1, 5
- palignr m1, m0, 14
- pshufb m1, m7
- movu [r0 + r5], m1
- movu [r0 + r5 + 16], m1
- movu [r0 + r5 + 32], m1
- movu [r0 + r5 + 48], m1
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
- add r0, r3
- add r2, 16
- dec r6d
- jnz .loop
- RET
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
-%macro MODE_11_25 1
- movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
- pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
+%macro MODE_13_23 1
+ movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
- pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
+ pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3] ; [16]
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
+ pmaddwd m4, m3, [r3 - 16] ; [15]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
+ pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
paddd m1, [pd_16]
psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 2] ; [00]
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- movu m3, [r2] ; [6 5 4 3 2 1 0 16]
- pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
-
- pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
+ pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
paddd m0, [pd_16]
psrld m0, 5
packusdw m5, m0
- pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m3, [r3 + 16] ; [17]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3] ; [16]
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
+ pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 0*mmsize + 2], m0
- movu [rsp + 1*mmsize + 2], m1
- movu [rsp + 2*mmsize + 2], m2
- movu [rsp + 3*mmsize + 2], m3
- mov r4w, [r3+32]
- mov [rsp], r4w
- mov r4w, [r2+64]
- mov [rsp+66], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_11_25 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
+ psrld m1, 5
-%macro MODE_12_24 1
- movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_14_22 1
+ movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
- pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
+ pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 16] ; [17]
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
paddd m5, [pd_16]
psrld m5, 5
@@ -10521,22 +10645,25 @@ cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
+ movu m3, [r2 + 20]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
+ pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
paddd m1, [pd_16]
psrld m1, 5
+ movu m3, [r2 + 18]
+ pshufb m3, m2
+
pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
@@ -10544,37 +10671,43 @@ cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
+ pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
+ pmaddwd m5, m3, [r3 + 16] ; [17]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- movu m3, [r2 + 4]
+ movu m3, [r2 + 14]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
+ pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
paddd m1, [pd_16]
psrld m1, 5
@@ -10585,20 +10718,23 @@ cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
+ pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
+ pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 2]
+ movu m3, [r2 + 8]
pshufb m3, m2
pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
@@ -10606,57 +10742,66 @@ cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
psrld m0, 5
packusdw m5, m0
- pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
+ pmaddwd m6, m3, [r3 - 16] ; [15]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
paddd m4, [pd_16]
psrld m4, 5
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
+ pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
paddd m5, [pd_16]
psrld m5, 5
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 16] ; [15]
+ pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
+ pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
paddd m1, [pd_16]
psrld m1, 5
@@ -10665,55 +10810,69 @@ cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 0*mmsize + 8], m0
- movu [rsp + 1*mmsize + 8], m1
- movu [rsp + 2*mmsize + 8], m2
- movu [rsp + 3*mmsize + 8], m3
- mov r4w, [r2+64]
- mov [rsp+72], r4w
- mov r4w, [r3+12]
- mov [rsp+6], r4w
- mov r4w, [r3+26]
- mov [rsp+4], r4w
- mov r4w, [r3+38]
- mov [rsp+2], r4w
- mov r4w, [r3+52]
- mov [rsp], r4w
+%macro MODE_15_21 1
+ movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
+ pmaddwd m4, m3, [r3 - 16] ; [15]
+ paddd m4, [pd_16]
+ psrld m4, 5
-.loop:
- MODE_12_24 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
+ movu m3, [r2 + 30]
+ pshufb m3, m2
-%macro MODE_13_23 1
- movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
- pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
+ pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 28]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 26]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 24]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
@@ -10722,19 +10881,22 @@ cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 14]
+ movu m3, [r2 + 20]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
+ pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ movu m3, [r2 + 18]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
@@ -10743,17 +10905,70 @@ cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
paddd m1, [pd_16]
psrld m1, 5
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
movu m3, [r2 + 12]
pshufb m3, m2
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 16] ; [15]
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
paddd m4, [pd_16]
psrld m4, 5
@@ -10762,19 +10977,22 @@ cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
psrld m1, 5
packusdw m4, m1
- movu m3, [r2 + 10]
+ movu m3, [r2 + 4]
pshufb m3, m2
- pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
+ pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
paddd m6, [pd_16]
psrld m6, 5
@@ -10783,77 +11001,92 @@ cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
psrld m1, 5
packusdw m6, m1
- movu m3, [r2 + 8]
+ movu m3, [r2]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
+ pmaddwd m1, m3, [r3 + 16] ; [17]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
- pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
+%macro MODE_16_20 1
+ movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
paddd m4, [pd_16]
psrld m4, 5
- movu m3, [r2 + 6]
+ movu m3, [r2 + 38]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
+ pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
+ movu m3, [r2 + 36]
+ pshufb m3, m2
- pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
+ pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
paddd m6, [pd_16]
psrld m6, 5
+ packusdw m5, m6
- movu m3, [r2 + 4]
+ movu m3, [r2 + 34]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 + 16] ; [17]
+ movu m3, [r2 + 32]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
+ movu m3, [r2 + 30]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 2]
- pshufb m3, m2
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
+ pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ movu m3, [r2 + 28]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
+ movu m3, [r2 + 26]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
paddd m5, [pd_16]
psrld m5, 5
@@ -10862,74 +11095,39 @@ cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
psrld m6, 5
packusdw m5, m6
- movu m3, [r2]
+ movu m3, [r2 + 24]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
+ pmaddwd m6, m3, [r3 - 16] ; [15]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
+ pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
paddd m1, [pd_16]
psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 1*mmsize], m0
- movu [rsp + 2*mmsize], m1
- movu [rsp + 3*mmsize], m2
- movu [rsp + 4*mmsize], m3
-
- mov r4w, [r2+64]
- mov [rsp+80], r4w
- movu m0, [r3 + 8]
- movu m1, [r3 + 36]
- pshufb m0, [shuf_mode_13_23]
- pshufb m1, [shuf_mode_13_23]
- movh [rsp + 8], m0
- movh [rsp], m1
- mov r4w, [r3+28]
- mov [rsp+8], r4w
- mov r4w, [r3+56]
- mov [rsp], r4w
+ movu m3, [r2 + 20]
+ pshufb m3, m2
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
-.loop:
- MODE_13_23 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-%macro MODE_14_22 1
- movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+ movu m3, [r2 + 18]
+ pshufb m3, m2
- pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
+ pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
paddd m4, [pd_16]
psrld m4, 5
@@ -10938,49 +11136,55 @@ cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
psrld m1, 5
packusdw m4, m1
- movu m3, [r2 + 22]
+ movu m3, [r2 + 16]
pshufb m3, m2
- pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
+ pmaddwd m5, m3, [r3 + 16] ; [17]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 20]
+ movu m3, [r2 + 14]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
+ pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
paddd m6, [pd_16]
psrld m6, 5
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
paddd m1, [pd_16]
psrld m1, 5
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
paddd m4, [pd_16]
psrld m4, 5
- movu m3, [r2 + 16]
+ movu m3, [r2 + 6]
pshufb m3, m2
pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
@@ -10988,19 +11192,22 @@ cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 + 16] ; [17]
+ pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
paddd m5, [pd_16]
psrld m5, 5
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- movu m3, [r2 + 14]
+ movu m3, [r2 + 2]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
+ pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
paddd m6, [pd_16]
psrld m6, 5
@@ -11009,363 +11216,619 @@ cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
psrld m1, 5
packusdw m6, m1
- movu m3, [r2 + 12]
+ movu m3, [r2]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
+ pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
- pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
+%macro MODE_17_19 1
+ movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
paddd m4, [pd_16]
psrld m4, 5
- movu m3, [r2 + 10]
+ movu m3, [r2 + 48]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
+ movu m3, [r2 + 46]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 8]
+ movu m3, [r2 + 44]
pshufb m3, m2
- pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
- pmaddwd m6, m3, [r3 - 16] ; [15]
+ movu m3, [r2 + 42]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
paddd m6, [pd_16]
psrld m6, 5
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- movu m3, [r2 + 6]
+ movu m3, [r2 + 40]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
+ movu m3, [r2 + 38]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- movu m3, [r2 + 4]
+ movu m3, [r2 + 36]
pshufb m3, m2
- pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
+ pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
paddd m4, [pd_16]
psrld m4, 5
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ movu m3, [r2 + 34]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
+ pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 2]
+ movu m3, [r2 + 32]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
+ movu m3, [r2 + 30]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
paddd m6, [pd_16]
psrld m6, 5
- movu m3, [r2]
+ movu m3, [r2 + 28]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
+ movu m3, [r2 + 26]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
paddd m1, [pd_16]
psrld m1, 5
packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_14, 4,6,7,0-(5*mmsize+10)
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 1*mmsize + 8], m0
- movu [rsp + 2*mmsize + 8], m1
- movu [rsp + 3*mmsize + 8], m2
- movu [rsp + 4*mmsize + 8], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 88], r4w
- mov r4w, [r3+4]
- mov [rsp+22], r4w
- movu m0, [r3 + 10]
- movu m1, [r3 + 30]
- movu m2, [r3 + 50]
- pshufb m0, [shuf_mode_14_22]
- pshufb m1, [shuf_mode_14_22]
- pshufb m2, [shuf_mode_14_22]
- movh [rsp + 14], m0
- movh [rsp + 6], m1
- movh [rsp - 2], m2
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
+ movhps m1, [r2 + 26] ; [00]
-.loop:
- MODE_14_22 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-%macro MODE_15_21 1
- movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+ movu m3, [r2 + 24]
+ pshufb m3, m2
- pmaddwd m4, m3, [r3 - 16] ; [15]
+ pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
paddd m4, [pd_16]
psrld m4, 5
- movu m3, [r2 + 30]
+ movu m3, [r2 + 22]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
+ movu m3, [r2 + 20]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 28]
+ movu m3, [r2 + 18]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
- pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
paddd m6, [pd_16]
psrld m6, 5
- movu m3, [r2 + 26]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
paddd m1, [pd_16]
psrld m1, 5
- movu m3, [r2 + 24]
+ movu m3, [r2 + 12]
pshufb m3, m2
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ pmaddwd m0, m3, [r3] ; [16]
paddd m0, [pd_16]
psrld m0, 5
packusdw m1, m0
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
- pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
paddd m4, [pd_16]
psrld m4, 5
- movu m3, [r2 + 22]
+ movu m3, [r2 + 8]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
paddd m1, [pd_16]
psrld m1, 5
packusdw m4, m1
- pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
+ pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
paddd m5, [pd_16]
psrld m5, 5
- movu m3, [r2 + 20]
+ movu m3, [r2 + 6]
pshufb m3, m2
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
paddd m6, [pd_16]
psrld m6, 5
packusdw m5, m6
- pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
paddd m6, [pd_16]
psrld m6, 5
- movu m3, [r2 + 18]
+ movu m3, [r2 + 2]
pshufb m3, m2
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
paddd m1, [pd_16]
psrld m1, 5
packusdw m6, m1
- pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
- paddd m1, [pd_16]
- psrld m1, 5
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+;------------------------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang32_2, 3,6,6
+ lea r4, [r2]
+ add r2, 128
+ cmp r3m, byte 34
+ cmove r2, r4
+
+ add r1, r1
+ lea r3, [r1 * 2]
+ lea r4, [r1 * 3]
+ mov r5, 2
+
+.loop:
+ MODE_2_34
+ add r2, 32
+ dec r5
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_3, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
+
+.loop:
+ MODE_3_33 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_4, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
+
+.loop:
+ MODE_4_32 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_5, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
+
+.loop:
+ MODE_5_31 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_6, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
+
+.loop:
+ MODE_6_30 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_7, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
+
+.loop:
+ MODE_7_29 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_8, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
+
+.loop:
+ MODE_8_28 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- movu m3, [r2 + 16]
- pshufb m3, m2
+INIT_XMM sse4
+cglobal intra_pred_ang32_9, 3,6,8
+ add r2, 128
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ add r1, r1
+ lea r5, [r1 * 3]
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
+.loop:
+ MODE_9_27 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+INIT_XMM sse4
+cglobal intra_pred_ang32_10, 3,7,8
+ add r2, 128
+ mov r6d, 4
+ add r1, r1
+ lea r5, [r1 * 3]
+ lea r4, [r1 * 2]
+ lea r3, [r1 * 4]
+ mova m7, [c_mode32_10_0]
- movu m3, [r2 + 14]
- pshufb m3, m2
+.loop:
+ movu m0, [r2 + 2]
+ pshufb m1, m0, m7
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + 32], m1
+ movu [r0 + 48], m1
- pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
- paddd m4, [pd_16]
- psrld m4, 5
+ palignr m1, m0, 2
+ pshufb m1, m7
+ movu [r0 + r1], m1
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m1
+ movu [r0 + r1 + 48], m1
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
+ palignr m1, m0, 4
+ pshufb m1, m7
+ movu [r0 + r4], m1
+ movu [r0 + r4 + 16], m1
+ movu [r0 + r4 + 32], m1
+ movu [r0 + r4 + 48], m1
- movu m3, [r2 + 12]
- pshufb m3, m2
+ palignr m1, m0, 6
+ pshufb m1, m7
+ movu [r0 + r5], m1
+ movu [r0 + r5 + 16], m1
+ movu [r0 + r5 + 32], m1
+ movu [r0 + r5 + 48], m1
- pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
- paddd m5, [pd_16]
- psrld m5, 5
+ add r0, r3
- pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
+ palignr m1, m0, 8
+ pshufb m1, m7
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + 32], m1
+ movu [r0 + 48], m1
- movu m3, [r2 + 10]
- pshufb m3, m2
+ palignr m1, m0, 10
+ pshufb m1, m7
+ movu [r0 + r1], m1
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m1
+ movu [r0 + r1 + 48], m1
- pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
+ palignr m1, m0, 12
+ pshufb m1, m7
+ movu [r0 + r4], m1
+ movu [r0 + r4 + 16], m1
+ movu [r0 + r4 + 32], m1
+ movu [r0 + r4 + 48], m1
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
+ palignr m1, m0, 14
+ pshufb m1, m7
+ movu [r0 + r5], m1
+ movu [r0 + r5 + 16], m1
+ movu [r0 + r5 + 32], m1
+ movu [r0 + r5 + 48], m1
- movu m3, [r2 + 8]
- pshufb m3, m2
+ add r0, r3
+ add r2, 16
+ dec r6d
+ jnz .loop
+ RET
- pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
- paddd m1, [pd_16]
- psrld m1, 5
+INIT_XMM sse4
+cglobal intra_pred_ang32_11, 3,6,7,0-(4*mmsize+4)
+ mov r3, r2mp
+ add r2, 128
+ movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
+ movu m1, [r2 + 1*mmsize]
+ movu m2, [r2 + 2*mmsize]
+ movu m3, [r2 + 3*mmsize]
+ movu [rsp + 0*mmsize + 2], m0
+ movu [rsp + 1*mmsize + 2], m1
+ movu [rsp + 2*mmsize + 2], m2
+ movu [rsp + 3*mmsize + 2], m3
+ mov r4w, [r3+32]
+ mov [rsp], r4w
+ mov r4w, [r2+64]
+ mov [rsp+66], r4w
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ mov r2, rsp
+ add r1, r1
+ lea r5, [r1 * 3]
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+.loop:
+ MODE_11_25 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- movu m3, [r2 + 6]
- pshufb m3, m2
+INIT_XMM sse4
+cglobal intra_pred_ang32_12, 3,6,7,0-(4*mmsize+10)
+ mov r3, r2mp
+ add r2, 128
+ movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
+ movu m1, [r2 + 1*mmsize]
+ movu m2, [r2 + 2*mmsize]
+ movu m3, [r2 + 3*mmsize]
+ movu [rsp + 0*mmsize + 8], m0
+ movu [rsp + 1*mmsize + 8], m1
+ movu [rsp + 2*mmsize + 8], m2
+ movu [rsp + 3*mmsize + 8], m3
- pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
+ mov r4w, [r2+64]
+ mov [rsp+72], r4w
+ mov r4w, [r3+12]
+ mov [rsp+6], r4w
+ mov r4w, [r3+26]
+ mov [rsp+4], r4w
+ mov r4w, [r3+38]
+ mov [rsp+2], r4w
+ mov r4w, [r3+52]
+ mov [rsp], r4w
- pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ mov r2, rsp
+ add r1, r1
+ lea r5, [r1 * 3]
+ mova m2, [pw_punpcklwd]
- movu m3, [r2 + 4]
- pshufb m3, m2
+.loop:
+ MODE_12_24 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
- paddd m5, [pd_16]
- psrld m5, 5
+INIT_XMM sse4
+cglobal intra_pred_ang32_13, 3,6,7,0-(5*mmsize+2)
+ mov r3, r2mp
+ add r2, 128
+ movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
+ movu m1, [r2 + 1*mmsize]
+ movu m2, [r2 + 2*mmsize]
+ movu m3, [r2 + 3*mmsize]
+ movu [rsp + 1*mmsize], m0
+ movu [rsp + 2*mmsize], m1
+ movu [rsp + 3*mmsize], m2
+ movu [rsp + 4*mmsize], m3
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
+ mov r4w, [r2+64]
+ mov [rsp+80], r4w
+ movu m0, [r3 + 8]
+ movu m1, [r3 + 36]
+ pshufb m0, [shuf_mode_13_23]
+ pshufb m1, [shuf_mode_13_23]
+ movh [rsp + 8], m0
+ movh [rsp], m1
+ mov r4w, [r3+28]
+ mov [rsp+8], r4w
+ mov r4w, [r3+56]
+ mov [rsp], r4w
- movu m3, [r2 + 2]
- pshufb m3, m2
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ mov r2, rsp
+ add r1, r1
+ lea r5, [r1 * 3]
+ mova m2, [pw_punpcklwd]
- pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
+.loop:
+ MODE_13_23 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
+INIT_XMM sse4
+cglobal intra_pred_ang32_14, 3,6,7,0-(5*mmsize+10)
+ mov r3, r2mp
+ add r2, 128
+ movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
+ movu m1, [r2 + 1*mmsize]
+ movu m2, [r2 + 2*mmsize]
+ movu m3, [r2 + 3*mmsize]
+ movu [rsp + 1*mmsize + 8], m0
+ movu [rsp + 2*mmsize + 8], m1
+ movu [rsp + 3*mmsize + 8], m2
+ movu [rsp + 4*mmsize + 8], m3
- movu m3, [r2]
- pshufb m3, m2
+ mov r4w, [r2 + 64]
+ mov [rsp + 88], r4w
+ mov r4w, [r3+4]
+ mov [rsp+22], r4w
+ movu m0, [r3 + 10]
+ movu m1, [r3 + 30]
+ movu m2, [r3 + 50]
+ pshufb m0, [shuf_mode_14_22]
+ pshufb m1, [shuf_mode_14_22]
+ pshufb m2, [shuf_mode_14_22]
+ movh [rsp + 14], m0
+ movh [rsp + 6], m1
+ movh [rsp - 2], m2
- pmaddwd m1, m3, [r3 + 16] ; [17]
- paddd m1, [pd_16]
- psrld m1, 5
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ mov r2, rsp
+ add r1, r1
+ lea r5, [r1 * 3]
+ mova m2, [pw_punpcklwd]
- packusdw m1, m1
- movhps m1, [r2] ; [00]
+.loop:
+ MODE_14_22 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2)
+cglobal intra_pred_ang32_15, 3,6,7,0-(6*mmsize+2)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -11396,234 +11859,20 @@ cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2)
lea r5, [r1 * 3]
mova m2, [pw_punpcklwd]
-.loop:
- MODE_15_21 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-%macro MODE_16_20 1
- movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 38]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 36]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 34]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 32]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 30]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 28]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 26]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 24]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 16] ; [15]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 22]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 20]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 16]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 16] ; [17]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
+.loop:
+ MODE_15_21 1
+ lea r0, [r0 + r1 * 4 ]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10)
+cglobal intra_pred_ang32_16, 3,6,7,0-(6*mmsize+10)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -11662,239 +11911,12 @@ cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10)
jnz .loop
RET
-%macro MODE_17_19 1
- movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 48]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 46]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 44]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 42]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 40]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 38]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 36]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 34]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 32]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 30]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 28]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 26]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2 + 26] ; [00]
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 24]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 22]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 20]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- movu m3, [r2 + 16]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4)
+cglobal intra_pred_ang32_17, 3,6,7,0-(7*mmsize+4)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -11939,11 +11961,10 @@ cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4)
jnz .loop
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_18, 4,7,8
+cglobal intra_pred_ang32_18, 3,7,8
+ mov r3, r2mp
+ add r2, 128
movu m0, [r3] ; [7 6 5 4 3 2 1 0]
movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8]
movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16]
@@ -12251,12 +12272,9 @@ cglobal intra_pred_ang32_18, 4,7,8
movu [r0 + r3 + 48], m6
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4)
- xchg r2, r3
+cglobal intra_pred_ang32_19, 3,7,7,0-(7*mmsize+4)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12304,12 +12322,9 @@ cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4)
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10)
- xchg r2, r3
+cglobal intra_pred_ang32_20, 3,7,7,0-(6*mmsize+10)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12351,12 +12366,9 @@ cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10)
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2)
- xchg r2, r3
+cglobal intra_pred_ang32_21, 3,7,7,0-(6*mmsize+2)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12398,12 +12410,9 @@ cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2)
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10)
- xchg r2, r3
+cglobal intra_pred_ang32_22, 3,7,7,0-(5*mmsize+10)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12444,12 +12453,9 @@ cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10)
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2)
- xchg r2, r3
+cglobal intra_pred_ang32_23, 3,7,7,0-(5*mmsize+2)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12489,12 +12495,9 @@ cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2)
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10)
- xchg r2, r3
+cglobal intra_pred_ang32_24, 3,7,7,0-(4*mmsize+10)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12533,12 +12536,9 @@ cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10)
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4)
- xchg r2, r3
+cglobal intra_pred_ang32_25, 3,7,7,0-(4*mmsize+4)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -12552,38 +12552,35 @@ cglobal intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4)
mov r4w, [r2+64]
mov [rsp+66], r4w
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 8
+ mov r2, rsp
+ add r1, r1
+ lea r5, [r1 * 3]
+ mov r6, r0
.loop:
MODE_11_25 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
+ add r6, 8
+ mov r0, r6
+ add r2, 8
+ dec r4
+ jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_26, 4,7,5
+cglobal intra_pred_ang32_26, 3,7,5
mov r6d, 4
add r1, r1
- lea r2, [r1 * 2]
+ lea r3, [r1 * 2]
lea r4, [r1 * 3]
lea r5, [r1 * 4]
mova m4, [c_mode32_10_0]
- movu m0, [r3 + 2]
- movu m1, [r3 + 18]
- movu m2, [r3 + 34]
- movu m3, [r3 + 50]
+ movu m0, [r2 + 2 ]
+ movu m1, [r2 + 18]
+ movu m2, [r2 + 34]
+ movu m3, [r2 + 50]
.loop:
movu [r0], m0
@@ -12596,10 +12593,10 @@ cglobal intra_pred_ang32_26, 4,7,5
movu [r0 + r1 + 32], m2
movu [r0 + r1 + 48], m3
- movu [r0 + r2], m0
- movu [r0 + r2 + 16], m1
- movu [r0 + r2 + 32], m2
- movu [r0 + r2 + 48], m3
+ movu [r0 + r3], m0
+ movu [r0 + r3 + 16], m1
+ movu [r0 + r3 + 32], m2
+ movu [r0 + r3 + 48], m3
movu [r0 + r4], m0
movu [r0 + r4 + 16], m1
@@ -12618,10 +12615,10 @@ cglobal intra_pred_ang32_26, 4,7,5
movu [r0 + r1 + 32], m2
movu [r0 + r1 + 48], m3
- movu [r0 + r2], m0
- movu [r0 + r2 + 16], m1
- movu [r0 + r2 + 32], m2
- movu [r0 + r2 + 48], m3
+ movu [r0 + r3], m0
+ movu [r0 + r3 + 16], m1
+ movu [r0 + r3 + 32], m2
+ movu [r0 + r3 + 48], m3
movu [r0 + r4], m0
movu [r0 + r4 + 16], m1
@@ -12633,12 +12630,8 @@ cglobal intra_pred_ang32_26, 4,7,5
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_27, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_27, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
@@ -12654,12 +12647,8 @@ cglobal intra_pred_ang32_27, 4,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_28, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_28, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
@@ -12675,12 +12664,8 @@ cglobal intra_pred_ang32_28, 4,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_29, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_29, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
@@ -12696,12 +12681,8 @@ cglobal intra_pred_ang32_29, 4,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_30, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_30, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
@@ -12717,12 +12698,8 @@ cglobal intra_pred_ang32_30, 4,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_31, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_31, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
@@ -12738,12 +12715,8 @@ cglobal intra_pred_ang32_31, 4,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_32, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_32, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
@@ -12759,12 +12732,8 @@ cglobal intra_pred_ang32_32, 4,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_33, 4,7,8
- xchg r2, r3mp
+cglobal intra_pred_ang32_33, 3,7,8
lea r3, [ang_table + 16 * 16]
add r1, r1
lea r5, [r1 * 3]
diff --git a/source/common/x86/intrapred8.asm b/source/common/x86/intrapred8.asm
index 0ababc6..76a1026 100644
--- a/source/common/x86/intrapred8.asm
+++ b/source/common/x86/intrapred8.asm
@@ -54,6 +54,17 @@ c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4,
c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
+;; (blkSize - 1 - x)
+pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
+pw_planar4_1: dw 3, 3, 3, 3, 3, 3, 3, 3
+pw_planar8_0: dw 7, 6, 5, 4, 3, 2, 1, 0
+pw_planar8_1: dw 7, 7, 7, 7, 7, 7, 7, 7
+pw_planar16_0: dw 15, 14, 13, 12, 11, 10, 9, 8
+pw_planar16_1: dw 15, 15, 15, 15, 15, 15, 15, 15
+pw_planar32_1: dw 31, 31, 31, 31, 31, 31, 31, 31
+pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24
+pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16
+
const ang_table
%assign x 0
%rep 32
@@ -63,7 +74,10 @@ const ang_table
SECTION .text
+cextern pw_4
cextern pw_8
+cextern pw_16
+cextern pw_32
cextern pw_1024
cextern pb_unpackbd1
cextern multiL
@@ -72,17 +86,15 @@ cextern multiH2
cextern multiH3
cextern multi_2Row
-;-----------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc4, 4,6,3
- mov r4d, r5m
+cglobal intra_pred_dc4, 5,5,3
inc r2
- inc r3
pxor m0, m0
movd m1, [r2]
- movd m2, [r3]
+ movd m2, [r2 + 8]
punpckldq m1, m2
psadbw m1, m0 ; m1 = sum
@@ -95,37 +107,37 @@ cglobal intra_pred_dc4, 4,6,3
pshufb m1, m0 ; m1 = byte [dc_val ...]
; store DC 4x4
- lea r5, [r1 * 3]
+ lea r3, [r1 * 3]
movd [r0], m1
movd [r0 + r1], m1
movd [r0 + r1 * 2], m1
- movd [r0 + r5], m1
+ movd [r0 + r3], m1
; do DC filter
jz .end
- lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
- add r4d, r5d ; r4d = DC * 3 + 2
+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
+ add r4d, r3d ; r4d = DC * 3 + 2
movd m1, r4d
pshuflw m1, m1, 0 ; m1 = pixDCx3
; filter top
- pmovzxbw m2, [r3]
+ pmovzxbw m2, [r2]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movd [r0], m2 ; overwrite top-left pixel, we will update it later
; filter top-left
- movzx r3d, byte [r3]
- add r5d, r3d
- movzx r3d, byte [r2]
- add r3d, r5d
+ movzx r4d, byte [r2 + 8]
+ add r3d, r4d
+ movzx r4d, byte [r2]
+ add r3d, r4d
shr r3d, 2
mov [r0], r3b
; filter left
add r0, r1
- pmovzxbw m2, [r2 + 1]
+ pmovzxbw m2, [r2 + 9]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
@@ -136,15 +148,13 @@ cglobal intra_pred_dc4, 4,6,3
.end:
RET
-
-;-------------------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc8, 4, 7, 3
- mov r4d, r5m
+cglobal intra_pred_dc8, 5, 7, 3
+ lea r3, [r2 + 17]
inc r2
- inc r3
pxor m0, m0
movh m1, [r2]
movh m2, [r3]
@@ -184,15 +194,15 @@ cglobal intra_pred_dc8, 4, 7, 3
pshufd m1, m1, 0
; filter top
- pmovzxbw m2, [r3]
+ pmovzxbw m2, [r2]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movh [r6], m2
; filter top-left
- movzx r3d, byte [r3]
- add r4d, r3d
+ movzx r5d, byte [r3]
+ add r4d, r5d
movzx r3d, byte [r2]
add r3d, r4d
shr r3d, 2
@@ -200,7 +210,7 @@ cglobal intra_pred_dc8, 4, 7, 3
; filter left
add r6, r1
- pmovzxbw m2, [r2 + 1]
+ pmovzxbw m2, [r2 + 17]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
@@ -217,14 +227,13 @@ cglobal intra_pred_dc8, 4, 7, 3
.end:
RET
-;-------------------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-------------------------------------------------------------------------------------------
+;--------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;--------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
- mov r4d, r5m
+ lea r3, [r2 + 33]
inc r2
- inc r3
pxor m0, m0
movu m1, [r2]
movu m2, [r3]
@@ -277,20 +286,20 @@ cglobal intra_pred_dc16, 5, 7, 4
pshufd m1, m1, 0
; filter top
- pmovzxbw m2, [r3]
+ pmovzxbw m2, [r2]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movh [r6], m2
- pmovzxbw m3, [r3 + 8]
+ pmovzxbw m3, [r2 + 8]
paddw m3, m1
psraw m3, 2
packuswb m3, m3
movh [r6 + 8], m3
; filter top-left
- movzx r3d, byte [r3]
- add r4d, r3d
+ movzx r5d, byte [r3]
+ add r4d, r5d
movzx r3d, byte [r2]
add r3d, r4d
shr r3d, 2
@@ -298,7 +307,7 @@ cglobal intra_pred_dc16, 5, 7, 4
; filter left
add r6, r1
- pmovzxbw m2, [r2 + 1]
+ pmovzxbw m2, [r2 + 33]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
@@ -314,7 +323,7 @@ cglobal intra_pred_dc16, 5, 7, 4
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m2, 7
- pmovzxbw m3, [r2 + 9]
+ pmovzxbw m3, [r2 + 41]
paddw m3, m1
psraw m3, 2
packuswb m3, m3
@@ -332,13 +341,13 @@ cglobal intra_pred_dc16, 5, 7, 4
.end:
RET
-;-------------------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc32, 4, 5, 5
+cglobal intra_pred_dc32, 3, 5, 5
+ lea r3, [r2 + 65]
inc r2
- inc r3
pxor m0, m0
movu m1, [r2]
movu m2, [r2 + 16]
@@ -406,324 +415,312 @@ cglobal intra_pred_dc32, 4, 5, 5
RET
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_planar4, 4,7,5
- inc r2
- inc r3
- pmovzxbw m0, [r3] ; topRow[i] = above[i];
- punpcklqdq m0, m0
-
- pxor m1, m1
- movd m2, [r2 + 4] ; bottomLeft = left[4]
- movzx r6d, byte [r3 + 4] ; topRight = above[4];
- pshufb m2, m1
- punpcklbw m2, m1
- psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i]
- psllw m0, 2
- punpcklqdq m3, m2, m1
- psubw m0, m3
- paddw m2, m2
-
-%macro COMP_PRED_PLANAR_2ROW 1
- movzx r4d, byte [r2 + %1]
- lea r4d, [r4d * 4 + 4]
- movd m3, r4d
- pshuflw m3, m3, 0
-
- movzx r4d, byte [r2 + %1 + 1]
- lea r4d, [r4d * 4 + 4]
- movd m4, r4d
- pshuflw m4, m4, 0
- punpcklqdq m3, m4 ; horPred
-
- movzx r4d, byte [r2 + %1]
- mov r5d, r6d
- sub r5d, r4d
- movd m4, r5d
- pshuflw m4, m4, 0
-
- movzx r4d, byte [r2 + %1 + 1]
- mov r5d, r6d
- sub r5d, r4d
- movd m1, r5d
- pshuflw m1, m1, 0
- punpcklqdq m4, m1 ; rightColumnN
-
- pmullw m4, [multi_2Row]
- paddw m3, m4
- paddw m0, m2
- paddw m3, m0
- psraw m3, 3
- packuswb m3, m3
-
- movd [r0], m3
- pshufd m3, m3, 0x55
- movd [r0 + r1], m3
- lea r0, [r0 + 2 * r1]
-%endmacro
-
- COMP_PRED_PLANAR_2ROW 0
- COMP_PRED_PLANAR_2ROW 2
-
+cglobal intra_pred_planar4, 3,3,7
+ pmovzxbw m1, [r2 + 1]
+ pmovzxbw m2, [r2 + 9]
+ pshufhw m3, m1, 0 ; topRight
+ pshufd m3, m3, 0xAA
+ pshufhw m4, m2, 0 ; bottomLeft
+ pshufd m4, m4, 0xAA
+
+ pmullw m3, [multi_2Row] ; (x + 1) * topRight
+ pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x]
+ mova m6, [pw_planar4_0]
+ paddw m3, [pw_4]
+ paddw m3, m4
+ paddw m3, m0
+ psubw m4, m1
+
+ pshuflw m5, m2, 0
+ pmullw m5, m6
+ paddw m5, m3
+ paddw m3, m4
+ psraw m5, 3
+ packuswb m5, m5
+ movd [r0], m5
+
+ pshuflw m5, m2, 01010101b
+ pmullw m5, m6
+ paddw m5, m3
+ paddw m3, m4
+ psraw m5, 3
+ packuswb m5, m5
+ movd [r0 + r1], m5
+ lea r0, [r0 + 2 * r1]
+
+ pshuflw m5, m2, 10101010b
+ pmullw m5, m6
+ paddw m5, m3
+ paddw m3, m4
+ psraw m5, 3
+ packuswb m5, m5
+ movd [r0], m5
+
+ pshuflw m5, m2, 11111111b
+ pmullw m5, m6
+ paddw m5, m3
+ paddw m3, m4
+ psraw m5, 3
+ packuswb m5, m5
+ movd [r0 + r1], m5
RET
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_planar8, 4,4,7
- inc r2
- inc r3
- pxor m0, m0
- pmovzxbw m1, [r3] ; v_topRow
- pmovzxbw m2, [r2] ; v_leftColumn
-
- movd m3, [r3 + 8] ; topRight = above[8];
- movd m4, [r2 + 8] ; bottomLeft = left[8];
-
- pshufb m3, m0
- pshufb m4, m0
- punpcklbw m3, m0 ; v_topRight
- punpcklbw m4, m0 ; v_bottomLeft
-
- psubw m4, m1 ; v_bottomRow
- psubw m3, m2 ; v_rightColumn
-
- psllw m1, 3 ; v_topRow
- psllw m2, 3 ; v_leftColumn
-
- paddw m6, m2, [pw_8]
-
-%macro PRED_PLANAR_ROW8 1
- %if (%1 < 4)
- pshuflw m5, m6, 0x55 * %1
- pshufd m5, m5, 0
- pshuflw m2, m3, 0x55 * %1
- pshufd m2, m2, 0
- %else
- pshufhw m5, m6, 0x55 * (%1 - 4)
- pshufd m5, m5, 0xAA
- pshufhw m2, m3, 0x55 * (%1 - 4)
- pshufd m2, m2, 0xAA
- %endif
-
- pmullw m2, [multiL]
- paddw m5, m2
- paddw m1, m4
- paddw m5, m1
- psraw m5, 4
- packuswb m5, m5
-
- movh [r0], m5
- lea r0, [r0 + r1]
-
+cglobal intra_pred_planar8, 3,3,7
+ pmovzxbw m1, [r2 + 1]
+ pmovzxbw m2, [r2 + 17]
+
+ movd m3, [r2 + 9] ; topRight = above[8];
+ movd m4, [r2 + 25] ; bottomLeft = left[8];
+
+ pxor m0, m0
+ pshufb m3, m0
+ pshufb m4, m0
+ punpcklbw m3, m0 ; v_topRight
+ punpcklbw m4, m0 ; v_bottomLeft
+
+ pmullw m3, [multiL] ; (x + 1) * topRight
+ pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x]
+ mova m6, [pw_planar8_0]
+ paddw m3, [pw_8]
+ paddw m3, m4
+ paddw m3, m0
+ psubw m4, m1
+
+%macro INTRA_PRED_PLANAR8 1
+%if (%1 < 4)
+ pshuflw m5, m2, 0x55 * %1
+ pshufd m5, m5, 0
+%else
+ pshufhw m5, m2, 0x55 * (%1 - 4)
+ pshufd m5, m5, 0xAA
+%endif
+ pmullw m5, m6
+ paddw m5, m3
+ paddw m3, m4
+ psraw m5, 4
+ packuswb m5, m5
+ movh [r0], m5
+ lea r0, [r0 + r1]
%endmacro
- PRED_PLANAR_ROW8 0
- PRED_PLANAR_ROW8 1
- PRED_PLANAR_ROW8 2
- PRED_PLANAR_ROW8 3
- PRED_PLANAR_ROW8 4
- PRED_PLANAR_ROW8 5
- PRED_PLANAR_ROW8 6
- PRED_PLANAR_ROW8 7
-
+ INTRA_PRED_PLANAR8 0
+ INTRA_PRED_PLANAR8 1
+ INTRA_PRED_PLANAR8 2
+ INTRA_PRED_PLANAR8 3
+ INTRA_PRED_PLANAR8 4
+ INTRA_PRED_PLANAR8 5
+ INTRA_PRED_PLANAR8 6
+ INTRA_PRED_PLANAR8 7
RET
-
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_planar16, 4,6,8
- inc r2
- inc r3
- pxor m0, m0
- pmovzxbw m1, [r3] ; topRow[0-7]
- pmovzxbw m2, [r3 + 8] ; topRow[8-15]
-
- movd m3, [r2 + 16]
- pshufb m3, m0
- punpcklbw m3, m0 ; v_bottomLeft = left[16]
- movzx r4d, byte [r3 + 16] ; topRight = above[16]
-
- psubw m4, m3, m1 ; v_bottomRow[0]
- psubw m5, m3, m2 ; v_bottomRow[1]
-
- psllw m1, 4
- psllw m2, 4
-
-%macro PRED_PLANAR_ROW16 1
- movzx r5d, byte [r2 + %1]
- add r5d, r5d
- lea r5d, [r5d * 8 + 16]
- movd m3, r5d
- pshuflw m3, m3, 0
- pshufd m3, m3, 0 ; horPred
-
- movzx r5d, byte [r2 + %1]
- mov r3d, r4d
- sub r3d, r5d
- movd m6, r3d
- pshuflw m6, m6, 0
- pshufd m6, m6, 0
-
- pmullw m7, m6, [multiL]
- paddw m7, m3
- paddw m1, m4
- paddw m7, m1
- psraw m7, 5
-
- pmullw m6, m6, [multiH]
- paddw m3, m6
- paddw m2, m5
- paddw m3, m2
- psraw m3, 5
-
- packuswb m7, m3
- movu [r0], m7
- lea r0, [r0 + r1]
+cglobal intra_pred_planar16, 3,3,8
+ pmovzxbw m2, [r2 + 1]
+ pmovzxbw m7, [r2 + 9]
+
+ movd m3, [r2 + 17] ; topRight = above[16]
+ movd m6, [r2 + 49] ; bottomLeft = left[16]
+
+ pxor m0, m0
+ pshufb m3, m0
+ pshufb m6, m0
+ punpcklbw m3, m0 ; v_topRight
+ punpcklbw m6, m0 ; v_bottomLeft
+
+ pmullw m4, m3, [multiH] ; (x + 1) * topRight
+ pmullw m3, [multiL] ; (x + 1) * topRight
+ pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
+ paddw m4, [pw_16]
+ paddw m3, [pw_16]
+ paddw m4, m6
+ paddw m3, m6
+ paddw m4, m5
+ paddw m3, m1
+ psubw m1, m6, m7
+ psubw m6, m2
+
+ pmovzxbw m2, [r2 + 33]
+ pmovzxbw m7, [r2 + 41]
+
+%macro INTRA_PRED_PLANAR16 1
+%if (%1 < 4)
+ pshuflw m5, m2, 0x55 * %1
+ pshufd m5, m5, 0
+%else
+%if (%1 < 8)
+ pshufhw m5, m2, 0x55 * (%1 - 4)
+ pshufd m5, m5, 0xAA
+%else
+%if (%1 < 12)
+ pshuflw m5, m7, 0x55 * (%1 - 8)
+ pshufd m5, m5, 0
+%else
+ pshufhw m5, m7, 0x55 * (%1 - 12)
+ pshufd m5, m5, 0xAA
+%endif
+%endif
+%endif
+ pmullw m0, m5, [pw_planar8_0]
+ pmullw m5, [pw_planar16_0]
+ paddw m0, m4
+ paddw m5, m3
+ paddw m3, m6
+ paddw m4, m1
+ psraw m5, 5
+ psraw m0, 5
+ packuswb m5, m0
+ movu [r0], m5
+ lea r0, [r0 + r1]
%endmacro
- PRED_PLANAR_ROW16 0
- PRED_PLANAR_ROW16 1
- PRED_PLANAR_ROW16 2
- PRED_PLANAR_ROW16 3
- PRED_PLANAR_ROW16 4
- PRED_PLANAR_ROW16 5
- PRED_PLANAR_ROW16 6
- PRED_PLANAR_ROW16 7
- PRED_PLANAR_ROW16 8
- PRED_PLANAR_ROW16 9
- PRED_PLANAR_ROW16 10
- PRED_PLANAR_ROW16 11
- PRED_PLANAR_ROW16 12
- PRED_PLANAR_ROW16 13
- PRED_PLANAR_ROW16 14
- PRED_PLANAR_ROW16 15
-
+ INTRA_PRED_PLANAR16 0
+ INTRA_PRED_PLANAR16 1
+ INTRA_PRED_PLANAR16 2
+ INTRA_PRED_PLANAR16 3
+ INTRA_PRED_PLANAR16 4
+ INTRA_PRED_PLANAR16 5
+ INTRA_PRED_PLANAR16 6
+ INTRA_PRED_PLANAR16 7
+ INTRA_PRED_PLANAR16 8
+ INTRA_PRED_PLANAR16 9
+ INTRA_PRED_PLANAR16 10
+ INTRA_PRED_PLANAR16 11
+ INTRA_PRED_PLANAR16 12
+ INTRA_PRED_PLANAR16 13
+ INTRA_PRED_PLANAR16 14
+ INTRA_PRED_PLANAR16 15
RET
-
-;-----------------------------------------------------------------------------------------------------------
-; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
-;-----------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
-cglobal intra_pred_planar32, 4,7,12
- %define bottomRow0 m8
- %define bottomRow1 m9
- %define bottomRow2 m10
- %define bottomRow3 m11
+cglobal intra_pred_planar32, 3,4,12
%else
-cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize)
- %define bottomRow0 [rsp + 0 * mmsize]
- %define bottomRow1 [rsp + 1 * mmsize]
- %define bottomRow2 [rsp + 2 * mmsize]
- %define bottomRow3 [rsp + 3 * mmsize]
+cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
+ %define m8 [rsp + 0 * mmsize]
+ %define m9 [rsp + 1 * mmsize]
+ %define m10 [rsp + 2 * mmsize]
+ %define m11 [rsp + 3 * mmsize]
%endif
+ movd m3, [r2 + 33] ; topRight = above[32]
+
+ pxor m7, m7
+ pshufb m3, m7
+ punpcklbw m3, m7 ; v_topRight
+
+ pmullw m0, m3, [multiL] ; (x + 1) * topRight
+ pmullw m1, m3, [multiH] ; (x + 1) * topRight
+ pmullw m2, m3, [multiH2] ; (x + 1) * topRight
+ pmullw m3, [multiH3] ; (x + 1) * topRight
+
+ movd m6, [r2 + 97] ; bottomLeft = left[32]
+ pshufb m6, m7
+ punpcklbw m6, m7 ; v_bottomLeft
+
+ paddw m0, m6
+ paddw m1, m6
+ paddw m2, m6
+ paddw m3, m6
+ paddw m0, [pw_32]
+ paddw m1, [pw_32]
+ paddw m2, [pw_32]
+ paddw m3, [pw_32]
+
+ pmovzxbw m4, [r2 + 1]
+ pmullw m5, m4, [pw_planar32_1]
+ paddw m0, m5
+ psubw m5, m6, m4
+ mova m8, m5
+
+ pmovzxbw m4, [r2 + 9]
+ pmullw m5, m4, [pw_planar32_1]
+ paddw m1, m5
+ psubw m5, m6, m4
+ mova m9, m5
+
+ pmovzxbw m4, [r2 + 17]
+ pmullw m5, m4, [pw_planar32_1]
+ paddw m2, m5
+ psubw m5, m6, m4
+ mova m10, m5
+
+ pmovzxbw m4, [r2 + 25]
+ pmullw m5, m4, [pw_planar32_1]
+ paddw m3, m5
+ psubw m5, m6, m4
+ mova m11, m5
+ add r2, 65 ; (2 * blkSize + 1)
+
+%macro INTRA_PRED_PLANAR32 0
+ movd m4, [r2]
+ pshufb m4, m7
+ punpcklbw m4, m7
+
+ pmullw m5, m4, [pw_planar32_L]
+ pmullw m6, m4, [pw_planar32_H]
+ paddw m5, m0
+ paddw m6, m1
+ paddw m0, m8
+ paddw m1, m9
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r0], m5
+
+ pmullw m5, m4, [pw_planar16_0]
+ pmullw m4, [pw_planar8_0]
+ paddw m5, m2
+ paddw m4, m3
+ paddw m2, m10
+ paddw m3, m11
+ psraw m5, 6
+ psraw m4, 6
+ packuswb m5, m4
+ movu [r0 + 16], m5
+
+ lea r0, [r0 + r1]
inc r2
- inc r3
- pxor m3, m3
- movd m0, [r2 + 32]
- pshufb m0, m3
- punpcklbw m0, m3 ; v_bottomLeft = left[32]
- movzx r4d, byte [r3 + 32] ; topRight = above[32]
-
- pmovzxbw m1, [r3 + 0] ; topRow[0]
- pmovzxbw m2, [r3 + 8] ; topRow[1]
- pmovzxbw m3, [r3 +16] ; topRow[2]
- pmovzxbw m4, [r3 +24] ; topRow[3]
-
- psubw m5, m0, m1 ; v_bottomRow[0]
- psubw m6, m0, m2 ; v_bottomRow[1]
- psubw m7, m0, m3 ; v_bottomRow[2]
- psubw m0, m4 ; v_bottomRow[3]
-
- mova bottomRow0, m5
- mova bottomRow1, m6
- mova bottomRow2, m7
- mova bottomRow3, m0
-
- psllw m1, 5
- psllw m2, 5
- psllw m3, 5
- psllw m4, 5
-
-%macro COMP_PRED_PLANAR_ROW 1
- movzx r5d, byte [r2]
- shl r5d, 5
- add r5d, 32
- movd m5, r5d
- pshuflw m5, m5, 0
- pshufd m5, m5, 0 ; horPred
-
- movzx r5d, byte [r2]
- mov r6d, r4d
- sub r6d, r5d
- movd m6, r6d
- pshuflw m6, m6, 0
- pshufd m6, m6, 0
-
-%if (%1 == 0)
- pmullw m7, m6, [multiL]
-%else
- pmullw m7, m6, [multiH2]
-%endif
-
- paddw m7, m5
-%if (%1 == 0)
- paddw m1, bottomRow0
- paddw m7, m1
-%else
- paddw m3, bottomRow2
- paddw m7, m3
-%endif
- psraw m7, 6
-
-%if (%1 == 0)
- pmullw m6, [multiH]
-%else
- pmullw m6, [multiH3]
-%endif
- paddw m6, m5
-%if (%1 == 0)
- paddw m2, bottomRow1
- paddw m6, m2
-%else
- paddw m4, bottomRow3
- paddw m6, m4
-%endif
- psraw m6, 6
-
- packuswb m7, m6
- movu [r0 + %1], m7
%endmacro
- mov r3, 32
+ mov r3, 4
.loop:
- COMP_PRED_PLANAR_ROW 0
- COMP_PRED_PLANAR_ROW 16
- inc r2
- lea r0, [r0 + r1]
-
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
+ INTRA_PRED_PLANAR32
dec r3
- jnz .loop
-%undef COMP_PRED_PLANAR_ROW
-
+ jnz .loop
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------
+; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,3,4
- cmp r4m, byte 34
- cmove r2, r3mp
- movh m0, [r2 + 2]
+cglobal intra_pred_ang4_2, 3,5,4
+ lea r4, [r2 + 2]
+ add r2, 10
+ cmp r3m, byte 34
+ cmove r2, r4
+
+ movh m0, [r2]
movd [r0], m0
palignr m1, m0, 1
movd [r0 + r1], m1
@@ -734,13 +731,14 @@ cglobal intra_pred_ang4_2, 3,3,4
movd [r0 + r1], m0
RET
-
INIT_XMM sse4
-cglobal intra_pred_ang4_3, 3,4,5
- cmp r4m, byte 33
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_3, 3,5,5
+ mov r4, 1
+ cmp r3m, byte 33
+ mov r3, 9
+ cmove r3, r4
+
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
@@ -749,6 +747,7 @@ cglobal intra_pred_ang4_3, 3,4,5
punpcklqdq m0, m1
punpcklqdq m2, m3
+ lea r3, [ang_table + 20 * 16]
movh m3, [r3 + 6 * 16] ; [26]
movhps m3, [r3] ; [20]
movh m4, [r3 - 6 * 16] ; [14]
@@ -781,12 +780,14 @@ ALIGN 16
pextrd [r0 + r1], m0, 3
RET
+cglobal intra_pred_ang4_4, 3,5,5
+ xor r4, r4
+ inc r4
+ cmp r3m, byte 32
+ mov r3, 9
+ cmove r3, r4
-cglobal intra_pred_ang4_4, 3,4,5
- cmp r4m, byte 32
- cmove r2, r3mp
- lea r3, [ang_table + 18 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
@@ -794,18 +795,21 @@ cglobal intra_pred_ang4_4, 3,4,5
punpcklqdq m0, m1
punpcklqdq m2, m1, m3
+ lea r3, [ang_table + 18 * 16]
movh m3, [r3 + 3 * 16] ; [21]
movhps m3, [r3 - 8 * 16] ; [10]
movh m4, [r3 + 13 * 16] ; [31]
movhps m4, [r3 + 2 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_5, 3,5,5
+ xor r4, r4
+ inc r4
+ cmp r3m, byte 31
+ mov r3, 9
+ cmove r3, r4
-cglobal intra_pred_ang4_5, 3,4,5
- cmp r4m, byte 31
- cmove r2, r3mp
- lea r3, [ang_table + 10 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
@@ -813,87 +817,98 @@ cglobal intra_pred_ang4_5, 3,4,5
punpcklqdq m0, m1
punpcklqdq m2, m1, m3
+ lea r3, [ang_table + 10 * 16]
movh m3, [r3 + 7 * 16] ; [17]
movhps m3, [r3 - 8 * 16] ; [ 2]
movh m4, [r3 + 9 * 16] ; [19]
movhps m4, [r3 - 6 * 16] ; [ 4]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_6, 3,5,5
+ xor r4, r4
+ inc r4
+ cmp r3m, byte 30
+ mov r3, 9
+ cmove r3, r4
-cglobal intra_pred_ang4_6, 3,4,5
- cmp r4m, byte 30
- cmove r2, r3mp
- lea r3, [ang_table + 19 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
punpcklqdq m0, m0
punpcklqdq m2, m2
+ lea r3, [ang_table + 19 * 16]
movh m3, [r3 - 6 * 16] ; [13]
movhps m3, [r3 + 7 * 16] ; [26]
movh m4, [r3 - 12 * 16] ; [ 7]
movhps m4, [r3 + 1 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_7, 3,5,5
+ xor r4, r4
+ inc r4
+ cmp r3m, byte 29
+ mov r3, 9
+ cmove r3, r4
-cglobal intra_pred_ang4_7, 3,4,5
- cmp r4m, byte 29
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
punpcklqdq m2, m0, m3
punpcklqdq m0, m0
+ lea r3, [ang_table + 20 * 16]
movh m3, [r3 - 11 * 16] ; [ 9]
movhps m3, [r3 - 2 * 16] ; [18]
movh m4, [r3 + 7 * 16] ; [27]
movhps m4, [r3 - 16 * 16] ; [ 4]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_8, 3,5,5
+ xor r4, r4
+ inc r4
+ cmp r3m, byte 28
+ mov r3, 9
+ cmove r3, r4
-cglobal intra_pred_ang4_8, 3,4,5
- cmp r4m, byte 28
- cmove r2, r3mp
- lea r3, [ang_table + 13 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklqdq m0, m0
mova m2, m0
+ lea r3, [ang_table + 13 * 16]
movh m3, [r3 - 8 * 16] ; [ 5]
movhps m3, [r3 - 3 * 16] ; [10]
movh m4, [r3 + 2 * 16] ; [15]
movhps m4, [r3 + 7 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_9, 3,5,5
+ xor r4, r4
+ inc r4
+ cmp r3m, byte 27
+ mov r3, 9
+ cmove r3, r4
-cglobal intra_pred_ang4_9, 3,4,5
- cmp r4m, byte 27
- cmove r2, r3mp
- lea r3, [ang_table + 4 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklqdq m0, m0
mova m2, m0
+ lea r3, [ang_table + 4 * 16]
movh m3, [r3 - 2 * 16] ; [ 2]
movhps m3, [r3 - 0 * 16] ; [ 4]
movh m4, [r3 + 2 * 16] ; [ 6]
movhps m4, [r3 + 4 * 16] ; [ 8]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
cglobal intra_pred_ang4_10, 3,3,4
- movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
+ movd m0, [r2 + 9] ; [8 7 6 5 4 3 2 1]
pshufb m0, [pb_unpackbd1]
-
pshufd m1, m0, 1
movhlps m2, m0
pshufd m3, m0, 3
@@ -901,12 +916,10 @@ cglobal intra_pred_ang4_10, 3,3,4
movd [r0 + r1 * 2], m2
lea r1, [r1 * 3]
movd [r0 + r1], m3
-
- cmp r5m, byte 0
- jz .quit
+ cmp r4m, byte 0
+ jz .quit
; filter
- mov r2, r3mp
pmovzxbw m0, m0 ; [-1 -1 -1 -1]
movh m1, [r2] ; [4 3 2 1 0]
pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
@@ -915,15 +928,13 @@ cglobal intra_pred_ang4_10, 3,3,4
psraw m1, 1
paddw m0, m1
packuswb m0, m0
-
.quit:
movd [r0], m0
RET
-
INIT_XMM sse4
-cglobal intra_pred_ang4_26, 4,4,3
- movd m0, [r3 + 1] ; [8 7 6 5 4 3 2 1]
+cglobal intra_pred_ang4_26, 3,4,3
+ movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
; store
movd [r0], m0
@@ -933,11 +944,12 @@ cglobal intra_pred_ang4_26, 4,4,3
movd [r0 + r3], m0
; filter
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
pshufb m0, [pb_0_8] ; [ 1 1 1 1]
- movh m1, [r2] ; [-4 -3 -2 -1 0]
+ movh m1, [r2 + 8] ; [-4 -3 -2 -1 0]
+ pinsrb m1, [r2], 0
pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
psubw m1, m2
@@ -949,54 +961,62 @@ cglobal intra_pred_ang4_26, 4,4,3
pextrb [r0 + r1], m0, 1
pextrb [r0 + r1 * 2], m0, 2
pextrb [r0 + r3], m0, 3
-
.quit:
RET
+cglobal intra_pred_ang4_11, 3,5,5
+ xor r4, r4
+ cmp r3m, byte 25
+ mov r3, 8
+ cmove r3, r4
-cglobal intra_pred_ang4_11, 3,4,5
- cmp r4m, byte 25
- cmove r2, r3mp
- lea r3, [ang_table + 24 * 16]
- movh m0, [r2] ; [x x x 4 3 2 1 0]
+ movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
+ pinsrb m0, [r2], 0
palignr m1, m0, 1 ; [x x x x 4 3 2 1]
punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
mova m2, m0
+ lea r3, [ang_table + 24 * 16]
+
movh m3, [r3 + 6 * 16] ; [24]
movhps m3, [r3 + 4 * 16] ; [26]
movh m4, [r3 + 2 * 16] ; [28]
movhps m4, [r3 + 0 * 16] ; [30]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_12, 3,5,5
+ xor r4, r4
+ cmp r3m, byte 24
+ mov r3, 8
+ cmove r3, r4
-cglobal intra_pred_ang4_12, 3,4,5
- cmp r4m, byte 24
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movh m0, [r2] ; [x x x 4 3 2 1 0]
+ movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
+ pinsrb m0, [r2], 0
palignr m1, m0, 1 ; [x x x x 4 3 2 1]
punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
mova m2, m0
+ lea r3, [ang_table + 20 * 16]
movh m3, [r3 + 7 * 16] ; [27]
movhps m3, [r3 + 2 * 16] ; [22]
movh m4, [r3 - 3 * 16] ; [17]
movhps m4, [r3 - 8 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_13, 4,4,5
- cmp r4m, byte 23
- jnz .load
- xchg r2, r3
-.load:
- movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_13, 4,5,5
+ xor r4, r4
+ cmp r3m, byte 23
+ mov r3, 8
+ jz .next
+ xchg r3, r4
+.next:
+ movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
+ pinsrb m1, [r2], 1
palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
palignr m2, m1, 2 ; [x x x x 4 3 2 1]
- pinsrb m1, [r3 + 4], 0
+ pinsrb m1, [r2 + r3 + 4], 0
punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
punpcklqdq m2, m0, m1
@@ -1009,16 +1029,18 @@ cglobal intra_pred_ang4_13, 4,4,5
movhps m4, [r3 + 7 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_14, 4,4,5
- cmp r4m, byte 22
- jnz .load
- xchg r2, r3
-.load:
- movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_14, 4,5,5
+ xor r4, r4
+ cmp r3m, byte 22
+ mov r3, 8
+ jz .next
+ xchg r3, r4
+.next:
+ movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
+ pinsrb m2, [r2], 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- pinsrb m2, [r3 + 2], 0
+ pinsrb m2, [r2 + r3 + 2], 0
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
@@ -1031,18 +1053,20 @@ cglobal intra_pred_ang4_14, 4,4,5
movhps m4, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_15, 4,4,5
- cmp r4m, byte 21
- jnz .load
- xchg r2, r3
-.load:
- movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_15, 4,5,5
+ xor r4, r4
+ cmp r3m, byte 21
+ mov r3, 8
+ jz .next
+ xchg r3, r4
+.next:
+ movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
+ pinsrb m2, [r2], 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- pinsrb m2, [r3 + 2], 0
+ pinsrb m2, [r2 + r3 + 2], 0
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
- pinsrb m3, [r3 + 4], 0
+ pinsrb m3, [r2 + r3 + 4], 0
punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
@@ -1056,18 +1080,20 @@ cglobal intra_pred_ang4_15, 4,4,5
movhps m4, [r3 + 5 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_16, 4,4,5
- cmp r4m, byte 20
- jnz .load
- xchg r2, r3
-.load:
- movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
+cglobal intra_pred_ang4_16, 3,5,5
+ xor r4, r4
+ cmp r3m, byte 20
+ mov r3, 8
+ jz .next
+ xchg r3, r4
+.next:
+ movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
+ pinsrb m2, [r2], 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- pinsrb m2, [r3 + 2], 0
+ pinsrb m2, [r2 + r3 + 2], 0
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
- pinsrb m3, [r3 + 3], 0
+ pinsrb m3, [r2 + r3 + 3], 0
punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
@@ -1081,26 +1107,27 @@ cglobal intra_pred_ang4_16, 4,4,5
movhps m4, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_17, 4,4,5
- cmp r4m, byte 19
- jnz .load
- xchg r2, r3
-.load:
- movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x]
+cglobal intra_pred_ang4_17, 3,5,5
+ xor r4, r4
+ cmp r3m, byte 19
+ mov r3, 8
+ jz .next
+ xchg r3, r4
+.next:
+ movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x]
+ pinsrb m3, [r2], 1
palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
palignr m1, m3, 2 ; [- - - - 4 3 2 1]
mova m4, m0
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
-
- pinsrb m3, [r3 + 1], 0
+ pinsrb m3, [r2 + r3 + 1], 0
punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
punpcklqdq m0, m1
pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
- pinsrb m2, [r3 + 2], 0
+ pinsrb m2, [r2 + r3 + 2], 0
pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
- pinsrb m1, [r3 + 4], 0
+ pinsrb m1, [r2 + r3 + 4], 0
punpcklbw m1, m2 ; [1 0 0 x x y y z]
punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
punpcklqdq m2, m1
@@ -1112,29 +1139,36 @@ cglobal intra_pred_ang4_17, 4,4,5
movhps m4, [r3 + 10 * 16] ; [24]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+cglobal intra_pred_ang4_18, 3,5,1
+ mov r4d, [r2 + 8]
+ mov r3b, byte [r2]
+ mov [r2 + 8], r3b
+ mov r3d, [r2 + 8]
+ bswap r3d
+ movd m0, r3d
-cglobal intra_pred_ang4_18, 4,4,1
- mov r2d, [r2]
- bswap r2d
- movd m0, r2d
- pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
- lea r2, [r1 * 3]
- movd [r0 + r2], m0
+ pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
+ lea r3, [r1 * 3]
+ movd [r0 + r3], m0
psrldq m0, 1
movd [r0 + r1 * 2], m0
psrldq m0, 1
movd [r0 + r1], m0
psrldq m0, 1
movd [r0], m0
+ mov [r2 + 8], r4w
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------------------
+; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang8_2, 3,5,2
- cmp r4m, byte 34
- cmove r2, r3mp
- movu m0, [r2 + 2]
+ lea r4, [r2 + 2]
+ add r2, 18
+ cmp r3m, byte 34
+ cmove r2, r4
+ movu m0, [r2]
lea r4, [r1 * 3]
movh [r0], m0
@@ -1157,13 +1191,15 @@ cglobal intra_pred_ang8_2, 3,5,2
INIT_XMM sse4
cglobal intra_pred_ang8_3, 3,5,8
- cmp r4m, byte 33
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 33
+ cmove r2, r4
lea r3, [ang_table + 22 * 16]
lea r4, [ang_table + 8 * 16]
mova m3, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
@@ -1206,7 +1242,7 @@ cglobal intra_pred_ang8_3, 3,5,8
pmaddubsw m2, [r3 - 6 * 16] ; [16]
pmulhrsw m2, m3
packuswb m1, m2
- jmp .transpose8x8
+ jmp .transpose8x8
ALIGN 16
.transpose8x8:
@@ -1243,13 +1279,15 @@ ALIGN 16
RET
cglobal intra_pred_ang8_4, 3,5,8
- cmp r4m, byte 32
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 32
+ cmove r2, r4
lea r3, [ang_table + 24 * 16]
lea r4, [ang_table + 10 * 16]
mova m3, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
@@ -1294,13 +1332,15 @@ cglobal intra_pred_ang8_4, 3,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_5, 3,5,8
- cmp r4m, byte 31
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 31
+ cmove r2, r4
lea r3, [ang_table + 17 * 16]
lea r4, [ang_table + 2 * 16]
mova m3, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
@@ -1345,13 +1385,15 @@ cglobal intra_pred_ang8_5, 3,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_6, 3,5,8
- cmp r4m, byte 30
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 30
+ cmove r2, r4
lea r3, [ang_table + 20 * 16]
lea r4, [ang_table + 8 * 16]
mova m7, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
@@ -1394,13 +1436,15 @@ cglobal intra_pred_ang8_6, 3,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_7, 3,5,8
- cmp r4m, byte 29
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 29
+ cmove r2, r4
lea r3, [ang_table + 24 * 16]
lea r4, [ang_table + 6 * 16]
mova m7, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
@@ -1440,13 +1484,15 @@ cglobal intra_pred_ang8_7, 3,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_8, 3,5,8
- cmp r4m, byte 28
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 28
+ cmove r2, r4
lea r3, [ang_table + 23 * 16]
lea r4, [ang_table + 8 * 16]
mova m7, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
@@ -1482,12 +1528,14 @@ cglobal intra_pred_ang8_8, 3,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_9, 3,5,8
- cmp r4m, byte 27
- cmove r2, r3mp
+ lea r4, [r2 + 1]
+ add r2, 17
+ cmp r3m, byte 27
+ cmove r2, r4
lea r3, [ang_table + 10 * 16]
mova m7, [pw_1024]
- movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
@@ -1520,8 +1568,8 @@ cglobal intra_pred_ang8_9, 3,5,8
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
-cglobal intra_pred_ang8_10, 4,5,5
- movh m0, [r2 + 1]
+cglobal intra_pred_ang8_10, 3,6,5
+ movh m0, [r2 + 17]
mova m4, [pb_unpackbq]
palignr m1, m0, 2
pshufb m1, m4
@@ -1531,22 +1579,22 @@ cglobal intra_pred_ang8_10, 4,5,5
pshufb m3, m4
pshufb m0, m4
- lea r4, [r1 * 3]
+ lea r5, [r1 * 3]
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m1
- movhps [r0 + r4], m1
- lea r2, [r0 + r1 * 4]
- movh [r2], m2
- movhps [r2 + r1], m2
- movh [r2 + r1 * 2], m3
- movhps [r2 + r4], m3
+ movhps [r0 + r5], m1
+ lea r3, [r0 + r1 * 4]
+ movh [r3], m2
+ movhps [r3 + r1], m2
+ movh [r3 + r1 * 2], m3
+ movhps [r3 + r5], m3
; filter
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
pmovzxbw m0, m0
- movu m1, [r3]
+ movu m1, [r2]
palignr m2, m1, 1
pshufb m1, m4
pmovzxbw m1, m1
@@ -1560,58 +1608,60 @@ cglobal intra_pred_ang8_10, 4,5,5
movh [r0], m0
RET
-cglobal intra_pred_ang8_26, 4,5,3
- movh m0, [r3 + 1]
-
- lea r4, [r1 * 3]
+cglobal intra_pred_ang8_26, 3,6,3
+ movu m2, [r2]
+ palignr m0, m2, 1
+ lea r5, [r1 * 3]
movh [r0], m0
movh [r0 + r1], m0
movh [r0 + r1 * 2], m0
- movh [r0 + r4], m0
+ movh [r0 + r5], m0
lea r3, [r0 + r1 * 4]
movh [r3], m0
movh [r3 + r1], m0
movh [r3 + r1 * 2], m0
- movh [r3 + r4], m0
+ movh [r3 + r5], m0
; filter
- cmp r5m, byte 0
+ cmp r4m, byte 0
jz .quit
- pshufb m0, [pb_unpackbq]
- pmovzxbw m0, m0
- movu m1, [r2]
- palignr m2, m1, 1
- pshufb m1, [pb_unpackbq]
- pmovzxbw m1, m1
+ pshufb m2, [pb_unpackbq]
+ movhlps m1, m2
pmovzxbw m2, m2
- psubw m2, m1
- psraw m2, 1
- paddw m0, m2
- packuswb m0, m0
- pextrb [r0], m0, 0
- pextrb [r0 + r1], m0, 1
- pextrb [r0 + r1 * 2], m0, 2
- pextrb [r0 + r4], m0, 3
- pextrb [r3], m0, 4
- pextrb [r3 + r1], m0, 5
- pextrb [r3 + r1 * 2], m0, 6
- pextrb [r3 + r4], m0, 7
-
+ movu m0, [r2 + 17]
+ pmovzxbw m1, m1
+ pmovzxbw m0, m0
+ psubw m0, m2
+ psraw m0, 1
+ paddw m1, m0
+ packuswb m1, m1
+ pextrb [r0], m1, 0
+ pextrb [r0 + r1], m1, 1
+ pextrb [r0 + r1 * 2], m1, 2
+ pextrb [r0 + r5], m1, 3
+ pextrb [r3], m1, 4
+ pextrb [r3 + r1], m1, 5
+ pextrb [r3 + r1 * 2], m1, 6
+ pextrb [r3 + r5], m1, 7
.quit:
RET
cglobal intra_pred_ang8_11, 3,5,8
- cmp r4m, byte 25
- cmove r2, r3mp
- lea r3, [ang_table + 23 * 16]
- mova m7, [pw_1024]
+ xor r4, r4
+ cmp r3m, byte 25
+ mov r3, 16
+ cmove r3, r4
- movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m0, [r2], 0
palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+ lea r3, [ang_table + 23 * 16]
+ mova m7, [pw_1024]
+
pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
@@ -1640,18 +1690,22 @@ cglobal intra_pred_ang8_11, 3,5,8
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
-cglobal intra_pred_ang8_12, 4,5,8
- cmp r4m, byte 24
- mov r4, r2
- cmovz r2, r3
- cmovz r3, r4
+cglobal intra_pred_ang8_12, 3,5,8
+ xor r4, r4
+ cmp r3m, byte 24
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+
+ movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m1, [r2], 0
+ pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
+ pinsrb m0, [r2 + r3 + 6], 0
lea r4, [ang_table + 22 * 16]
mova m7, [pw_1024]
- movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
- pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
- pinsrb m0, [r3 + 6], 0
punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -1686,24 +1740,27 @@ cglobal intra_pred_ang8_12, 4,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_13, 4,5,8
- cmp r4m, byte 23
- mov r4, r2
- cmovz r2, r3
- cmovz r3, r4
-
- lea r4, [ang_table + 24 * 16]
- mova m7, [pw_1024]
-
- movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ xor r4, r4
+ cmp r3m, byte 23
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+
+ movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m1, [r2], 0
pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
- pinsrb m1, [r3 + 4], 0
+ pinsrb m1, [r2 + r3 + 4], 0
pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
- pinsrb m0, [r3 + 7], 0
+ pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+ lea r4, [ang_table + 24 * 16]
+ mova m7, [pw_1024]
+
pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
pmulhrsw m4, m7
@@ -1735,25 +1792,28 @@ cglobal intra_pred_ang8_13, 4,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_14, 4,5,8
- cmp r4m, byte 22
- mov r4, r2
- cmovz r2, r3
- cmovz r3, r4
-
- lea r4, [ang_table + 24 * 16]
- mova m3, [pw_1024]
-
- movu m1, [r2 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
- pinsrb m1, [r3 + 2], 1
- pinsrb m1, [r3 + 5], 0
+ xor r4, r4
+ cmp r3m, byte 22
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+
+ movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
+ pinsrb m1, [r2], 2
+ pinsrb m1, [r2 + r3 + 2], 1
+ pinsrb m1, [r2 + r3 + 5], 0
pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
- pinsrb m0, [r3 + 7], 0
+ pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+ lea r4, [ang_table + 24 * 16]
+ mova m3, [pw_1024]
+
pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
pmulhrsw m4, m3
@@ -1785,20 +1845,20 @@ cglobal intra_pred_ang8_14, 4,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_15, 4,5,8
- cmp r4m, byte 21
- mov r4, r2
- cmovz r2, r3
- cmovz r3, r4
-
- lea r4, [ang_table + 23 * 16]
- mova m3, [pw_1024]
-
- movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
- movu m2, [r3]
+ xor r4, r4
+ cmp r3m, byte 21
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+
+ movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m1, [r2], 0
+ movu m2, [r2 + r3]
pshufb m2, [c_mode16_15]
palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
- pinsrb m0, [r3 + 8], 0
+ pinsrb m0, [r2 + r3 + 8], 0
punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
@@ -1806,6 +1866,9 @@ cglobal intra_pred_ang8_15, 4,5,8
palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+ lea r4, [ang_table + 23 * 16]
+ mova m3, [pw_1024]
+
pmaddubsw m4, [r4 - 8 * 16] ; [15]
pmulhrsw m4, m3
@@ -1837,20 +1900,20 @@ cglobal intra_pred_ang8_15, 4,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_16, 4,5,8
- cmp r4m, byte 20
- mov r4, r2
- cmovz r2, r3
- cmovz r3, r4
-
- lea r4, [ang_table + 22 * 16]
- mova m7, [pw_1024]
-
- movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
- movu m2, [r3]
+ xor r4, r4
+ cmp r3m, byte 20
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+
+ movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m1, [r2], 0
+ movu m2, [r2 + r3]
pshufb m2, [c_mode16_16]
palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
- pinsrb m0, [r3 + 8], 0
+ pinsrb m0, [r2 + r3 + 8], 0
punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
@@ -1859,6 +1922,9 @@ cglobal intra_pred_ang8_16, 4,5,8
palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+ lea r4, [ang_table + 22 * 16]
+ mova m7, [pw_1024]
+
pmaddubsw m3, m5, [r4] ; [22]
pmulhrsw m3, m7
@@ -1892,20 +1958,20 @@ cglobal intra_pred_ang8_16, 4,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_17, 4,5,8
- cmp r4m, byte 19
- mov r4, r2
- cmovz r2, r3
- cmovz r3, r4
-
- lea r4, [ang_table + 17 * 16]
- mova m3, [pw_1024]
-
- movu m2, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
- movu m1, [r3]
+ xor r4, r4
+ cmp r3m, byte 19
+ mov r3, 16
+ jz .next
+ xchg r3, r4
+.next:
+
+ movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m2, [r2], 0
+ movu m1, [r2 + r3]
pshufb m1, [c_mode16_17]
palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
- pinsrb m0, [r3 + 7], 0
+ pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]
@@ -1913,6 +1979,8 @@ cglobal intra_pred_ang8_17, 4,5,8
palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+ lea r4, [ang_table + 17 * 16]
+ mova m3, [pw_1024]
pmaddubsw m2, [r4 - 5 * 16] ; [12]
pmulhrsw m2, m3
@@ -1948,9 +2016,10 @@ cglobal intra_pred_ang8_17, 4,5,8
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
cglobal intra_pred_ang8_18, 4,4,1
- movu m0, [r2]
+ movu m0, [r2 + 16]
+ pinsrb m0, [r2], 0
pshufb m0, [pb_swap8]
- movhps m0, [r3 + 1]
+ movhps m0, [r2 + 1]
lea r2, [r0 + r1 * 4]
lea r3, [r1 * 3]
movh [r2 + r3], m0
@@ -1970,16 +2039,58 @@ cglobal intra_pred_ang8_18, 4,4,1
movh [r0], m0
RET
+%macro TRANSPOSE_STORE_8x8 6
+ %if %2 == 1
+ ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
+ punpckhbw m0, %3, %4
+ punpcklbw %3, %4
+ punpckhbw %4, %3, m0
+ punpcklbw %3, m0
-;-----------------------------------------------------------------------------
-; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
+ punpckhbw m0, %5, m1
+ punpcklbw %5, %6
+ punpckhbw %6, %5, m0
+ punpcklbw %5, m0
+
+ punpckhdq m0, %3, %5
+ punpckldq %3, %5
+ punpckldq %5, %4, %6
+ punpckhdq %4, %6
+
+ movh [r0 + + %1 * 8], %3
+ movhps [r0 + r1 + %1 * 8], %3
+ movh [r0 + r1*2 + %1 * 8], m0
+ movhps [r0 + r5 + %1 * 8], m0
+ movh [r6 + %1 * 8], %5
+ movhps [r6 + r1 + %1 * 8], %5
+ movh [r6 + r1*2 + %1 * 8], %4
+ movhps [r6 + r5 + %1 * 8], %4
+ %else
+ ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
+ movh [r0 ], %3
+ movhps [r0 + r1 ], %3
+ movh [r0 + r1 * 2], %4
+ movhps [r0 + r5 ], %4
+ lea r0, [r0 + r1 * 4]
+ movh [r0 ], %5
+ movhps [r0 + r1 ], %5
+ movh [r0 + r1 * 2], %6
+ movhps [r0 + r5 ], %6
+ lea r0, [r0 + r1 * 4]
+ %endif
+%endmacro
+
+;------------------------------------------------------------------------------------------
+; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
INIT_XMM ssse3
-cglobal intra_pred_ang16_2, 3,3,3
- cmp r4m, byte 34
- cmove r2, r3mp
- movu m0, [r2 + 2]
- movu m1, [r2 + 18]
+cglobal intra_pred_ang16_2, 3,5,3
+ lea r4, [r2 + 2]
+ add r2, 34
+ cmp r3m, byte 34
+ cmove r2, r4
+ movu m0, [r2]
+ movu m1, [r2 + 16]
movu [r0], m0
palignr m2, m1, m0, 1
movu [r0 + r1], m2
@@ -2020,50 +2131,9 @@ cglobal intra_pred_ang16_2, 3,3,3
movu [r0 + r1], m2
RET
-%macro TRANSPOSE_STORE_8x8 6
- %if %2 == 1
- ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
- punpckhbw m0, %3, %4
- punpcklbw %3, %4
- punpckhbw %4, %3, m0
- punpcklbw %3, m0
-
- punpckhbw m0, %5, m1
- punpcklbw %5, %6
- punpckhbw %6, %5, m0
- punpcklbw %5, m0
-
- punpckhdq m0, %3, %5
- punpckldq %3, %5
- punpckldq %5, %4, %6
- punpckhdq %4, %6
-
- movh [r0 + + %1 * 8], %3
- movhps [r0 + r1 + %1 * 8], %3
- movh [r0 + r1*2 + %1 * 8], m0
- movhps [r0 + r5 + %1 * 8], m0
- movh [r6 + %1 * 8], %5
- movhps [r6 + r1 + %1 * 8], %5
- movh [r6 + r1*2 + %1 * 8], %4
- movhps [r6 + r5 + %1 * 8], %4
- %else
- ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
- movh [r0 ], %3
- movhps [r0 + r1 ], %3
- movh [r0 + r1 * 2], %4
- movhps [r0 + r5 ], %4
- lea r0, [r0 + r1 * 4]
- movh [r0 ], %5
- movhps [r0 + r1 ], %5
- movh [r0 + r1 * 2], %6
- movhps [r0 + r5 ], %6
- lea r0, [r0 + r1 * 4]
- %endif
-%endmacro
-
INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2165,12 +2235,10 @@ cglobal intra_pred_ang16_3, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3]
@@ -2279,12 +2347,11 @@ cglobal intra_pred_ang16_33, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2387,12 +2454,10 @@ cglobal intra_pred_ang16_4, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2495,12 +2560,11 @@ cglobal intra_pred_ang16_32, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2585,12 +2649,10 @@ cglobal intra_pred_ang16_5, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2674,12 +2736,11 @@ cglobal intra_pred_ang16_31, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2762,12 +2823,10 @@ cglobal intra_pred_ang16_6, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2849,12 +2908,11 @@ cglobal intra_pred_ang16_30, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -2934,12 +2992,10 @@ cglobal intra_pred_ang16_7, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -3018,12 +3074,11 @@ cglobal intra_pred_ang16_29, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -3096,12 +3151,10 @@ cglobal intra_pred_ang16_8, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -3173,12 +3226,11 @@ cglobal intra_pred_ang16_28, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8
-
+ add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -3247,12 +3299,10 @@ cglobal intra_pred_ang16_9, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -3326,15 +3376,14 @@ cglobal intra_pred_ang16_27, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
-cglobal intra_pred_ang16_10, 6,6,8
- lea r4, [r1 * 3]
+cglobal intra_pred_ang16_10, 5,6,8
+ lea r5, [r1 * 3]
pxor m7, m7
- movu m0, [r2 + 1]
+ movu m0, [r2 + 1 + 32]
palignr m1, m0, 1
pshufb m1, m7
palignr m2, m0, 2
@@ -3350,11 +3399,11 @@ cglobal intra_pred_ang16_10, 6,6,8
movu [r0 + r1], m1
movu [r0 + r1 * 2], m2
- movu [r0 + r4], m3
- lea r2, [r0 + r1 * 4]
- movu [r2], m4
- movu [r2 + r1], m5
- movu [r2 + r1 * 2], m6
+ movu [r0 + r5], m3
+ lea r3, [r0 + r1 * 4]
+ movu [r3], m4
+ movu [r3 + r1], m5
+ movu [r3 + r1 * 2], m6
palignr m1, m0, 7
pshufb m1, m7
@@ -3369,14 +3418,14 @@ cglobal intra_pred_ang16_10, 6,6,8
palignr m6, m0, 12
pshufb m6, m7
- movu [r2 + r4], m1
- lea r2, [r2 + r1 * 4]
- movu [r2], m2
- movu [r2 + r1], m3
- movu [r2 + r1 * 2], m4
- movu [r2 + r4], m5
- lea r2, [r2 + r1 * 4]
- movu [r2], m6
+ movu [r3 + r5], m1
+ lea r3, [r3 + r1 * 4]
+ movu [r3], m2
+ movu [r3 + r1], m3
+ movu [r3 + r1 * 2], m4
+ movu [r3 + r5], m5
+ lea r3, [r3 + r1 * 4]
+ movu [r3], m6
palignr m1, m0, 13
pshufb m1, m7
@@ -3386,17 +3435,17 @@ cglobal intra_pred_ang16_10, 6,6,8
pshufb m3, m7
pshufb m0, m7
- movu [r2 + r1], m1
- movu [r2 + r1 * 2], m2
- movu [r2 + r4], m3
+ movu [r3 + r1], m1
+ movu [r3 + r1 * 2], m2
+ movu [r3 + r5], m3
; filter
- cmp r5w, byte 0
+ cmp r4w, byte 0
jz .quit
pmovzxbw m0, m0
mova m1, m0
- movu m2, [r3]
- movu m3, [r3 + 1]
+ movu m2, [r2]
+ movu m3, [r2 + 1]
pshufb m2, m7
pmovzxbw m2, m2
@@ -3410,23 +3459,21 @@ cglobal intra_pred_ang16_10, 6,6,8
paddw m0, m3
paddw m1, m4
packuswb m0, m1
-
.quit:
movu [r0], m0
-
RET
INIT_XMM sse4
%if ARCH_X86_64 == 1
-cglobal intra_pred_ang16_26, 4,8,5
- mov r7, r5mp
+cglobal intra_pred_ang16_26, 3,8,5
+ mov r7, r4mp
%define bfilter r7w
%else
- cglobal intra_pred_ang16_26, 6,7,5,0 - 4
+cglobal intra_pred_ang16_26, 5,7,5,0-4
%define bfilter dword[rsp]
- mov bfilter, r5
+ mov bfilter, r4
%endif
- movu m0, [r3 + 1]
+ movu m0, [r2 + 1]
lea r4, [r1 * 3]
lea r3, [r0 + r1 * 4]
@@ -3459,8 +3506,9 @@ cglobal intra_pred_ang16_26, 4,8,5
pshufb m0, m4
pmovzxbw m0, m0
mova m1, m0
- movu m2, [r2]
- movu m3, [r2 + 1]
+ movu m2, [r2 + 32]
+ pinsrb m2, [r2], 0
+ movu m3, [r2 + 1 + 32]
pshufb m2, m4
pmovzxbw m2, m2
@@ -3491,21 +3539,18 @@ cglobal intra_pred_ang16_26, 4,8,5
pextrb [r6 + r1], m0, 13
pextrb [r6 + r1 * 2], m0, 14
pextrb [r6 + r4], m0, 15
-
.quit:
RET
INIT_XMM sse4
cglobal intra_pred_ang16_11, 3,7,8
-
lea r3, [ang_table + 16 * 16]
- mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
-.loop:
- movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
mova m2, m3
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -3563,15 +3608,66 @@ cglobal intra_pred_ang16_11, 3,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
+ movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ mova m2, m3
+ palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
+
+ pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
+ pmulhrsw m4, m7
+ pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
+ pmulhrsw m0, m7
+ packuswb m4, m0
+
+ pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
+ pmulhrsw m5, m7
+ pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
+ pmulhrsw m6, m7
+ packuswb m5, m6
+
+ pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
+ pmulhrsw m6, m7
+ pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
+ pmulhrsw m0, m7
+ packuswb m6, m0
+
+ pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
+ pmulhrsw m1, m7
+ pmaddubsw m0, m3, [r3] ; [16]
+ pmulhrsw m0, m7
+ packuswb m1, m0
+
+ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
+
+ pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
+ pmulhrsw m5, m7
+ packuswb m4, m5
+
+ pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
+ pmulhrsw m5, m7
+ pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
+ pmulhrsw m6, m7
+ packuswb m5, m6
+
+ pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
+ pmulhrsw m6, m7
+ pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
+ pmulhrsw m1, m7
+ packuswb m6, m1
+
+ pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
+ pmulhrsw m1, m7
+ packuswb m1, m1
+ punpcklqdq m1, m2 ;[00]
+
+ TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
RET
INIT_XMM sse4
cglobal intra_pred_ang16_25, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
@@ -3646,21 +3742,20 @@ cglobal intra_pred_ang16_25, 3,7,8
add r2, 8
dec r4
jnz .loop
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_12, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
- movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r3]
+ movu m2, [r2]
pshufb m2, [c_mode16_12]
palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -3724,7 +3819,7 @@ cglobal intra_pred_ang16_12, 4,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
@@ -3785,21 +3880,19 @@ cglobal intra_pred_ang16_12, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_24, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
- movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r2]
+ movu m2, [r2 + 32]
pshufb m2, [c_mode16_12]
palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -3862,7 +3955,7 @@ cglobal intra_pred_ang16_24, 4,7,8
lea r0, [r6 + 8]
- movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
@@ -3923,21 +4016,20 @@ cglobal intra_pred_ang16_24, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_13, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
- movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r3]
+ movu m2, [r2]
pshufb m2, [c_mode16_13]
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -4009,7 +4101,7 @@ cglobal intra_pred_ang16_13, 4,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
@@ -4078,21 +4170,19 @@ cglobal intra_pred_ang16_13, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_23, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
- movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r2]
+ movu m2, [r2 + 32]
pshufb m2, [c_mode16_13]
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -4163,7 +4253,7 @@ cglobal intra_pred_ang16_23, 4,7,8
lea r0, [r6 + 8]
- movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
@@ -4232,21 +4322,20 @@ cglobal intra_pred_ang16_23, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_14, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
- movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r3]
+ movu m2, [r2]
pshufb m2, [c_mode16_14]
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -4324,7 +4413,7 @@ cglobal intra_pred_ang16_14, 4,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
@@ -4399,21 +4488,19 @@ cglobal intra_pred_ang16_14, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_22, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
- movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r2]
+ movu m2, [r2 + 32]
pshufb m2, [c_mode16_14]
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -4490,7 +4577,7 @@ cglobal intra_pred_ang16_22, 4,7,8
lea r0, [r6 + 8]
- movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
@@ -4565,21 +4652,20 @@ cglobal intra_pred_ang16_22, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_15, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
- movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r3]
+ movu m2, [r2]
pshufb m2, [c_mode16_15]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -4669,7 +4755,7 @@ cglobal intra_pred_ang16_15, 4,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
@@ -4756,21 +4842,20 @@ cglobal intra_pred_ang16_15, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_21, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
- movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r2]
+ movu m2, [r2 + 32]
+ pinsrb m2, [r2], 0
pshufb m2, [c_mode16_15]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -4859,7 +4944,7 @@ cglobal intra_pred_ang16_21, 4,7,8
lea r0, [r6 + 8]
- movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
@@ -4946,21 +5031,20 @@ cglobal intra_pred_ang16_21, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_16, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
- movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r3]
+ movu m2, [r2]
pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -5053,7 +5137,7 @@ cglobal intra_pred_ang16_16, 4,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
@@ -5146,21 +5230,20 @@ cglobal intra_pred_ang16_16, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_20, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
- movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r2]
+ movu m2, [r2 + 32]
+ pinsrb m2, [r2], 0
pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -5252,7 +5335,7 @@ cglobal intra_pred_ang16_20, 4,7,8
lea r0, [r6 + 8]
- movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
@@ -5345,21 +5428,20 @@ cglobal intra_pred_ang16_20, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_17, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
- movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ pinsrb m3, [r2], 0
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r3]
+ movu m2, [r2]
pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -5378,7 +5460,7 @@ cglobal intra_pred_ang16_17, 4,7,8
pmulhrsw m5, m7
pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
- pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
+ pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
@@ -5459,7 +5541,7 @@ cglobal intra_pred_ang16_17, 4,7,8
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
- movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
@@ -5557,21 +5639,20 @@ cglobal intra_pred_ang16_17, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_19, 4,7,8
-
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
- movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
- movu m2, [r2]
+ movu m2, [r2 + 32]
+ pinsrb m2, [r2], 0
pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
@@ -5590,7 +5671,7 @@ cglobal intra_pred_ang16_19, 4,7,8
pmulhrsw m5, m7
pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
- pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
+ pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
@@ -5670,7 +5751,7 @@ cglobal intra_pred_ang16_19, 4,7,8
lea r0, [r6 + 8]
- movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; [x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
@@ -5768,14 +5849,12 @@ cglobal intra_pred_ang16_19, 4,7,8
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
-
RET
INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3
-
- movu m0, [r3]
- movu m1, [r2]
+ movu m0, [r2]
+ movu m1, [r2 + 32]
mova m2, [c_mode16_18]
pshufb m1, m2
@@ -5818,263 +5897,102 @@ cglobal intra_pred_ang16_18, 4,5,3
movu [r0 + r3], m0
RET
-;---------------------------------------------------------------------------------------------------------------
-; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;---------------------------------------------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang32_2, 3,4,4
- cmp r4m, byte 34
- cmove r2, r3mp
- movu m0, [r2 + 2]
- movu m1, [r2 + 18]
- movu m3, [r2 + 34]
+; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
+%macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
+ %if %3 == 0
+ %else
+ pshufb m0, [r3]
+ pmaddubsw m0, [r4 + %3 * 16]
+ pmulhrsw m0, [pw_1024]
+ %endif
+ %if %4 == 0
+ pmovzxbw m1, m1
+ %else
+ pshufb m1, [r3]
+ pmaddubsw m1, [r4 + %4 * 16]
+ pmulhrsw m1, [pw_1024]
+ %endif
+ %if %3 == 0
+ packuswb m1, m1
+ movlhps m0, m1
+ %else
+ packuswb m0, m1
+ %endif
+ mova m1, [pw_1024]
+ %if %5 == 0
+ %else
+ pshufb m2, [r3]
+ pmaddubsw m2, [r4 + %5 * 16]
+ pmulhrsw m2, m1
+ %endif
+ %if %6 == 0
+ pmovzxbw m3, m3
+ %else
+ pshufb m3, [r3]
+ pmaddubsw m3, [r4 + %6 * 16]
+ pmulhrsw m3, m1
+ %endif
+ %if %5 == 0
+ packuswb m3, m3
+ movlhps m2, m3
+ %else
+ packuswb m2, m3
+ %endif
+ %if %7 == 0
+ %else
+ pshufb m4, [r3]
+ pmaddubsw m4, [r4 + %7 * 16]
+ pmulhrsw m4, m1
+ %endif
+ %if %8 == 0
+ pmovzxbw m5, m5
+ %else
+ pshufb m5, [r3]
+ pmaddubsw m5, [r4 + %8 * 16]
+ pmulhrsw m5, m1
+ %endif
+ %if %7 == 0
+ packuswb m5, m5
+ movlhps m4, m5
+ %else
+ packuswb m4, m5
+ %endif
+ %if %9 == 0
+ %else
+ pshufb m6, [r3]
+ pmaddubsw m6, [r4 + %9 * 16]
+ pmulhrsw m6, m1
+ %endif
+ %if %10 == 0
+ pmovzxbw m7, m7
+ %else
+ pshufb m7, [r3]
+ pmaddubsw m7, [r4 + %10 * 16]
+ pmulhrsw m7, m1
+ %endif
+ %if %9 == 0
+ packuswb m7, m7
+ movlhps m6, m7
+ %else
+ packuswb m6, m7
+ %endif
- lea r3, [r1 * 3]
+ %if %2 == 1
+ ; transpose
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ punpckhbw m3, m0, m1
+ punpcklbw m0, m1
- movu [r0], m0
- movu [r0 + 16], m1
- palignr m2, m1, m0, 1
- movu [r0 + r1], m2
- palignr m2, m3, m1, 1
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 2
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 2
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 3
- movu [r0 + r3], m2
- palignr m2, m3, m1, 3
- movu [r0 + r3 + 16], m2
+ punpckhbw m1, m4, m6
+ punpcklbw m4, m6
+ punpckhbw m6, m4, m1
+ punpcklbw m4, m1
- lea r0, [r0 + r1 * 4]
-
- palignr m2, m1, m0, 4
- movu [r0], m2
- palignr m2, m3, m1, 4
- movu [r0 + 16], m2
- palignr m2, m1, m0, 5
- movu [r0 + r1], m2
- palignr m2, m3, m1, 5
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 6
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 6
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 7
- movu [r0 + r3], m2
- palignr m2, m3, m1, 7
- movu [r0 + r3 + 16], m2
-
- lea r0, [r0 + r1 * 4]
-
- palignr m2, m1, m0, 8
- movu [r0], m2
- palignr m2, m3, m1, 8
- movu [r0 + 16], m2
- palignr m2, m1, m0, 9
- movu [r0 + r1], m2
- palignr m2, m3, m1, 9
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 10
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 10
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 11
- movu [r0 + r3], m2
- palignr m2, m3, m1, 11
- movu [r0 + r3 + 16], m2
-
- lea r0, [r0 + r1 * 4]
-
- palignr m2, m1, m0, 12
- movu [r0], m2
- palignr m2, m3, m1, 12
- movu [r0 + 16], m2
- palignr m2, m1, m0, 13
- movu [r0 + r1], m2
- palignr m2, m3, m1, 13
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 14
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 14
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 15
- movu [r0 + r3], m2
- palignr m2, m3, m1, 15
- movu [r0 + r3 + 16], m2
-
- lea r0, [r0 + r1 * 4]
-
- movu [r0], m1
- movu m0, [r2 + 50]
- movu [r0 + 16], m3
- palignr m2, m3, m1, 1
- movu [r0 + r1], m2
- palignr m2, m0, m3, 1
- movu [r0 + r1 + 16], m2
- palignr m2, m3, m1, 2
- movu [r0 + r1 * 2], m2
- palignr m2, m0, m3, 2
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m3, m1, 3
- movu [r0 + r3], m2
- palignr m2, m0, m3, 3
- movu [r0 + r3 + 16], m2
-
- lea r0, [r0 + r1 * 4]
-
- palignr m2, m3, m1, 4
- movu [r0], m2
- palignr m2, m0, m3, 4
- movu [r0 + 16], m2
- palignr m2, m3, m1, 5
- movu [r0 + r1], m2
- palignr m2, m0, m3, 5
- movu [r0 + r1 + 16], m2
- palignr m2, m3, m1, 6
- movu [r0 + r1 * 2], m2
- palignr m2, m0, m3, 6
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m3, m1, 7
- movu [r0 + r3], m2
- palignr m2, m0, m3, 7
- movu [r0 + r3 + 16], m2
-
- lea r0, [r0 + r1 * 4]
-
- palignr m2, m3, m1, 8
- movu [r0], m2
- palignr m2, m0, m3, 8
- movu [r0 + 16], m2
- palignr m2, m3, m1, 9
- movu [r0 + r1], m2
- palignr m2, m0, m3, 9
- movu [r0 + r1 + 16], m2
- palignr m2, m3, m1, 10
- movu [r0 + r1 * 2], m2
- palignr m2, m0, m3, 10
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m3, m1, 11
- movu [r0 + r3], m2
- palignr m2, m0, m3, 11
- movu [r0 + r3 + 16], m2
-
- lea r0, [r0 + r1 * 4]
-
- palignr m2, m3, m1, 12
- movu [r0], m2
- palignr m2, m0, m3, 12
- movu [r0 + 16], m2
- palignr m2, m3, m1, 13
- movu [r0 + r1], m2
- palignr m2, m0, m3, 13
- movu [r0 + r1 + 16], m2
- palignr m2, m3, m1, 14
- movu [r0 + r1 * 2], m2
- palignr m2, m0, m3, 14
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m3, m1, 15
- movu [r0 + r3], m2
- palignr m2, m0, m3, 15
- movu [r0 + r3 + 16], m2
- RET
-
-; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
-%macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
- %if %3 == 0
- %else
- pshufb m0, [r3]
- pmaddubsw m0, [r4 + %3 * 16]
- pmulhrsw m0, [pw_1024]
- %endif
- %if %4 == 0
- pmovzxbw m1, m1
- %else
- pshufb m1, [r3]
- pmaddubsw m1, [r4 + %4 * 16]
- pmulhrsw m1, [pw_1024]
- %endif
- %if %3 == 0
- packuswb m1, m1
- movlhps m0, m1
- %else
- packuswb m0, m1
- %endif
- mova m1, [pw_1024]
- %if %5 == 0
- %else
- pshufb m2, [r3]
- pmaddubsw m2, [r4 + %5 * 16]
- pmulhrsw m2, m1
- %endif
- %if %6 == 0
- pmovzxbw m3, m3
- %else
- pshufb m3, [r3]
- pmaddubsw m3, [r4 + %6 * 16]
- pmulhrsw m3, m1
- %endif
- %if %5 == 0
- packuswb m3, m3
- movlhps m2, m3
- %else
- packuswb m2, m3
- %endif
- %if %7 == 0
- %else
- pshufb m4, [r3]
- pmaddubsw m4, [r4 + %7 * 16]
- pmulhrsw m4, m1
- %endif
- %if %8 == 0
- pmovzxbw m5, m5
- %else
- pshufb m5, [r3]
- pmaddubsw m5, [r4 + %8 * 16]
- pmulhrsw m5, m1
- %endif
- %if %7 == 0
- packuswb m5, m5
- movlhps m4, m5
- %else
- packuswb m4, m5
- %endif
- %if %9 == 0
- %else
- pshufb m6, [r3]
- pmaddubsw m6, [r4 + %9 * 16]
- pmulhrsw m6, m1
- %endif
- %if %10 == 0
- pmovzxbw m7, m7
- %else
- pshufb m7, [r3]
- pmaddubsw m7, [r4 + %10 * 16]
- pmulhrsw m7, m1
- %endif
- %if %9 == 0
- packuswb m7, m7
- movlhps m6, m7
- %else
- packuswb m6, m7
- %endif
-
- %if %2 == 1
- ; transpose
- punpckhbw m1, m0, m2
- punpcklbw m0, m2
- punpckhbw m3, m0, m1
- punpcklbw m0, m1
-
- punpckhbw m1, m4, m6
- punpcklbw m4, m6
- punpckhbw m6, m4, m1
- punpcklbw m4, m1
-
- punpckhdq m2, m0, m4
- punpckldq m0, m4
- punpckldq m4, m3, m6
- punpckhdq m3, m6
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckldq m4, m3, m6
+ punpckhdq m3, m6
movh [r0 + + %1 * 8], m0
movhps [r0 + r1 + %1 * 8], m0
@@ -6226,24 +6144,6 @@ cglobal intra_pred_ang32_2, 3,4,4
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_3, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_3_33 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
%macro MODE_4_32 1
movu m0, [r2 + 1]
@@ -6377,24 +6277,6 @@ cglobal intra_pred_ang32_3, 3,7,8
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;-----------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_4, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_4_32 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
%macro MODE_5_31 1
movu m0, [r2 + 1]
@@ -6528,24 +6410,6 @@ cglobal intra_pred_ang32_4, 3,7,8
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_5, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_5_31 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
%macro MODE_6_30 1
movu m0, [r2 + 1]
@@ -6668,24 +6532,6 @@ cglobal intra_pred_ang32_5, 3,7,8
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_6, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_6_30 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
%macro MODE_7_29 1
movu m0, [r2 + 1]
@@ -6801,24 +6647,6 @@ cglobal intra_pred_ang32_6, 3,7,8
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_7, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_7_29 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
%macro MODE_8_28 1
movu m0, [r2 + 1]
@@ -6931,24 +6759,6 @@ cglobal intra_pred_ang32_7, 3,7,8
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_8, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_8_28 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
%macro MODE_9_27 1
movu m2, [r2 + 1]
@@ -7051,97 +6861,466 @@ cglobal intra_pred_ang32_8, 3,7,8
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_9, 3,7,8
- lea r3, [ang_table + 16 * 16]
- mov r4d, 4
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
- mova m7, [pw_1024]
-.loop:
- MODE_9_27 1
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
- dec r4
- jnz .loop
- RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
-%define m8 [rsp + 0 * mmsize]
-%define m9 [rsp + 1 * mmsize]
- lea r4, [r1 * 3]
- pxor m7, m7
- mov r6, 2
- movu m0, [r3]
- movu m1, [r3 + 1]
- mova m8, m0
- mova m9, m1
- mov r3d, r5d
+%macro MODE_12_24 1
+ movu m2, [r2]
+ palignr m1, m2, 1
+ punpckhbw m0, m2, m1
+ punpcklbw m2, m1
+ palignr m0, m2, 2
+ pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
+ pmulhrsw m4, m7
+ pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
+ pmulhrsw m3, m7
+ packuswb m4, m3
+ pmaddubsw m5, m0, [r4 + 16] ; [17]
+ pmulhrsw m5, m7
+ pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
+ pmulhrsw m6, m7
+ packuswb m5, m6
+ pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
+ pmulhrsw m6, m7
+ pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
+ pmulhrsw m3, m7
+ packuswb m6, m3
+ pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
+ pmulhrsw m1, m7
+ pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+ pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
+ pmulhrsw m5, m7
+ pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
+ pmulhrsw m6, m7
+ packuswb m5, m6
+ movu m0, [r2 - 2]
+ palignr m1, m0, 1
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ palignr m2, m0, 2
+ pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
+ pmulhrsw m6, m7
+ pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
+ pmulhrsw m1, m7
+ packuswb m6, m1
+ pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
+ pmulhrsw m1, m7
+ pmaddubsw m3, m2, [r4] ; [16]
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
+ pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
+ pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
+ pmulhrsw m3, m7
+ packuswb m4, m3
+ pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
+ pmulhrsw m5, m7
+ movu m0, [r2 - 3]
+ palignr m1, m0, 1
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ palignr m2, m0, 2
+ pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
+ pmulhrsw m6, m7
+ packuswb m5, m6
+ pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
+ pmulhrsw m6, m7
+ pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
+ pmulhrsw m3, m7
+ packuswb m6, m3
+ pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
+ pmulhrsw m1, m7
+ pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
+ pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
+ pmulhrsw m4, m7
+ movu m2, [r2 - 4]
+ palignr m1, m2, 1
+ punpckhbw m0, m2, m1
+ punpcklbw m2, m1
+ palignr m0, m2, 2
+ pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
+ pmulhrsw m5, m7
+ pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
+ pmulhrsw m6, m7
+ packuswb m5, m6
+ pmaddubsw m6, m0, [r4 - 16] ; [15]
+ pmulhrsw m6, m7
+ pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
+ pmulhrsw m1, m7
+ packuswb m6, m1
+ pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
+ pmulhrsw m1, m7
+ movu m2, [pb_fact0]
+ pshufb m0, m2
+ pmovzxbw m0, m0
+ packuswb m1, m0
+ TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
+%endmacro
-.loop:
- movu m0, [r2 + 1]
- palignr m1, m0, 1
- pshufb m1, m7
- palignr m2, m0, 2
- pshufb m2, m7
- palignr m3, m0, 3
- pshufb m3, m7
- palignr m4, m0, 4
- pshufb m4, m7
- palignr m5, m0, 5
- pshufb m5, m7
- palignr m6, m0, 6
- pshufb m6, m7
+;------------------------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang32_2, 3,5,4
+ lea r4, [r2]
+ add r2, 64
+ cmp r3m, byte 34
+ cmove r2, r4
+ movu m0, [r2 + 2]
+ movu m1, [r2 + 18]
+ movu m3, [r2 + 34]
- movu [r0 + r1], m1
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 * 2], m2
- movu [r0 + r1 * 2 + 16], m2
- movu [r0 + r4], m3
- movu [r0 + r4 + 16], m3
- lea r5, [r0 + r1 * 4]
- movu [r5], m4
- movu [r5 + 16], m4
- movu [r5 + r1], m5
- movu [r5 + r1 + 16], m5
- movu [r5 + r1 * 2], m6
- movu [r5 + r1 * 2 + 16], m6
+ lea r3, [r1 * 3]
- palignr m1, m0, 7
- pshufb m1, m7
- movhlps m2, m0
- pshufb m2, m7
- palignr m3, m0, 9
- pshufb m3, m7
- palignr m4, m0, 10
- pshufb m4, m7
- palignr m5, m0, 11
- pshufb m5, m7
- palignr m6, m0, 12
- pshufb m6, m7
+ movu [r0], m0
+ movu [r0 + 16], m1
+ palignr m2, m1, m0, 1
+ movu [r0 + r1], m2
+ palignr m2, m3, m1, 1
+ movu [r0 + r1 + 16], m2
+ palignr m2, m1, m0, 2
+ movu [r0 + r1 * 2], m2
+ palignr m2, m3, m1, 2
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m1, m0, 3
+ movu [r0 + r3], m2
+ palignr m2, m3, m1, 3
+ movu [r0 + r3 + 16], m2
- movu [r5 + r4], m1
- movu [r5 + r4 + 16], m1
- lea r5, [r5 + r1 * 4]
- movu [r5], m2
- movu [r5 + 16], m2
- movu [r5 + r1], m3
- movu [r5 + r1 + 16], m3
- movu [r5 + r1 * 2], m4
- movu [r5 + r1 * 2 + 16], m4
- movu [r5 + r4], m5
- movu [r5 + r4 + 16], m5
- lea r5, [r5 + r1 * 4]
- movu [r5], m6
- movu [r5 + 16], m6
+ lea r0, [r0 + r1 * 4]
+
+ palignr m2, m1, m0, 4
+ movu [r0], m2
+ palignr m2, m3, m1, 4
+ movu [r0 + 16], m2
+ palignr m2, m1, m0, 5
+ movu [r0 + r1], m2
+ palignr m2, m3, m1, 5
+ movu [r0 + r1 + 16], m2
+ palignr m2, m1, m0, 6
+ movu [r0 + r1 * 2], m2
+ palignr m2, m3, m1, 6
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m1, m0, 7
+ movu [r0 + r3], m2
+ palignr m2, m3, m1, 7
+ movu [r0 + r3 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+
+ palignr m2, m1, m0, 8
+ movu [r0], m2
+ palignr m2, m3, m1, 8
+ movu [r0 + 16], m2
+ palignr m2, m1, m0, 9
+ movu [r0 + r1], m2
+ palignr m2, m3, m1, 9
+ movu [r0 + r1 + 16], m2
+ palignr m2, m1, m0, 10
+ movu [r0 + r1 * 2], m2
+ palignr m2, m3, m1, 10
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m1, m0, 11
+ movu [r0 + r3], m2
+ palignr m2, m3, m1, 11
+ movu [r0 + r3 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+
+ palignr m2, m1, m0, 12
+ movu [r0], m2
+ palignr m2, m3, m1, 12
+ movu [r0 + 16], m2
+ palignr m2, m1, m0, 13
+ movu [r0 + r1], m2
+ palignr m2, m3, m1, 13
+ movu [r0 + r1 + 16], m2
+ palignr m2, m1, m0, 14
+ movu [r0 + r1 * 2], m2
+ palignr m2, m3, m1, 14
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m1, m0, 15
+ movu [r0 + r3], m2
+ palignr m2, m3, m1, 15
+ movu [r0 + r3 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+
+ movu [r0], m1
+ movu m0, [r2 + 50]
+ movu [r0 + 16], m3
+ palignr m2, m3, m1, 1
+ movu [r0 + r1], m2
+ palignr m2, m0, m3, 1
+ movu [r0 + r1 + 16], m2
+ palignr m2, m3, m1, 2
+ movu [r0 + r1 * 2], m2
+ palignr m2, m0, m3, 2
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m3, m1, 3
+ movu [r0 + r3], m2
+ palignr m2, m0, m3, 3
+ movu [r0 + r3 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+
+ palignr m2, m3, m1, 4
+ movu [r0], m2
+ palignr m2, m0, m3, 4
+ movu [r0 + 16], m2
+ palignr m2, m3, m1, 5
+ movu [r0 + r1], m2
+ palignr m2, m0, m3, 5
+ movu [r0 + r1 + 16], m2
+ palignr m2, m3, m1, 6
+ movu [r0 + r1 * 2], m2
+ palignr m2, m0, m3, 6
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m3, m1, 7
+ movu [r0 + r3], m2
+ palignr m2, m0, m3, 7
+ movu [r0 + r3 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+
+ palignr m2, m3, m1, 8
+ movu [r0], m2
+ palignr m2, m0, m3, 8
+ movu [r0 + 16], m2
+ palignr m2, m3, m1, 9
+ movu [r0 + r1], m2
+ palignr m2, m0, m3, 9
+ movu [r0 + r1 + 16], m2
+ palignr m2, m3, m1, 10
+ movu [r0 + r1 * 2], m2
+ palignr m2, m0, m3, 10
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m3, m1, 11
+ movu [r0 + r3], m2
+ palignr m2, m0, m3, 11
+ movu [r0 + r3 + 16], m2
+
+ lea r0, [r0 + r1 * 4]
+
+ palignr m2, m3, m1, 12
+ movu [r0], m2
+ palignr m2, m0, m3, 12
+ movu [r0 + 16], m2
+ palignr m2, m3, m1, 13
+ movu [r0 + r1], m2
+ palignr m2, m0, m3, 13
+ movu [r0 + r1 + 16], m2
+ palignr m2, m3, m1, 14
+ movu [r0 + r1 * 2], m2
+ palignr m2, m0, m3, 14
+ movu [r0 + r1 * 2 + 16], m2
+ palignr m2, m3, m1, 15
+ movu [r0 + r3], m2
+ palignr m2, m0, m3, 15
+ movu [r0 + r3 + 16], m2
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_3, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_3_33 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_4, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_4_32 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_5, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_5_31 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_6, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_6_30 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_7, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_7_29 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_8, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_8_28 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_9, 3,7,8
+ add r2, 64
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 4
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m7, [pw_1024]
+.loop:
+ MODE_9_27 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec r4
+ jnz .loop
+ RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize)
+%define m8 [rsp + 0 * mmsize]
+%define m9 [rsp + 1 * mmsize]
+ pxor m7, m7
+ mov r6, 2
+ movu m0, [r2]
+ movu m1, [r2 + 1]
+ mova m8, m0
+ mova m9, m1
+ mov r3d, r4d
+ lea r4, [r1 * 3]
+
+.loop:
+ movu m0, [r2 + 1 + 64]
+ palignr m1, m0, 1
+ pshufb m1, m7
+ palignr m2, m0, 2
+ pshufb m2, m7
+ palignr m3, m0, 3
+ pshufb m3, m7
+ palignr m4, m0, 4
+ pshufb m4, m7
+ palignr m5, m0, 5
+ pshufb m5, m7
+ palignr m6, m0, 6
+ pshufb m6, m7
+
+ movu [r0 + r1], m1
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 16], m2
+ movu [r0 + r4], m3
+ movu [r0 + r4 + 16], m3
+ lea r5, [r0 + r1 * 4]
+ movu [r5], m4
+ movu [r5 + 16], m4
+ movu [r5 + r1], m5
+ movu [r5 + r1 + 16], m5
+ movu [r5 + r1 * 2], m6
+ movu [r5 + r1 * 2 + 16], m6
+
+ palignr m1, m0, 7
+ pshufb m1, m7
+ movhlps m2, m0
+ pshufb m2, m7
+ palignr m3, m0, 9
+ pshufb m3, m7
+ palignr m4, m0, 10
+ pshufb m4, m7
+ palignr m5, m0, 11
+ pshufb m5, m7
+ palignr m6, m0, 12
+ pshufb m6, m7
+
+ movu [r5 + r4], m1
+ movu [r5 + r4 + 16], m1
+ lea r5, [r5 + r1 * 4]
+ movu [r5], m2
+ movu [r5 + 16], m2
+ movu [r5 + r1], m3
+ movu [r5 + r1 + 16], m3
+ movu [r5 + r1 * 2], m4
+ movu [r5 + r1 * 2 + 16], m4
+ movu [r5 + r4], m5
+ movu [r5 + r4 + 16], m5
+ lea r5, [r5 + r1 * 4]
+ movu [r5], m6
+ movu [r5 + 16], m6
palignr m1, m0, 13
pshufb m1, m7
@@ -7189,26 +7368,23 @@ cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
jnz .loop
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
-
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3 + 16]
+ movu m0, [r2 + 16]
pxor m1, m1
pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
mova [rsp], m0
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
+ movu m0, [r2 + 64]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 16 + 64]
+ movu m2, [r2 + 32 + 64]
movu [rsp + 1], m0
movu [rsp + 1 + 16], m1
movu [rsp + 1 + 32], m2
@@ -7282,6 +7458,9 @@ cglobal intra_pred_ang32_11, 4,7,8
pinsrb m0, [r3 + 26], 12
mova above, m0
movu m2, [r2]
+ %if %1 == 1
+ pinsrb m2, [r3], 0
+ %endif
palignr m1, m2, 1
punpcklbw m2, m1
pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
@@ -7300,6 +7479,9 @@ cglobal intra_pred_ang32_11, 4,7,8
pmulhrsw m3, m7
packuswb m6, m3
movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ %if %1 == 1
+ pinsrb m1, [r3], 0
+ %endif
palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
@@ -7370,129 +7552,20 @@ cglobal intra_pred_ang32_11, 4,7,8
pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
- pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
- pmulhrsw m1, m7
- movu m0, [pb_fact0]
- pshufb m2, m0
- pmovzxbw m2, m2
- packuswb m1, m2
- TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
-%endmacro
-
-%macro MODE_12_24 1
- movu m2, [r2]
- palignr m1, m2, 1
- punpckhbw m0, m2, m1
- punpcklbw m2, m1
- palignr m0, m2, 2
- pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
- pmulhrsw m4, m7
- pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
- pmulhrsw m3, m7
- packuswb m4, m3
- pmaddubsw m5, m0, [r4 + 16] ; [17]
- pmulhrsw m5, m7
- pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
- pmulhrsw m6, m7
- packuswb m5, m6
- pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
- pmulhrsw m6, m7
- pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
- pmulhrsw m3, m7
- packuswb m6, m3
- pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
- pmulhrsw m1, m7
- pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
- pmulhrsw m3, m7
- packuswb m1, m3
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
- pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
- pmulhrsw m4, m7
- pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
- pmulhrsw m5, m7
- packuswb m4, m5
- pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
- pmulhrsw m5, m7
- pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
- pmulhrsw m6, m7
- packuswb m5, m6
- movu m0, [r2 - 2]
- palignr m1, m0, 1
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- palignr m2, m0, 2
- pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
- pmulhrsw m6, m7
- pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
- pmulhrsw m1, m7
- packuswb m6, m1
- pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
- pmulhrsw m1, m7
- pmaddubsw m3, m2, [r4] ; [16]
- pmulhrsw m3, m7
- packuswb m1, m3
- TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
- pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
- pmulhrsw m4, m7
- pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
- pmulhrsw m3, m7
- packuswb m4, m3
- pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
- pmulhrsw m5, m7
- movu m0, [r2 - 3]
- palignr m1, m0, 1
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- palignr m2, m0, 2
- pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
- pmulhrsw m6, m7
- packuswb m5, m6
- pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
- pmulhrsw m6, m7
- pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
- pmulhrsw m3, m7
- packuswb m6, m3
- pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
- pmulhrsw m1, m7
- pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
- pmulhrsw m3, m7
- packuswb m1, m3
- TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
- pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
- pmulhrsw m4, m7
- movu m2, [r2 - 4]
- palignr m1, m2, 1
- punpckhbw m0, m2, m1
- punpcklbw m2, m1
- palignr m0, m2, 2
- pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
- pmulhrsw m5, m7
- packuswb m4, m5
- pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
- pmulhrsw m5, m7
- pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
- pmulhrsw m6, m7
- packuswb m5, m6
- pmaddubsw m6, m0, [r4 - 16] ; [15]
- pmulhrsw m6, m7
- pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
- pmulhrsw m1, m7
- packuswb m6, m1
- pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
- pmulhrsw m1, m7
- movu m2, [pb_fact0]
- pshufb m0, m2
- pmovzxbw m0, m0
- packuswb m1, m0
+ pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
+ pmulhrsw m1, m7
+ movu m0, [pb_fact0]
+ pshufb m2, m0
+ pmovzxbw m2, m2
+ packuswb m1, m2
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;-----------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------------------------------------------
+
INIT_XMM sse4
-cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
+cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
-
+ mov r3, r2
+ add r2, 64
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
@@ -7521,6 +7594,9 @@ cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
pshufb m0, [c_mode32_13_shuf]
mova above, m0
movu m2, [r2]
+ %if (%1 == 1)
+ pinsrb m2, [r3], 0
+ %endif
palignr m1, m2, 1
punpcklbw m2, m1
pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
@@ -7531,6 +7607,9 @@ cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
pmulhrsw m5, m7
movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ %if (%1 == 1)
+ pinsrb m1, [r3], 0
+ %endif
palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
@@ -7625,7 +7704,7 @@ cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-%macro MODE_13_23 1
+%macro MODE_13_23 2
movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
@@ -7722,6 +7801,9 @@ cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
pmulhrsw m6, m7
packuswb m5, m6
movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+ %if ((%1 & %2) == 1)
+ pinsrb m2, [r3], 0
+ %endif
palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
@@ -7737,12 +7819,12 @@ cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
packuswb m1, m2
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
-;-----------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------------------------------------------
+
INIT_XMM sse4
-cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
+cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
+ mov r3, r2
+ add r2, 64
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
@@ -7752,9 +7834,14 @@ cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 7
- mov r3, 3
+
+ MODE_13_23 1, 1
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ mov r3, 2
.loop:
- MODE_13_23 1
+ MODE_13_23 1, 0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
@@ -7762,11 +7849,8 @@ cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
jnz .loop
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_14, 4,7,8
+cglobal intra_pred_ang32_14, 3,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
@@ -7774,15 +7858,15 @@ cglobal intra_pred_ang32_14, 4,7,8
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 15]
+ movu m0, [r2]
+ movu m1, [r2 + 15]
pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
mova [rsp], m0
- movu m0, [r2 + 1]
- movu m1, [r2 + 1 + 16]
+ movu m0, [r2 + 1 + 64]
+ movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 13], m0
movu [rsp + 13 + 16], m1
mov [rsp + 63], byte 4
@@ -7849,9 +7933,6 @@ cglobal intra_pred_ang32_14, 4,7,8
mov rsp, [rsp+64]
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_15, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
@@ -7861,14 +7942,14 @@ cglobal intra_pred_ang32_15, 4,7,8
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 15]
+ movu m0, [r2]
+ movu m1, [r2 + 15]
pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
mova [rsp], m1
movu [rsp + 8], m0
- movu m0, [r2 + 1]
- movu m1, [r2 + 1 + 16]
+ movu m0, [r2 + 1 + 64]
+ movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 17], m0
movu [rsp + 17 + 16], m1
mov [rsp + 63], byte 4
@@ -7935,9 +8016,6 @@ cglobal intra_pred_ang32_15, 4,7,8
mov rsp, [rsp+64]
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_16, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
@@ -7947,14 +8025,14 @@ cglobal intra_pred_ang32_16, 4,7,8
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 15]
+ movu m0, [r2]
+ movu m1, [r2 + 15]
pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
mova [rsp], m1
movu [rsp + 10], m0
- movu m0, [r2 + 1]
- movu m1, [r2 + 1 + 16]
+ movu m0, [r2 + 1 + 64]
+ movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 21], m0
movu [rsp + 21 + 16], m1
mov [rsp + 63], byte 4
@@ -8021,9 +8099,6 @@ cglobal intra_pred_ang32_16, 4,7,8
mov rsp, [rsp+64]
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_17, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
@@ -8033,14 +8108,14 @@ cglobal intra_pred_ang32_17, 4,7,8
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 16]
+ movu m0, [r2]
+ movu m1, [r2 + 16]
pshufb m0, [c_mode32_17_0]
pshufb m1, [c_mode32_17_0]
mova [rsp ], m1
movu [rsp + 13], m0
- movu m0, [r2 + 1]
- movu m1, [r2 + 1 + 16]
+ movu m0, [r2 + 1 + 64]
+ movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 26], m0
movu [rsp + 26 + 16], m1
mov [rsp + 63], byte 4
@@ -8108,15 +8183,12 @@ cglobal intra_pred_ang32_17, 4,7,8
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
- movu m0, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
- movu m1, [r3 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
- movu m2, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
- movu m3, [r2 + 17] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
+ movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+ movu m1, [r2 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
+ movu m2, [r2 + 1 + 64] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 17 + 64] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
lea r2, [r1 * 2]
lea r3, [r1 * 3]
@@ -8273,21 +8345,18 @@ cglobal intra_pred_ang32_18, 4,5,5
movu [r0 + r3 + 16], m4
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
- xchg r2, r3
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 16]
+ movu m0, [r2 + 64]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 16 + 64]
pshufb m0, [c_mode32_17_0]
pshufb m1, [c_mode32_17_0]
mova [rsp ], m1
@@ -8363,21 +8432,18 @@ cglobal intra_pred_ang32_19, 4,7,8
mov rsp, [rsp+64]
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
- xchg r2, r3
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 15]
+ movu m0, [r2 + 64]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 15 + 64]
pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
mova [rsp], m1
@@ -8453,21 +8519,18 @@ cglobal intra_pred_ang32_20, 4,7,8
mov rsp, [rsp+64]
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_21, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
- xchg r2, r3
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 15]
+ movu m0, [r2 + 64]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 15 + 64]
pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
mova [rsp], m1
@@ -8543,22 +8606,18 @@ cglobal intra_pred_ang32_21, 4,7,8
mov rsp, [rsp+64]
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
-
- xchg r2, r3
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3]
- movu m1, [r3 + 15]
+ movu m0, [r2 + 64]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 15 + 64]
pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
@@ -8635,13 +8694,10 @@ cglobal intra_pred_ang32_22, 4,7,8
mov rsp, [rsp+64]
RET
-;-----------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
- xchg r2, r3
+ lea r3, [r2 + 64]
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
@@ -8653,7 +8709,7 @@ cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
add r2, 7
mov r3, 3
.loop:
- MODE_13_23 0
+ MODE_13_23 0, 0
add r6, 8
mov r0, r6
add r2, 8
@@ -8661,13 +8717,10 @@ cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
jnz .loop
RET
-;-----------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
- xchg r2, r3
+ lea r3, [r2 + 64]
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
@@ -8687,20 +8740,16 @@ cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
jnz .loop
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
- xchg r2, r3
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6
; collect reference pixel
- movu m0, [r3 + 16]
+ movu m0, [r2 + 16 + 64]
pxor m1, m1
pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
mova [rsp], m0
@@ -8777,23 +8826,21 @@ cglobal intra_pred_ang32_25, 4,7,8
mov rsp, [rsp+64]
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
+cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
- lea r4, [r1 * 3]
mov r6, 2
- movu m0, [r2]
- movu m1, [r2 + 1]
+ movu m0, [r2 + 64]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 1 + 64]
mova m8, m0
mova m9, m1
- mov r2d, r5d
+ mov r3d, r4d
+ lea r4, [r1 * 3]
.loop:
- movu m0, [r3 + 1]
+ movu m0, [r2 + 1]
movu [r0], m0
movu [r0 + r1], m0
@@ -8851,7 +8898,7 @@ cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
movu [r5 + r4], m0
; filter
- cmp r2d, byte 0
+ cmp r3d, byte 0
jz .quit
pxor m4, m4
@@ -8895,18 +8942,14 @@ cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
pextrb [r5 + r4], m0, 15
.quit:
- lea r3, [r3 + 16]
+ lea r2, [r2 + 16]
add r0, 16
dec r6d
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -8921,12 +8964,8 @@ cglobal intra_pred_ang32_27, 3,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -8941,12 +8980,8 @@ cglobal intra_pred_ang32_28, 3,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -8961,12 +8996,8 @@ cglobal intra_pred_ang32_29, 3,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -8981,12 +9012,8 @@ cglobal intra_pred_ang32_30, 3,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -9001,12 +9028,8 @@ cglobal intra_pred_ang32_31, 3,7,8
jnz .loop
RET
-;-----------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
- mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -9021,12 +9044,8 @@ cglobal intra_pred_ang32_32, 3,7,8
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
- xchg r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 4
lea r5, [r1 * 3]
@@ -9042,14 +9061,14 @@ cglobal intra_pred_ang32_33, 3,7,8
RET
;-----------------------------------------------------------------------------
-; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal all_angs_pred_4x4, 6, 6, 8
+cglobal all_angs_pred_4x4, 4, 4, 8
; mode 2
-movh m0, [r2 + 2]
+movh m0, [r1 + 10]
movd [r0], m0
palignr m1, m0, 1
@@ -9058,200 +9077,197 @@ movd [r0 + 4], m1
palignr m1, m0, 2
movd [r0 + 8], m1
-psrldq m0, 3
-movd [r0 + 12], m0
+palignr m1, m0, 3
+movd [r0 + 12], m1
; mode 3
-mova m0, [pw_1024]
+mova m2, [pw_1024]
-movh m1, [r2 + 1]
+pslldq m1, m0, 1
+pinsrb m1, [r1 + 9], 0
+punpcklbw m1, m0
-palignr m2, m1, 1
-punpcklbw m1, m2
+lea r3, [ang_table]
-lea r5, [ang_table]
+pmaddubsw m6, m1, [r3 + 26 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 16], m6
-pmaddubsw m5, m1, [r5 + 26 * 16]
-pmulhrsw m5, m0
-packuswb m5, m5
-movd [r0 + 16], m5
+palignr m0, m1, 2
-palignr m2, m1, 2
+mova m7, [r3 + 20 * 16]
-mova m7, [r5 + 20 * 16]
+pmaddubsw m3, m0, m7
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 20], m3
-pmaddubsw m6, m2, m7
-pmulhrsw m6, m0
-packuswb m6, m6
-movd [r0 + 20], m6
+; mode 6 [row 3]
+movd [r0 + 76], m3
-palignr m3, m1, 4
+palignr m3, m1, 4
-pmaddubsw m4, m3, [r5 + 14 * 16]
-pmulhrsw m4, m0
+pmaddubsw m4, m3, [r3 + 14 * 16]
+pmulhrsw m4, m2
packuswb m4, m4
movd [r0 + 24], m4
palignr m4, m1, 6
-pmaddubsw m4, [r5 + 8 * 16]
-pmulhrsw m4, m0
+pmaddubsw m4, [r3 + 8 * 16]
+pmulhrsw m4, m2
packuswb m4, m4
movd [r0 + 28], m4
; mode 4
-pmaddubsw m4, m1, [r5 + 21 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 32], m4
+pmaddubsw m5, m1, [r3 + 21 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 32], m5
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 36], m4
+pmaddubsw m5, m0, [r3 + 10 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 36], m5
-pmaddubsw m4, m2, [r5 + 31 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 40], m4
+pmaddubsw m5, m0, [r3 + 31 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 40], m5
pmaddubsw m4, m3, m7
-pmulhrsw m4, m0
+pmulhrsw m4, m2
packuswb m4, m4
movd [r0 + 44], m4
; mode 5
-pmaddubsw m4, m1, [r5 + 17 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 48], m4
+pmaddubsw m5, m1, [r3 + 17 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 48], m5
-pmaddubsw m4, m2, [r5 + 2 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 52], m4
+pmaddubsw m5, m0, [r3 + 2 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 52], m5
-pmaddubsw m4, m2, [r5 + 19 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 56], m4
+pmaddubsw m5, m0, [r3 + 19 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 56], m5
-pmaddubsw m3, [r5 + 4 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 60], m3
+pmaddubsw m4, m3, [r3 + 4 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 60], m4
; mode 6
-pmaddubsw m3, m1, [r5 + 13 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 64], m3
-
-movd [r0 + 68], m5
+pmaddubsw m5, m1, [r3 + 13 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 64], m5
-pmaddubsw m3, m2, [r5 + 7 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 72], m3
+movd [r0 + 68], m6
-movd [r0 + 76], m6
+pmaddubsw m5, m0, [r3 + 7 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 72], m5
; mode 7
-pmaddubsw m3, m1, [r5 + 9 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 80], m3
+pmaddubsw m5, m1, [r3 + 9 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 80], m5
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 84], m3
+pmaddubsw m5, m1, [r3 + 18 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 84], m5
-pmaddubsw m3, m1, [r5 + 27 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 88], m3
+pmaddubsw m5, m1, [r3 + 27 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 88], m5
-pmaddubsw m2, [r5 + 4 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 92], m2
+pmaddubsw m5, m0, [r3 + 4 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 92], m5
; mode 8
-pmaddubsw m2, m1, [r5 + 5 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 96], m2
+pmaddubsw m5, m1, [r3 + 5 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 96], m5
-pmaddubsw m2, m1, [r5 + 10 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 100], m2
+pmaddubsw m5, m1, [r3 + 10 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 100], m5
-pmaddubsw m2, m1, [r5 + 15 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 104], m2
+pmaddubsw m5, m1, [r3 + 15 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 104], m5
-pmaddubsw m2, m1, m7
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 108], m2
+pmaddubsw m5, m1, [r3 + 20 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 108], m5
; mode 9
-pmaddubsw m2, m1, [r5 + 2 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 112], m2
+pmaddubsw m5, m1, [r3 + 2 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 112], m5
-pmaddubsw m2, m1, [r5 + 4 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 116], m2
+pmaddubsw m5, m1, [r3 + 4 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 116], m5
-pmaddubsw m2, m1, [r5 + 6 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 120], m2
+pmaddubsw m5, m1, [r3 + 6 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 120], m5
-pmaddubsw m1, [r5 + 8 * 16]
-pmulhrsw m1, m0
-packuswb m1, m1
-movd [r0 + 124], m1
+pmaddubsw m5, m1, [r3 + 8 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 124], m5
; mode 10
-movh m1, [r2]
-palignr m2, m1, 1
-pshufd m3, m2, 0
-movu [r0 + 128], m3
+movd m3, [r1 + 9]
+pshufd m4, m3, 0
+movu [r0 + 128], m4
-pxor m3, m3
+pxor m5, m5
+movd m7, [r1 + 1]
+pshufd m4, m7, 0
+punpcklbw m4, m5
-pshufb m4, m2, m3
-punpcklbw m4, m3
+pinsrb m7, [r1], 0
+pshufb m6, m7, m5
+punpcklbw m6, m5
-movh m5, [r1]
+psubw m4, m6
+psraw m4, 1
-pshufb m6, m5, m3
-punpcklbw m6, m3
+pshufb m6, m3, m5
+punpcklbw m6, m5
-psrldq m5, 1
-punpcklbw m5, m3
-
-psubw m5, m6
-psraw m5, 1
-
-paddw m4, m5
-
-packuswb m4, m3
+paddw m4, m6
+packuswb m4, m5
pextrb [r0 + 128], m4, 0
pextrb [r0 + 132], m4, 1
@@ -9260,179 +9276,180 @@ pextrb [r0 + 140], m4, 3
; mode 11
-palignr m2, m1, 1
-punpcklbw m1, m2
+pslldq m1, m1, 2
+pinsrb m1, [r1], 0
+pinsrb m1, [r1 + 9], 1
-pmaddubsw m2, m1, [r5 + 30 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 144], m2
+pmaddubsw m3, m1, [r3 + 30 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 144], m3
-pmaddubsw m2, m1, [r5 + 28 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 148], m2
+pmaddubsw m3, m1, [r3 + 28 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 148], m3
-pmaddubsw m2, m1, [r5 + 26 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 152], m2
+pmaddubsw m3, m1, [r3 + 26 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 152], m3
-pmaddubsw m2, m1, [r5 + 24 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 156], m2
+pmaddubsw m3, m1, [r3 + 24 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 156], m3
; mode 12
-pmaddubsw m2, m1, [r5 + 27 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 160], m2
+pmaddubsw m3, m1, [r3 + 27 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 160], m3
-pmaddubsw m2, m1, [r5 + 22 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 164], m2
+pmaddubsw m3, m1, [r3 + 22 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 164], m3
-pmaddubsw m2, m1, [r5 + 17 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 168], m2
+pmaddubsw m3, m1, [r3 + 17 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 168], m3
-pmaddubsw m2, m1, [r5 + 12 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 172], m2
+pmaddubsw m3, m1, [r3 + 12 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 172], m3
; mode 13
-pmaddubsw m2, m1, [r5 + 23 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 176], m2
+pmaddubsw m3, m1, [r3 + 23 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 176], m3
-pmaddubsw m2, m1, [r5 + 14 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 180], m2
+pmaddubsw m3, m1, [r3 + 14 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 180], m3
-pmaddubsw m2, m1, [r5 + 5 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 184], m2
+pmaddubsw m3, m1, [r3 + 5 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 184], m3
-pslldq m2, m1, 2
-pinsrb m2, [r1 + 0], 1
-pinsrb m2, [r1 + 4], 0
+pslldq m5, m1, 2
+pinsrb m5, [r1 + 0], 1
+pinsrb m5, [r1 + 4], 0
-pmaddubsw m3, m2, [r5 + 28 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 188], m3
+pmaddubsw m4, m5, [r3 + 28 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 188], m4
; mode 14
-pmaddubsw m3, m1, [r5 + 19 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 192], m3
+pmaddubsw m4, m1, [r3 + 19 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 192], m4
-pmaddubsw m5, m1, [r5 + 6 * 16]
-pmulhrsw m5, m0
-packuswb m5, m5
-movd [r0 + 196], m5
+pmaddubsw m7, m1, [r3 + 6 * 16]
+pmulhrsw m7, m2
+packuswb m7, m7
+movd [r0 + 196], m7
-pinsrb m2, [r1 + 2], 0
+pinsrb m5, [r1 + 2], 0
-pmaddubsw m3, m2, [r5 + 25 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 200], m3
+pmaddubsw m4, m5, [r3 + 25 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 200], m4
-pmaddubsw m3, m2, [r5 + 12 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 204], m3
+pmaddubsw m4, m5, [r3 + 12 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 204], m4
; mode 15
-pmaddubsw m3, m1, [r5 + 15 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 208], m3
+pmaddubsw m4, m1, [r3 + 15 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 208], m4
-pmaddubsw m3, m2, [r5 + 30 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 212], m3
+pmaddubsw m4, m5, [r3 + 30 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 212], m4
-pmaddubsw m3, m2, [r5 + 13 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 216], m3
+pmaddubsw m4, m5, [r3 + 13 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 216], m4
-pslldq m3, m2, 2
-pinsrb m3, [r1 + 2], 1
-pinsrb m3, [r1 + 4], 0
+pslldq m4, m5, 2
+pinsrb m4, [r1 + 2], 1
+pinsrb m4, [r1 + 4], 0
-pmaddubsw m4, m3, [r5 + 28 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 220], m4
+pmaddubsw m6, m4, [r3 + 28 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 220], m6
; mode 16
-pmaddubsw m4, m1, [r5 + 11 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 224], m4
+pmaddubsw m6, m1, [r3 + 11 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 224], m6
-pmaddubsw m4, m2, [r5 + 22 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 228], m4
+pmaddubsw m6, m5, [r3 + 22 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 228], m6
-pmaddubsw m4, m2, [r5 + 1 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 232], m4
+pmaddubsw m6, m5, [r3 + 1 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 232], m6
-pinsrb m3, [r1 + 3], 0
+pinsrb m4, [r1 + 3], 0
-pmaddubsw m3, [r5 + 12 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 236], m3
+pmaddubsw m4, [r3 + 12 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 236], m4
; mode 17
-movd [r0 + 240], m5
+movd [r0 + 240], m7
pslldq m1, 2
pinsrb m1, [r1 + 1], 0
pinsrb m1, [r1 + 0], 1
-pmaddubsw m2, m1, [r5 + 12 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 244], m2
+pmaddubsw m3, m1, [r3 + 12 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 244], m3
pslldq m1, 2
-pinsrb m1, [r1 + 2], 0
pinsrb m1, [r1 + 1], 1
+pinsrb m1, [r1 + 2], 0
-pmaddubsw m2, m1, [r5 + 18 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 248], m2
+pmaddubsw m3, m1, [r3 + 18 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 248], m3
pslldq m1, 2
-pinsrb m1, [r1 + 4], 0
pinsrb m1, [r1 + 2], 1
+pinsrb m1, [r1 + 4], 0
-pmaddubsw m1, [r5 + 24 * 16]
-pmulhrsw m1, m0
+pmaddubsw m1, [r3 + 24 * 16]
+pmulhrsw m1, m2
packuswb m1, m1
movd [r0 + 252], m1
@@ -9441,22557 +9458,22572 @@ movd [r0 + 252], m1
movh m1, [r1]
movd [r0 + 256], m1
-pslldq m2, m1, 1
-pinsrb m2, [r2 + 1], 0
-movd [r0 + 260], m2
-
-pslldq m3, m2, 1
-pinsrb m3, [r2 + 2], 0
-movd [r0 + 264], m3
+pslldq m3, m1, 1
+pinsrb m3, [r1 + 9], 0
+movd [r0 + 260], m3
pslldq m4, m3, 1
-pinsrb m4, [r2 + 3], 0
+pinsrb m4, [r1 + 10], 0
+movd [r0 + 264], m4
+
+pslldq m4, 1
+pinsrb m4, [r1 + 11], 0
movd [r0 + 268], m4
; mode 19
-palignr m4, m1, 1
-punpcklbw m1, m4
+palignr m3, m1, 1
+punpcklbw m1, m3
-pmaddubsw m5, m1, [r5 + 6 * 16]
-pmulhrsw m5, m0
-packuswb m5, m5
-movd [r0 + 272], m5
+pmaddubsw m7, m1, [r3 + 6 * 16]
+pmulhrsw m7, m2
+packuswb m7, m7
+movd [r0 + 272], m7
-pslldq m2, m1, 2
-pinsrb m2, [r2 + 1], 0
-pinsrb m2, [r2], 1
+pslldq m3, m1, 2
+pinsrb m3, [r1], 1
+pinsrb m3, [r1 + 9], 0
-pmaddubsw m3, m2, [r5 + 12 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 276], m3
+pmaddubsw m4, m3, [r3 + 12 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 276], m4
-pslldq m3, m2, 2
-pinsrb m3, [r2 + 1], 1
-pinsrb m3, [r2 + 2], 0
+pslldq m4, m3, 2
+pinsrb m4, [r1 + 9], 1
+pinsrb m4, [r1 + 10], 0
-pmaddubsw m4, m3, [r5 + 18 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 280], m4
+pmaddubsw m5, m4, [r3 + 18 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 280], m5
-pslldq m3, 2
-pinsrb m3, [r2 + 2], 1
-pinsrb m3, [r2 + 4], 0
+pslldq m4, 2
+pinsrb m4, [r1 + 10], 1
+pinsrb m4, [r1 + 12], 0
-pmaddubsw m3, [r5 + 24 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 284], m3
+pmaddubsw m4, [r3 + 24 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 284], m4
; mode 20
-pmaddubsw m3, m1, [r5 + 11 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 288], m3
+pmaddubsw m4, m1, [r3 + 11 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 288], m4
-pinsrb m2, [r2 + 2], 0
+pinsrb m3, [r1 + 10], 0
-pmaddubsw m3, m2, [r5 + 22 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 292], m3
+pmaddubsw m4, m3, [r3 + 22 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 292], m4
-pmaddubsw m3, m2, [r5 + 1 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 296], m3
+pmaddubsw m4, m3, [r3 + 1 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 296], m4
-pslldq m3, m2, 2
-pinsrb m3, [r2 + 2], 1
-pinsrb m3, [r2 + 3], 0
+pslldq m6, m3, 2
+pinsrb m6, [r1 + 10], 1
+pinsrb m6, [r1 + 11], 0
-pmaddubsw m4, m3, [r5 + 12 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 300], m4
+pmaddubsw m5, m6, [r3 + 12 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 300], m5
; mode 21
-pmaddubsw m4, m1, [r5 + 15 * 16]
-pmulhrsw m4, m0
+pmaddubsw m4, m1, [r3 + 15 * 16]
+pmulhrsw m4, m2
packuswb m4, m4
movd [r0 + 304], m4
-pmaddubsw m4, m2, [r5 + 30 * 16]
-pmulhrsw m4, m0
+pmaddubsw m4, m3, [r3 + 30 * 16]
+pmulhrsw m4, m2
packuswb m4, m4
movd [r0 + 308], m4
-pmaddubsw m4, m2, [r5 + 13 * 16]
-pmulhrsw m4, m0
+pmaddubsw m4, m3, [r3 + 13 * 16]
+pmulhrsw m4, m2
packuswb m4, m4
movd [r0 + 312], m4
-pinsrb m3, [r2 + 4], 0
+pinsrb m6, [r1 + 12], 0
-pmaddubsw m3, [r5 + 28 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 316], m3
+pmaddubsw m6, [r3 + 28 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 316], m6
; mode 22
-pmaddubsw m3, m1, [r5 + 19 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 320], m3
+pmaddubsw m4, m1, [r3 + 19 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 320], m4
-movd [r0 + 324], m5
+movd [r0 + 324], m7
-pmaddubsw m3, m2, [r5 + 25 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 328], m3
+pmaddubsw m4, m3, [r3 + 25 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 328], m4
-pmaddubsw m3, m2, [r5 + 12 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 332], m3
+pmaddubsw m4, m3, [r3 + 12 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 332], m4
; mode 23
-pmaddubsw m3, m1, [r5 + 23 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 336], m3
+pmaddubsw m4, m1, [r3 + 23 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 336], m4
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 340], m3
+pmaddubsw m4, m1, [r3 + 14 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 340], m4
-pmaddubsw m3, m1, [r5 + 5 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 344], m3
+pmaddubsw m4, m1, [r3 + 5 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 344], m4
-pinsrb m2, [r2 + 4], 0
+pinsrb m3, [r1 + 12], 0
-pmaddubsw m2, [r5 + 28 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 348], m2
+pmaddubsw m3, [r3 + 28 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 348], m3
; mode 24
-pmaddubsw m2, m1, [r5 + 27 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 352], m2
+pmaddubsw m3, m1, [r3 + 27 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 352], m3
-pmaddubsw m2, m1, [r5 + 22 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 356], m2
+pmaddubsw m3, m1, [r3 + 22 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 356], m3
-pmaddubsw m2, m1, [r5 + 17 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 360], m2
+pmaddubsw m3, m1, [r3 + 17 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 360], m3
-pmaddubsw m2, m1, [r5 + 12 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 364], m2
+pmaddubsw m3, m1, [r3 + 12 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 364], m3
; mode 25
-pmaddubsw m2, m1, [r5 + 30 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 368], m2
+pmaddubsw m3, m1, [r3 + 30 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 368], m3
-pmaddubsw m2, m1, [r5 + 28 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 372], m2
+pmaddubsw m3, m1, [r3 + 28 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 372], m3
-pmaddubsw m2, m1, [r5 + 26 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 376], m2
+pmaddubsw m3, m1, [r3 + 26 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 376], m3
-pmaddubsw m2, m1, [r5 + 24 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 380], m2
+pmaddubsw m1, [r3 + 24 * 16]
+pmulhrsw m1, m2
+packuswb m1, m1
+movd [r0 + 380], m1
; mode 26
movh m1, [r1 + 1]
-pshufd m2, m1, 0
-movu [r0 + 384], m2
-
-pxor m2, m2
+pshufd m3, m1, 0
+movu [r0 + 384], m3
-pshufb m3, m1, m2
-punpcklbw m3, m2
+pxor m4, m4
+movd m5, [r1 + 9]
+pshufd m5, m5, 0
+punpcklbw m5, m4
-movh m4, [r2]
+pinsrb m6, [r1], 0
+pshufb m6, m4
+punpcklbw m6, m4
-pshufb m5, m4, m2
-punpcklbw m5, m2
+psubw m5, m6
+psraw m5, 1
-psrldq m4, 1
-punpcklbw m4, m2
+pshufb m6, m1, m4
+punpcklbw m6, m4
-psubw m4, m5
-psraw m4, 1
+paddw m5, m6
+packuswb m5, m4
-paddw m3, m4
-
-packuswb m3, m2
-
-pextrb [r0 + 384], m3, 0
-pextrb [r0 + 388], m3, 1
-pextrb [r0 + 392], m3, 2
-pextrb [r0 + 396], m3, 3
+pextrb [r0 + 384], m5, 0
+pextrb [r0 + 388], m5, 1
+pextrb [r0 + 392], m5, 2
+pextrb [r0 + 396], m5, 3
; mode 27
-palignr m2, m1, 1
-punpcklbw m1, m2
+palignr m3, m1, 1
+punpcklbw m1, m3
-pmaddubsw m2, m1, [r5 + 2 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 400], m2
+pmaddubsw m3, m1, [r3 + 2 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 400], m3
-pmaddubsw m2, m1, [r5 + 4 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 404], m2
+pmaddubsw m3, m1, [r3 + 4 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 404], m3
-pmaddubsw m2, m1, [r5 + 6 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 408], m2
+pmaddubsw m3, m1, [r3 + 6 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 408], m3
-pmaddubsw m2, m1, [r5 + 8 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 412], m2
+pmaddubsw m3, m1, [r3 + 8 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 412], m3
; mode 28
-pmaddubsw m2, m1, [r5 + 5 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 416], m2
+pmaddubsw m3, m1, [r3 + 5 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 416], m3
-pmaddubsw m2, m1, [r5 + 10 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 420], m2
+pmaddubsw m3, m1, [r3 + 10 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 420], m3
-pmaddubsw m2, m1, [r5 + 15 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 424], m2
+pmaddubsw m3, m1, [r3 + 15 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 424], m3
-pmaddubsw m2, m1, m7
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 428], m2
+pmaddubsw m3, m1, [r3 + 20 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 428], m3
; mode 29
-pmaddubsw m2, m1, [r5 + 9 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 432], m2
+pmaddubsw m3, m1, [r3 + 9 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 432], m3
-pmaddubsw m2, m1, [r5 + 18 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 436], m2
+pmaddubsw m3, m1, [r3 + 18 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 436], m3
-pmaddubsw m2, m1, [r5 + 27 * 16]
-pmulhrsw m2, m0
-packuswb m2, m2
-movd [r0 + 440], m2
+pmaddubsw m3, m1, [r3 + 27 * 16]
+pmulhrsw m3, m2
+packuswb m3, m3
+movd [r0 + 440], m3
-palignr m2, m1, 2
+palignr m3, m1, 2
-pmaddubsw m3, m2, [r5 + 4 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 444], m3
+pmaddubsw m4, m3, [r3 + 4 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 444], m4
; mode 30
-pmaddubsw m3, m1, [r5 + 13 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 448], m3
-
-pmaddubsw m6, m1, [r5 + 26 * 16]
-pmulhrsw m6, m0
-packuswb m6, m6
-movd [r0 + 452], m6
+pmaddubsw m4, m1, [r3 + 13 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 448], m4
-pmaddubsw m3, m2, [r5 + 7 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 456], m3
+pmaddubsw m7, m1, [r3 + 26 * 16]
+pmulhrsw m7, m2
+packuswb m7, m7
+movd [r0 + 452], m7
-pmaddubsw m5, m2, m7
-pmulhrsw m5, m0
+pmaddubsw m5, m3, [r3 + 7 * 16]
+pmulhrsw m5, m2
packuswb m5, m5
-movd [r0 + 460], m5
+movd [r0 + 456], m5
+
+pmaddubsw m6, m3, [r3 + 20 * 16]
+pmulhrsw m6, m2
+packuswb m6, m6
+movd [r0 + 460], m6
; mode 31
-pmaddubsw m3, m1, [r5 + 17 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 464], m3
+pmaddubsw m4, m1, [r3 + 17 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 464], m4
-pmaddubsw m3, m2, [r5 + 2 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 468], m3
+pmaddubsw m5, m3, [r3 + 2 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 468], m5
-pmaddubsw m3, m2, [r5 + 19 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 472], m3
+pmaddubsw m5, m3, [r3 + 19 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 472], m5
-palignr m3, m2, 2
+palignr m4, m3, 2
-pmaddubsw m4, m3, [r5 + 4 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 476], m4
+pmaddubsw m5, m4, [r3 + 4 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 476], m5
; mode 32
-pmaddubsw m4, m1, [r5 + 21 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 480], m4
+pmaddubsw m5, m1, [r3 + 21 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 480], m5
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 484], m4
+pmaddubsw m5, m3, [r3 + 10 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 484], m5
-pmaddubsw m4, m2, [r5 + 31 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 488], m4
+pmaddubsw m5, m3, [r3 + 31 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 488], m5
-pmaddubsw m4, m3, m7
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 492], m4
+pmaddubsw m5, m4, [r3 + 20 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 492], m5
; mode 33
-movd [r0 + 496], m6
+movd [r0 + 496], m7
-movd [r0 + 500], m5
+movd [r0 + 500], m6
-pmaddubsw m4, m3, [r5 + 14 * 16]
-pmulhrsw m4, m0
-packuswb m4, m4
-movd [r0 + 504], m4
+pmaddubsw m5, m4, [r3 + 14 * 16]
+pmulhrsw m5, m2
+packuswb m5, m5
+movd [r0 + 504], m5
-psrldq m3, 2
+psrldq m4, 2
-pmaddubsw m3, [r5 + 8 * 16]
-pmulhrsw m3, m0
-packuswb m3, m3
-movd [r0 + 508], m3
+pmaddubsw m4, [r3 + 8 * 16]
+pmulhrsw m4, m2
+packuswb m4, m4
+movd [r0 + 508], m4
; mode 34
-movh m0, [r1 + 2]
-movd [r0 + 512], m0
+movh m7, [r1 + 2]
+movd [r0 + 512], m7
-palignr m1, m0, 1
-movd [r0 + 516], m1
+psrldq m7, 1
+movd [r0 + 516], m7
-palignr m1, m0, 2
-movd [r0 + 520], m1
+psrldq m7, 1
+movd [r0 + 520], m7
-palignr m1, m0, 3
-movd [r0 + 524], m1
+psrldq m7, 1
+movd [r0 + 524], m7
RET
-;-----------------------------------------------------------------------------
-; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
-;-----------------------------------------------------------------------------
+;------------------------------------------------------------------------------
+; void all_angs_pred_8x8(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
-
-; mode 2
-
-movu m0, [r4 + 2]
+cglobal all_angs_pred_8x8, 3,4,8
+ ; mode 2
-palignr m1, m0, 1
-punpcklqdq m2, m0, m1
-movu [r0], m2
+ movu m0, [r2 + 18]
+ palignr m1, m0, 1
+ punpcklqdq m2, m0, m1
+ movu [r0], m2
-palignr m1, m0, 2
-palignr m2, m0, 3
-punpcklqdq m1, m2
-movu [r0 + 16], m1
+ palignr m1, m0, 2
+ palignr m2, m0, 3
+ punpcklqdq m1, m2
+ movu [r0 + 16], m1
-palignr m1, m0, 4
-palignr m2, m0, 5
-punpcklqdq m1, m2
-movu [r0 + 32], m1
+ palignr m1, m0, 4
+ palignr m2, m0, 5
+ punpcklqdq m1, m2
+ movu [r0 + 32], m1
-palignr m1, m0, 6
-palignr m2, m0, 7
-punpcklqdq m1, m2
-movu [r0 + 48], m1
+ palignr m1, m0, 6
+ palignr m2, m0, 7
+ punpcklqdq m1, m2
+ movu [r0 + 48], m1
-; mode 3 [row 0, 1]
+ ; mode 3 [row 0, 1]
-mova m7, [pw_1024]
-lea r5, [ang_table]
+ mova m7, [pw_1024]
+ lea r3, [ang_table]
-movu m0, [r2 + 1]
+ movu m0, [r1 + 17]
-palignr m1, m0, 1
-palignr m2, m0, 2
+ palignr m1, m0, 1
+ palignr m2, m0, 2
-punpcklbw m3, m0, m1
-pmaddubsw m4, m3, [r5 + 26 * 16]
-pmulhrsw m4, m7
+ punpcklbw m3, m0, m1
+ pmaddubsw m4, m3, [r3 + 26 * 16]
+ pmulhrsw m4, m7
-punpcklbw m1, m2
-pmaddubsw m5, m1, [r5 + 20 * 16]
-pmulhrsw m5, m7
+ punpcklbw m1, m2
+ pmaddubsw m5, m1, [r3 + 20 * 16]
+ pmulhrsw m5, m7
-packuswb m4, m5
+ packuswb m4, m5
-movu [r0 + 64], m4
+ movu [r0 + 64], m4
-; mode 6 [row 1]
+ ; mode 6 [row 1]
-movh [r0 + 264], m4
+ movh [r0 + 264], m4
-; mode 6 [row 3]
-
-movhps [r0 + 280], m4
+ ; mode 6 [row 3]
-; mode 4 [row 0, 1]
+ movhps [r0 + 280], m4
-pmaddubsw m4, m3, [r5 + 21 * 16]
-pmulhrsw m4, m7
+ ; mode 4 [row 0, 1]
-pmaddubsw m5, m1, [r5 + 10 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 21 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 128], m4
+ pmaddubsw m5, m1, [r3 + 10 * 16]
+ pmulhrsw m5, m7
-; mode 5 [row 0, 1]
+ packuswb m4, m5
+ movu [r0 + 128], m4
-pmaddubsw m4, m3, [r5 + 17 * 16]
-pmulhrsw m4, m7
+ ; mode 5 [row 0, 1]
-pmaddubsw m5, m1, [r5 + 2 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 17 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 192], m4
+ pmaddubsw m5, m1, [r3 + 2 * 16]
+ pmulhrsw m5, m7
-; mode 6 [row 0]
+ packuswb m4, m5
+ movu [r0 + 192], m4
-pmaddubsw m4, m3, [r5 + 13 * 16]
-pmulhrsw m4, m7
+ ; mode 6 [row 0]
-pxor m5, m5
+ pmaddubsw m4, m3, [r3 + 13 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movh [r0 + 256], m4
+ pxor m5, m5
-; mode 7 [row 0, 1]
+ packuswb m4, m5
+ movh [r0 + 256], m4
-pmaddubsw m4, m3, [r5 + 9 * 16]
-pmulhrsw m4, m7
+ ; mode 7 [row 0, 1]
-pmaddubsw m5, m3, [r5 + 18 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 9 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 320], m4
+ pmaddubsw m5, m3, [r3 + 18 * 16]
+ pmulhrsw m5, m7
-; mode 8 [row 0, 1]
+ packuswb m4, m5
+ movu [r0 + 320], m4
-pmaddubsw m4, m3, [r5 + 5 * 16]
-pmulhrsw m4, m7
+ ; mode 8 [row 0, 1]
-pmaddubsw m5, m3, [r5 + 10 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 5 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 384], m4
+ pmaddubsw m5, m3, [r3 + 10 * 16]
+ pmulhrsw m5, m7
-; mode 8 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 384], m4
-pmaddubsw m4, m3, [r5 + 15 * 16]
-pmulhrsw m4, m7
+ ; mode 8 [row 2, 3]
-pmaddubsw m5, m3, [r5 + 20 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 15 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 400], m4
+ pmaddubsw m5, m3, [r3 + 20 * 16]
+ pmulhrsw m5, m7
-; mode 8 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 400], m4
-pmaddubsw m4, m3, [r5 + 25 * 16]
-pmulhrsw m4, m7
+ ; mode 8 [row 4, 5]
-pmaddubsw m5, m3, [r5 + 30 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 25 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 416], m4
+ pmaddubsw m5, m3, [r3 + 30 * 16]
+ pmulhrsw m5, m7
-; mode 8 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 416], m4
-pmaddubsw m4, m1, [r5 + 3 * 16]
-pmulhrsw m4, m7
+ ; mode 8 [row 6, 7]
-pmaddubsw m5, m1, [r5 + 8 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m1, [r3 + 3 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 432], m4
+ pmaddubsw m5, m1, [r3 + 8 * 16]
+ pmulhrsw m5, m7
-; mode 9 [row 0, 1]
+ packuswb m4, m5
+ movu [r0 + 432], m4
-pmaddubsw m4, m3, [r5 + 2 * 16]
-pmulhrsw m4, m7
+ ; mode 9 [row 0, 1]
-pmaddubsw m5, m3, [r5 + 4 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 2 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 448], m4
+ pmaddubsw m5, m3, [r3 + 4 * 16]
+ pmulhrsw m5, m7
-; mode 9 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 448], m4
-pmaddubsw m4, m3, [r5 + 6 * 16]
-pmulhrsw m4, m7
+ ; mode 9 [row 2, 3]
-pmaddubsw m5, m3, [r5 + 8 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 6 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 464], m4
+ pmaddubsw m5, m3, [r3 + 8 * 16]
+ pmulhrsw m5, m7
-; mode 9 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 464], m4
-pmaddubsw m4, m3, [r5 + 10 * 16]
-pmulhrsw m4, m7
+ ; mode 9 [row 4, 5]
-pmaddubsw m5, m3, [r5 + 12 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 10 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 480], m4
+ pmaddubsw m5, m3, [r3 + 12 * 16]
+ pmulhrsw m5, m7
-; mode 9 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 480], m4
-pmaddubsw m4, m3, [r5 + 14 * 16]
-pmulhrsw m4, m7
+ ; mode 9 [row 6, 7]
-pmaddubsw m5, m3, [r5 + 16 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 14 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 496], m4
+ pmaddubsw m5, m3, [r3 + 16 * 16]
+ pmulhrsw m5, m7
-; mode 7 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 496], m4
-pmaddubsw m4, m3, [r5 + 27 * 16]
-pmulhrsw m4, m7
+ ; mode 7 [row 2, 3]
-pmaddubsw m5, m1, [r5 + 4 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 27 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 336], m4
+ pmaddubsw m5, m1, [r3 + 4 * 16]
+ pmulhrsw m5, m7
-; mode 7 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 336], m4
-pmaddubsw m4, m1, [r5 + 13 * 16]
-pmulhrsw m4, m7
+ ; mode 7 [row 4, 5]
-pmaddubsw m5, m1, [r5 + 22 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m1, [r3 + 13 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 352], m4
+ pmaddubsw m5, m1, [r3 + 22 * 16]
+ pmulhrsw m5, m7
-; mode 6 [row 2]
+ packuswb m4, m5
+ movu [r0 + 352], m4
-pmaddubsw m4, m1, [r5 + 7 * 16]
-pmulhrsw m4, m7
+ ; mode 6 [row 2]
-pxor m5, m5
+ pmaddubsw m4, m1, [r3 + 7 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movh [r0 + 272], m4
+ pxor m5, m5
-; mode 3 [row 2, 3]
+ packuswb m4, m5
+ movh [r0 + 272], m4
-palignr m1, m0, 3
-palignr m3, m0, 4
+ ; mode 3 [row 2, 3]
-punpcklbw m2, m1
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, m7
+ palignr m1, m0, 3
+ palignr m3, m0, 4
-punpcklbw m1, m3
-pmaddubsw m6, m1, [r5 + 8 * 16]
-pmulhrsw m6, m7
+ punpcklbw m2, m1
+ pmaddubsw m5, m2, [r3 + 14 * 16]
+ pmulhrsw m5, m7
-packuswb m5, m6
-movu [r0 + 80], m5
+ punpcklbw m1, m3
+ pmaddubsw m6, m1, [r3 + 8 * 16]
+ pmulhrsw m6, m7
-; mode 6 [row 7]
+ packuswb m5, m6
+ movu [r0 + 80], m5
-movhps [r0 + 312], m5
+ ; mode 6 [row 7]
-; mode 6 [row 5]
+ movhps [r0 + 312], m5
-movh [r0 + 296], m5
+ ; mode 6 [row 5]
-; mode 4 [calculate and store row 4, 5]
+ movh [r0 + 296], m5
-pmaddubsw m4, m1, [r5 + 9 * 16]
-pmulhrsw m4, m7
+ ; mode 4 [calculate and store row 4, 5]
-pmaddubsw m5, m1, [r5 + 30 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m1, [r3 + 9 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 160], m4
+ pmaddubsw m5, m1, [r3 + 30 * 16]
+ pmulhrsw m5, m7
-; mode 5 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 160], m4
-pmaddubsw m4, m2, [r5 + 21 * 16]
-pmulhrsw m4, m7
+ ; mode 5 [row 4, 5]
-pmaddubsw m5, m1, [r5 + 6 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 21 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 224], m4
+ pmaddubsw m5, m1, [r3 + 6 * 16]
+ pmulhrsw m5, m7
-; mode 6 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 224], m4
-pmaddubsw m5, m2, [r5 + 1 * 16]
-pmulhrsw m5, m7
+ ; mode 6 [row 4, 5]
-pxor m6, m6
+ pmaddubsw m5, m2, [r3 + 1 * 16]
+ pmulhrsw m5, m7
-packuswb m5, m6
-movh [r0 + 288], m5
+ pxor m6, m6
-; mode 6 [row 6, 7]
+ packuswb m5, m6
+ movh [r0 + 288], m5
-pmaddubsw m5, m2, [r5 + 27 * 16]
-pmulhrsw m5, m7
+ ; mode 6 [row 6, 7]
-pxor m6, m6
+ pmaddubsw m5, m2, [r3 + 27 * 16]
+ pmulhrsw m5, m7
-packuswb m5, m6
-movh [r0 + 304], m5
+ pxor m6, m6
-; mode 5 [calculate row 6]
+ packuswb m5, m6
+ movh [r0 + 304], m5
-pmaddubsw m6, m1, [r5 + 23 * 16]
-pmulhrsw m6, m7
+ ; mode 5 [calculate row 6]
-; mode 3 [row 4, 5]
+ pmaddubsw m6, m1, [r3 + 23 * 16]
+ pmulhrsw m6, m7
-palignr m1, m0, 5
+ ; mode 3 [row 4, 5]
-punpcklbw m3, m1
-pmaddubsw m4, m3, [r5 + 2 * 16]
-pmulhrsw m4, m7
+ palignr m1, m0, 5
-pmaddubsw m5, m3, [r5 + 28 * 16]
-pmulhrsw m5, m7
+ punpcklbw m3, m1
+ pmaddubsw m4, m3, [r3 + 2 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 96], m4
+ pmaddubsw m5, m3, [r3 + 28 * 16]
+ pmulhrsw m5, m7
-; mode 4 [calculate row 7]
+ packuswb m4, m5
+ movu [r0 + 96], m4
-pmaddubsw m5, m3, [r5 + 19 * 16]
-pmulhrsw m5, m7
+ ; mode 4 [calculate row 7]
-; mode 5 [calculate row 6]
+ pmaddubsw m5, m3, [r3 + 19 * 16]
+ pmulhrsw m5, m7
-pmaddubsw m4, m3, [r5 + 8 * 16]
-pmulhrsw m4, m7
+ ; mode 5 [calculate row 6]
-packuswb m6, m4
-movu [r0 + 240], m6
+ pmaddubsw m4, m3, [r3 + 8 * 16]
+ pmulhrsw m4, m7
-; mode 3 [row 6, 7]
+ packuswb m6, m4
+ movu [r0 + 240], m6
-palignr m2, m0, 6
-palignr m3, m0, 7
+ ; mode 3 [row 6, 7]
-punpcklbw m1, m2
-pmaddubsw m4, m1, [r5 + 22 * 16]
-pmulhrsw m4, m7
+ palignr m2, m0, 6
+ palignr m3, m0, 7
-punpcklbw m2, m3
-pmaddubsw m2, [r5 + 16 * 16]
-pmulhrsw m2, m7
+ punpcklbw m1, m2
+ pmaddubsw m4, m1, [r3 + 22 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m2
-movu [r0 + 112], m4
+ punpcklbw m2, m3
+ pmaddubsw m2, [r3 + 16 * 16]
+ pmulhrsw m2, m7
-; mode 4 [calculate row 7]
+ packuswb m4, m2
+ movu [r0 + 112], m4
-pmaddubsw m2, m1, [r5 + 8 * 16]
-pmulhrsw m2, m7
+ ; mode 4 [calculate row 7]
-; mode 4 [store row 6 and 7]
+ pmaddubsw m2, m1, [r3 + 8 * 16]
+ pmulhrsw m2, m7
-packuswb m5, m2
-movu [r0 + 176], m5
+ ; mode 4 [store row 6 and 7]
-; mode 4 [row 2, 3]
+ packuswb m5, m2
+ movu [r0 + 176], m5
-palignr m1, m0, 1
-palignr m2, m0, 2
-palignr m3, m0, 3
+ ; mode 4 [row 2, 3]
-punpcklbw m1, m2
-pmaddubsw m4, m1, [r5 + 31 * 16]
-pmulhrsw m4, m7
+ palignr m1, m0, 1
+ palignr m2, m0, 2
+ palignr m3, m0, 3
-punpcklbw m2, m3
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, m7
+ punpcklbw m1, m2
+ pmaddubsw m4, m1, [r3 + 31 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 144], m4
+ punpcklbw m2, m3
+ pmaddubsw m5, m2, [r3 + 20 * 16]
+ pmulhrsw m5, m7
-; mode 5 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 144], m4
-pmaddubsw m4, m1, [r5 + 19 * 16]
-pmulhrsw m4, m7
+ ; mode 5 [row 2, 3]
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m1, [r3 + 19 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 208], m4
+ pmaddubsw m5, m2, [r3 + 4 * 16]
+ pmulhrsw m5, m7
-; mode 7 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 208], m4
-pmaddubsw m4, m1, [r5 + 31 * 16]
-pmulhrsw m4, m7
+ ; mode 7 [row 6, 7]
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m1, [r3 + 31 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 368], m4
+ pmaddubsw m5, m2, [r3 + 8 * 16]
+ pmulhrsw m5, m7
-; mode 10
+ packuswb m4, m5
+ movu [r0 + 368], m4
-pshufb m1, m0, [tab_Si]
-movu [r0 + 512], m1
-movu [r0 + 528], m1
-movu [r0 + 544], m1
-movu [r0 + 560], m1
+ ; mode 10
-pxor m0, m0
+ pshufb m1, m0, [tab_Si]
+ movu [r0 + 512], m1
+ movu [r0 + 528], m1
+ movu [r0 + 544], m1
+ movu [r0 + 560], m1
-pshufb m1, m1, m0
-punpcklbw m1, m0
+ pxor m0, m0
-movu m2, [r1]
+ pshufb m1, m1, m0
+ punpcklbw m1, m0
-pshufb m3, m2, m0
-punpcklbw m3, m0
+ movu m2, [r1]
-psrldq m4, m2, 1
-punpcklbw m4, m0
+ pshufb m3, m2, m0
+ punpcklbw m3, m0
-movu m2, [r1 + 9]
-punpcklbw m2, m0
+ psrldq m4, m2, 1
+ punpcklbw m4, m0
-psubw m4, m3
-psubw m2, m3
+ movu m2, [r1 + 9]
+ punpcklbw m2, m0
-psraw m4, 1
-psraw m2, 1
+ psubw m4, m3
+ psubw m2, m3
-paddw m4, m1
-paddw m2, m1
+ psraw m4, 1
+ psraw m2, 1
-packuswb m4, m2
+ paddw m4, m1
+ paddw m2, m1
-pextrb [r0 + 512], m4, 0
-pextrb [r0 + 520], m4, 1
-pextrb [r0 + 528], m4, 2
-pextrb [r0 + 536], m4, 3
-pextrb [r0 + 544], m4, 4
-pextrb [r0 + 552], m4, 5
-pextrb [r0 + 560], m4, 6
-pextrb [r0 + 568], m4, 7
+ packuswb m4, m2
-; mode 11 [row 0, 1]
+ pextrb [r0 + 512], m4, 0
+ pextrb [r0 + 520], m4, 1
+ pextrb [r0 + 528], m4, 2
+ pextrb [r0 + 536], m4, 3
+ pextrb [r0 + 544], m4, 4
+ pextrb [r0 + 552], m4, 5
+ pextrb [r0 + 560], m4, 6
+ pextrb [r0 + 568], m4, 7
-movu m0, [r2]
-palignr m1, m0, 1
-punpcklbw m2, m0, m1
+ ; mode 11 [row 0, 1]
-pmaddubsw m3, m2, [r5 + 30 * 16]
-pmulhrsw m3, m7
+ movu m0, [r1 + 16]
+ pinsrb m0, [r1], 0
+ palignr m1, m0, 1
+ punpcklbw m2, m0, m1
-pmaddubsw m4, m2, [r5 + 28 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 30 * 16]
+ pmulhrsw m3, m7
-packuswb m3, m4
-movu [r0 + 576], m3
+ pmaddubsw m4, m2, [r3 + 28 * 16]
+ pmulhrsw m4, m7
-; mode 11 [row 2, 3]
+ packuswb m3, m4
+ movu [r0 + 576], m3
-pmaddubsw m3, m2, [r5 + 26 * 16]
-pmulhrsw m3, m7
+ ; mode 11 [row 2, 3]
-pmaddubsw m4, m2, [r5 + 24 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 26 * 16]
+ pmulhrsw m3, m7
-packuswb m3, m4
-movu [r0 + 592], m3
+ pmaddubsw m4, m2, [r3 + 24 * 16]
+ pmulhrsw m4, m7
-; mode 11 [row 4, 5]
+ packuswb m3, m4
+ movu [r0 + 592], m3
-pmaddubsw m3, m2, [r5 + 22 * 16]
-pmulhrsw m3, m7
+ ; mode 11 [row 4, 5]
-pmaddubsw m4, m2, [r5 + 20 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 22 * 16]
+ pmulhrsw m3, m7
-packuswb m5, m3, m4
-movu [r0 + 608], m5
+ pmaddubsw m4, m2, [r3 + 20 * 16]
+ pmulhrsw m4, m7
-; mode 12 [row 0, 1]
+ packuswb m5, m3, m4
+ movu [r0 + 608], m5
-pmaddubsw m4, m2, [r5 + 27 * 16]
-pmulhrsw m4, m7
+ ; mode 12 [row 0, 1]
-packuswb m4, m3
-movu [r0 + 640], m4
+ pmaddubsw m4, m2, [r3 + 27 * 16]
+ pmulhrsw m4, m7
-; mode 11 [row 6, 7]
+ packuswb m4, m3
+ movu [r0 + 640], m4
-pmaddubsw m3, m2, [r5 + 18 * 16]
-pmulhrsw m3, m7
+ ; mode 11 [row 6, 7]
-pmaddubsw m4, m2, [r5 + 16 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 18 * 16]
+ pmulhrsw m3, m7
-packuswb m3, m4
-movu [r0 + 624], m3
+ pmaddubsw m4, m2, [r3 + 16 * 16]
+ pmulhrsw m4, m7
-; mode 12 [row 2, 3]
+ packuswb m3, m4
+ movu [r0 + 624], m3
-pmaddubsw m3, m2, [r5 + 17 * 16]
-pmulhrsw m3, m7
+ ; mode 12 [row 2, 3]
-pmaddubsw m4, m2, [r5 + 12 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 17 * 16]
+ pmulhrsw m3, m7
-packuswb m3, m4
-movu [r0 + 656], m3
+ pmaddubsw m4, m2, [r3 + 12 * 16]
+ pmulhrsw m4, m7
-; mode 12 [row 4, 5]
+ packuswb m3, m4
+ movu [r0 + 656], m3
-pmaddubsw m3, m2, [r5 + 7 * 16]
-pmulhrsw m3, m7
+ ; mode 12 [row 4, 5]
-pmaddubsw m4, m2, [r5 + 2 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 7 * 16]
+ pmulhrsw m3, m7
-packuswb m3, m4
-movu [r0 + 672], m3
+ pmaddubsw m4, m2, [r3 + 2 * 16]
+ pmulhrsw m4, m7
-; mode 12 [row 6, 7]
+ packuswb m3, m4
+ movu [r0 + 672], m3
-pslldq m3, m2, 2
-pinsrb m3, [r1 + 0], 1
-pinsrb m3, [r1 + 6], 0
+ ; mode 12 [row 6, 7]
-pmaddubsw m4, m3, [r5 + 29 * 16]
-pmulhrsw m4, m7
+ pslldq m3, m2, 2
+ pinsrb m3, [r1 + 0], 1
+ pinsrb m3, [r1 + 6], 0
-pmaddubsw m5, m3, [r5 + 24 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 29 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 688], m4
+ pmaddubsw m5, m3, [r3 + 24 * 16]
+ pmulhrsw m5, m7
-; mode 13 [row 0, 1]
+ packuswb m4, m5
+ movu [r0 + 688], m4
-pmaddubsw m4, m2, [r5 + 23 * 16]
-pmulhrsw m4, m7
+ ; mode 13 [row 0, 1]
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 23 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 704], m4
+ pmaddubsw m5, m2, [r3 + 14 * 16]
+ pmulhrsw m5, m7
-; mode 13 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 704], m4
-pmaddubsw m4, m2, [r5 + 5 * 16]
-pmulhrsw m4, m7
+ ; mode 13 [row 2, 3]
-pinsrb m3, [r1 + 4], 0
-pmaddubsw m5, m3, [r5 + 28 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 5 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 720], m4
+ pinsrb m3, [r1 + 4], 0
+ pmaddubsw m5, m3, [r3 + 28 * 16]
+ pmulhrsw m5, m7
-; mode 13 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 720], m4
-pmaddubsw m4, m3, [r5 + 19 * 16]
-pmulhrsw m4, m7
+ ; mode 13 [row 4, 5]
-pmaddubsw m5, m3, [r5 + 10 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 19 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 736], m4
+ pmaddubsw m5, m3, [r3 + 10 * 16]
+ pmulhrsw m5, m7
-; mode 13 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 736], m4
-pmaddubsw m4, m3, [r5 + 1 * 16]
-pmulhrsw m4, m7
+ ; mode 13 [row 6, 7]
-pslldq m5, m3, 2
-pinsrb m5, [r1 + 4], 1
-pinsrb m5, [r1 + 7], 0
+ pmaddubsw m4, m3, [r3 + 1 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m5, [r5 + 24 * 16]
-pmulhrsw m5, m7
+ pslldq m5, m3, 2
+ pinsrb m5, [r1 + 4], 1
+ pinsrb m5, [r1 + 7], 0
-packuswb m4, m5
-movu [r0 + 752], m4
+ pmaddubsw m5, [r3 + 24 * 16]
+ pmulhrsw m5, m7
-; mode 14 [row 0, 1]
+ packuswb m4, m5
+ movu [r0 + 752], m4
-pmaddubsw m4, m2, [r5 + 19 * 16]
-pmulhrsw m4, m7
+ ; mode 14 [row 0, 1]
-pmaddubsw m5, m2, [r5 + 6 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 19 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 768], m4
+ pmaddubsw m5, m2, [r3 + 6 * 16]
+ pmulhrsw m5, m7
-; mode 14 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 768], m4
-pinsrb m3, [r1 + 2], 0
+ ; mode 14 [row 2, 3]
-pmaddubsw m4, m3, [r5 + 25 * 16]
-pmulhrsw m4, m7
+ pinsrb m3, [r1 + 2], 0
-pmaddubsw m5, m3, [r5 + 12 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 25 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 784], m4
+ pmaddubsw m5, m3, [r3 + 12 * 16]
+ pmulhrsw m5, m7
-; mode 14 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 784], m4
-pslldq m1, m3, 2
-pinsrb m1, [r1 + 2], 1
-pinsrb m1, [r1 + 5], 0
+ ; mode 14 [row 4, 5]
-pmaddubsw m4, m1, [r5 + 31 * 16]
-pmulhrsw m4, m7
+ pslldq m1, m3, 2
+ pinsrb m1, [r1 + 2], 1
+ pinsrb m1, [r1 + 5], 0
-pmaddubsw m5, m1, [r5 + 18 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m1, [r3 + 31 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 800], m4
+ pmaddubsw m5, m1, [r3 + 18 * 16]
+ pmulhrsw m5, m7
-; mode 14 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 800], m4
-pmaddubsw m4, m1, [r5 + 5 * 16]
-pmulhrsw m4, m7
+ ; mode 14 [row 6, 7]
-pslldq m1, 2
-pinsrb m1, [r1 + 5], 1
-pinsrb m1, [r1 + 7], 0
+ pmaddubsw m4, m1, [r3 + 5 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m5, m1, [r5 + 24 * 16]
-pmulhrsw m5, m7
+ pslldq m1, 2
+ pinsrb m1, [r1 + 5], 1
+ pinsrb m1, [r1 + 7], 0
-packuswb m4, m5
-movu [r0 + 816], m4
+ pmaddubsw m5, m1, [r3 + 24 * 16]
+ pmulhrsw m5, m7
-; mode 15 [row 0, 1]
+ packuswb m4, m5
+ movu [r0 + 816], m4
-pmaddubsw m4, m2, [r5 + 15 * 16]
-pmulhrsw m4, m7
+ ; mode 15 [row 0, 1]
-pmaddubsw m5, m3, [r5 + 30 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 15 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 832], m4
+ pmaddubsw m5, m3, [r3 + 30 * 16]
+ pmulhrsw m5, m7
-; mode 15 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 832], m4
-pmaddubsw m4, m3, [r5 + 13 * 16]
-pmulhrsw m4, m7
+ ; mode 15 [row 2, 3]
-pslldq m1, m3, 2
-pinsrb m1, [r1 + 2], 1
-pinsrb m1, [r1 + 4], 0
+ pmaddubsw m4, m3, [r3 + 13 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m5, m1, [r5 + 28 * 16]
-pmulhrsw m5, m7
+ pslldq m1, m3, 2
+ pinsrb m1, [r1 + 2], 1
+ pinsrb m1, [r1 + 4], 0
-packuswb m4, m5
-movu [r0 + 848], m4
+ pmaddubsw m5, m1, [r3 + 28 * 16]
+ pmulhrsw m5, m7
-; mode 15 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 848], m4
-pmaddubsw m4, m1, [r5 + 11 * 16]
-pmulhrsw m4, m7
+ ; mode 15 [row 4, 5]
-pslldq m1, 2
-pinsrb m1, [r1 + 4], 1
-pinsrb m1, [r1 + 6], 0
+ pmaddubsw m4, m1, [r3 + 11 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m5, m1, [r5 + 26 * 16]
-pmulhrsw m5, m7
+ pslldq m1, 2
+ pinsrb m1, [r1 + 4], 1
+ pinsrb m1, [r1 + 6], 0
-packuswb m4, m5
-movu [r0 + 864], m4
+ pmaddubsw m5, m1, [r3 + 26 * 16]
+ pmulhrsw m5, m7
-; mode 15 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 864], m4
-pmaddubsw m4, m1, [r5 + 9 * 16]
-pmulhrsw m4, m7
+ ; mode 15 [row 6, 7]
-pslldq m1, 2
-pinsrb m1, [r1 + 6], 1
-pinsrb m1, [r1 + 8], 0
+ pmaddubsw m4, m1, [r3 + 9 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m1, [r5 + 24 * 16]
-pmulhrsw m1, m7
+ pslldq m1, 2
+ pinsrb m1, [r1 + 6], 1
+ pinsrb m1, [r1 + 8], 0
-packuswb m4, m1
-movu [r0 + 880], m4
+ pmaddubsw m1, [r3 + 24 * 16]
+ pmulhrsw m1, m7
-; mode 16 [row 0, 1]
+ packuswb m4, m1
+ movu [r0 + 880], m4
-pmaddubsw m4, m2, [r5 + 11 * 16]
-pmulhrsw m4, m7
+ ; mode 16 [row 0, 1]
-pmaddubsw m5, m3, [r5 + 22 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 11 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 896], m4
+ pmaddubsw m5, m3, [r3 + 22 * 16]
+ pmulhrsw m5, m7
-; mode 16 [row 2, 3]
+ packuswb m4, m5
+ movu [r0 + 896], m4
-pmaddubsw m4, m3, [r5 + 1 * 16]
-pmulhrsw m4, m7
+ ; mode 16 [row 2, 3]
-pslldq m3, 2
-pinsrb m3, [r1 + 2], 1
-pinsrb m3, [r1 + 3], 0
+ pmaddubsw m4, m3, [r3 + 1 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m5, m3, [r5 + 12 * 16]
-pmulhrsw m5, m7
+ pslldq m3, 2
+ pinsrb m3, [r1 + 2], 1
+ pinsrb m3, [r1 + 3], 0
-packuswb m4, m5
-movu [r0 + 912], m4
+ pmaddubsw m5, m3, [r3 + 12 * 16]
+ pmulhrsw m5, m7
-; mode 16 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 912], m4
-pslldq m3, 2
-pinsrb m3, [r1 + 3], 1
-pinsrb m3, [r1 + 5], 0
+ ; mode 16 [row 4, 5]
-pmaddubsw m4, m3, [r5 + 23 * 16]
-pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r1 + 3], 1
+ pinsrb m3, [r1 + 5], 0
-pmaddubsw m5, m3, [r5 + 2 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m3, [r3 + 23 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 928], m4
+ pmaddubsw m5, m3, [r3 + 2 * 16]
+ pmulhrsw m5, m7
-; mode 16 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 928], m4
-pslldq m3, 2
-pinsrb m3, [r1 + 5], 1
-pinsrb m3, [r1 + 6], 0
+ ; mode 16 [row 6, 7]
-pmaddubsw m4, m3, [r5 + 13 * 16]
-pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r1 + 5], 1
+ pinsrb m3, [r1 + 6], 0
-pslldq m3, 2
-pinsrb m3, [r1 + 6], 1
-pinsrb m3, [r1 + 8], 0
+ pmaddubsw m4, m3, [r3 + 13 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m3, [r5 + 24 * 16]
-pmulhrsw m3, m7
+ pslldq m3, 2
+ pinsrb m3, [r1 + 6], 1
+ pinsrb m3, [r1 + 8], 0
-packuswb m4, m3
-movu [r0 + 944], m4
+ pmaddubsw m3, [r3 + 24 * 16]
+ pmulhrsw m3, m7
-; mode 17 [row 0, 1]
+ packuswb m4, m3
+ movu [r0 + 944], m4
-pmaddubsw m4, m2, [r5 + 6 * 16]
-pmulhrsw m4, m7
+ ; mode 17 [row 0, 1]
-pslldq m2, 2
-pinsrb m2, [r1 + 0], 1
-pinsrb m2, [r1 + 1], 0
+ pmaddubsw m4, m2, [r3 + 6 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m3, m2, [r5 + 12 * 16]
-pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 0], 1
+ pinsrb m2, [r1 + 1], 0
-packuswb m4, m3
-movu [r0 + 960], m4
+ pmaddubsw m3, m2, [r3 + 12 * 16]
+ pmulhrsw m3, m7
-; mode 17 [row 2, 3]
+ packuswb m4, m3
+ movu [r0 + 960], m4
-pslldq m2, 2
-pinsrb m2, [r1 + 1], 1
-pinsrb m2, [r1 + 2], 0
+ ; mode 17 [row 2, 3]
-pmaddubsw m4, m2, [r5 + 18 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 1], 1
+ pinsrb m2, [r1 + 2], 0
-pslldq m2, 2
-pinsrb m2, [r1 + 2], 1
-pinsrb m2, [r1 + 4], 0
+ pmaddubsw m4, m2, [r3 + 18 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m3, m2, [r5 + 24 * 16]
-pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 2], 1
+ pinsrb m2, [r1 + 4], 0
-packuswb m4, m3
-movu [r0 + 976], m4
+ pmaddubsw m3, m2, [r3 + 24 * 16]
+ pmulhrsw m3, m7
-; mode 17 [row 4, 5]
+ packuswb m4, m3
+ movu [r0 + 976], m4
-pslldq m2, 2
-pinsrb m2, [r1 + 4], 1
-pinsrb m2, [r1 + 5], 0
+ ; mode 17 [row 4, 5]
-pmaddubsw m4, m2, [r5 + 30 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 4], 1
+ pinsrb m2, [r1 + 5], 0
-pmaddubsw m3, m2, [r5 + 4 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m4, m2, [r3 + 30 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m3
-movu [r0 + 992], m4
+ pmaddubsw m3, m2, [r3 + 4 * 16]
+ pmulhrsw m3, m7
-; mode 17 [row 6, 7]
+ packuswb m4, m3
+ movu [r0 + 992], m4
-pslldq m2, 2
-pinsrb m2, [r1 + 5], 1
-pinsrb m2, [r1 + 6], 0
+ ; mode 17 [row 6, 7]
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 5], 1
+ pinsrb m2, [r1 + 6], 0
-pslldq m2, 2
-pinsrb m2, [r1 + 6], 1
-pinsrb m2, [r1 + 7], 0
+ pmaddubsw m4, m2, [r3 + 10 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m3, m2, [r5 + 16 * 16]
-pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 6], 1
+ pinsrb m2, [r1 + 7], 0
-packuswb m4, m3
-movu [r0 + 1008], m4
+ pmaddubsw m3, m2, [r3 + 16 * 16]
+ pmulhrsw m3, m7
-; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
+ packuswb m4, m3
+ movu [r0 + 1008], m4
-movh m1, [r3]
-movh [r0 + 1024], m1
+ ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
-pslldq m2, m1, 1
-pinsrb m2, [r4 + 1], 0
-movh [r0 + 1032], m2
+ movh m1, [r2]
-pslldq m2, 1
-pinsrb m2, [r4 + 2], 0
-movh [r0 + 1040], m2
+ pslldq m2, m1, 1
+ pinsrb m2, [r2 + 1 + 16], 0
+ punpcklqdq m1, m2
+ movu [r0 + 1024], m1
-pslldq m2, 1
-pinsrb m2, [r4 + 3], 0
-movh [r0 + 1048], m2
+ pslldq m2, 1
+ pinsrb m2, [r2 + 2 + 16], 0
-pslldq m2, 1
-pinsrb m2, [r4 + 4], 0
-movh [r0 + 1056], m2
+ pslldq m0, m2, 1
+ pinsrb m0, [r2 + 3 + 16], 0
+ punpcklqdq m2, m0
+ movu [r0 + 1040], m2
-pslldq m2, 1
-pinsrb m2, [r4 + 5], 0
-movh [r0 + 1064], m2
+ pslldq m0, 1
+ pinsrb m0, [r2 + 4 + 16], 0
-pslldq m2, 1
-pinsrb m2, [r4 + 6], 0
-movh [r0 + 1072], m2
+ pslldq m2, m0, 1
+ pinsrb m2, [r2 + 5 + 16], 0
+ punpcklqdq m0, m2
+ movu [r0 + 1056], m0
-pslldq m2, 1
-pinsrb m2, [r4 + 7], 0
-movh [r0 + 1080], m2
+ pslldq m2, 1
+ pinsrb m2, [r2 + 6 + 16], 0
-; mode 19 [row 0, 1]
+ pslldq m0, m2, 1
+ pinsrb m0, [r2 + 7 + 16], 0
+ punpcklqdq m2, m0
+ movu [r0 + 1072], m2
-movu m0, [r1]
-palignr m1, m0, 1
-punpcklbw m0, m1
+ ; mode 19 [row 0, 1]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m7
+ movu m0, [r1]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
-pslldq m2, m0, 2
-pinsrb m2, [r2 + 0], 1
-pinsrb m2, [r2 + 1], 0
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m7
-pmaddubsw m3, m2, [r5 + 12 * 16]
-pmulhrsw m3, m7
+ pslldq m2, m0, 2
+ pinsrb m2, [r1], 1
+ pinsrb m2, [r1 + 1 + 16], 0
-packuswb m1, m3
-movu [r0 + 1088], m1
+ pmaddubsw m3, m2, [r3 + 12 * 16]
+ pmulhrsw m3, m7
-; mode 19 [row 2, 3]
+ packuswb m1, m3
+ movu [r0 + 1088], m1
-pslldq m2, 2
-pinsrb m2, [r2 + 1], 1
-pinsrb m2, [r2 + 2], 0
+ ; mode 19 [row 2, 3]
-pmaddubsw m4, m2, [r5 + 18 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 1 + 16], 1
+ pinsrb m2, [r1 + 2 + 16], 0
-pslldq m2, 2
-pinsrb m2, [r2 + 2], 1
-pinsrb m2, [r2 + 4], 0
+ pmaddubsw m4, m2, [r3 + 18 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 2 + 16], 1
+ pinsrb m2, [r1 + 4 + 16], 0
-packuswb m4, m5
-movu [r0 + 1104], m4
+ pmaddubsw m5, m2, [r3 + 24 * 16]
+ pmulhrsw m5, m7
-; mode 19 [row 4, 5]
+ packuswb m4, m5
+ movu [r0 + 1104], m4
-pslldq m2, 2
-pinsrb m2, [r2 + 4], 1
-pinsrb m2, [r2 + 5], 0
+ ; mode 19 [row 4, 5]
-pmaddubsw m4, m2, [r5 + 30 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 4 + 16], 1
+ pinsrb m2, [r1 + 5 + 16], 0
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m4, m2, [r3 + 30 * 16]
+ pmulhrsw m4, m7
-packuswb m4, m5
-movu [r0 + 1120], m4
+ pmaddubsw m5, m2, [r3 + 4 * 16]
+ pmulhrsw m5, m7
-; mode 19 [row 6, 7]
+ packuswb m4, m5
+ movu [r0 + 1120], m4
-pslldq m2, 2
-pinsrb m2, [r2 + 5], 1
-pinsrb m2, [r2 + 6], 0
+ ; mode 19 [row 6, 7]
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 5 + 16], 1
+ pinsrb m2, [r1 + 6 + 16], 0
-pslldq m2, 2
-pinsrb m2, [r2 + 6], 1
-pinsrb m2, [r2 + 7], 0
+ pmaddubsw m4, m2, [r3 + 10 * 16]
+ pmulhrsw m4, m7
-pmaddubsw m2, [r5 + 16 * 16]
-pmulhrsw m2, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 6 + 16], 1
+ pinsrb m2, [r1 + 7 + 16], 0
-packuswb m4, m2
-movu [r0 + 1136], m4
+ pmaddubsw m2, [r3 + 16 * 16]
+ pmulhrsw m2, m7
-; mode 20 [row 0, 1]
+ packuswb m4, m2
+ movu [r0 + 1136], m4
-pmaddubsw m3, m0, [r5 + 11 * 16]
-pmulhrsw m3, m7
+ ; mode 20 [row 0, 1]
-pslldq m1, m0, 2
-pinsrb m1, [r2 + 0], 1
-pinsrb m1, [r2 + 2], 0
+ pmaddubsw m3, m0, [r3 + 11 * 16]
+ pmulhrsw m3, m7
-pmaddubsw m4, m1, [r5 + 22 * 16]
-pmulhrsw m4, m7
+ pslldq m1, m0, 2
+ pinsrb m1, [r1 + 0], 1
+ pinsrb m1, [r1 + 2 + 16], 0
-packuswb m3, m4
-movu [r0 + 1152], m3
+ pmaddubsw m4, m1, [r3 + 22 * 16]
+ pmulhrsw m4, m7
-; mode 20 [row 2, 3]
+ packuswb m3, m4
+ movu [r0 + 1152], m3
-pmaddubsw m3, m1, [r5 + 1 * 16]
-pmulhrsw m3, m7
+ ; mode 20 [row 2, 3]
-pslldq m2, m1, 2
-pinsrb m2, [r2 + 2], 1
-pinsrb m2, [r2 + 3], 0
+ pmaddubsw m3, m1, [r3 + 1 * 16]
+ pmulhrsw m3, m7
-pmaddubsw m4, m2, [r5 + 12 * 16]
-pmulhrsw m4, m7
+ pslldq m2, m1, 2
+ pinsrb m2, [r1 + 2 + 16], 1
+ pinsrb m2, [r1 + 3 + 16], 0
-packuswb m3, m4
-movu [r0 + 1168], m3
+ pmaddubsw m4, m2, [r3 + 12 * 16]
+ pmulhrsw m4, m7
-; mode 20 [row 4, 5]
+ packuswb m3, m4
+ movu [r0 + 1168], m3
-pslldq m2, 2
-pinsrb m2, [r2 + 3], 1
-pinsrb m2, [r2 + 5], 0
+ ; mode 20 [row 4, 5]
-pmaddubsw m3, m2, [r5 + 23 * 16]
-pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 3 + 16], 1
+ pinsrb m2, [r1 + 5 + 16], 0
-pmaddubsw m4, m2, [r5 + 2 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m3, m2, [r3 + 23 * 16]
+ pmulhrsw m3, m7
-packuswb m3, m4
-movu [r0 + 1184], m3
+ pmaddubsw m4, m2, [r3 + 2 * 16]
+ pmulhrsw m4, m7
-; mode 20 [row 6, 7]
+ packuswb m3, m4
+ movu [r0 + 1184], m3
-pslldq m2, 2
-pinsrb m2, [r2 + 5], 1
-pinsrb m2, [r2 + 6], 0
+ ; mode 20 [row 6, 7]
-pmaddubsw m3, m2, [r5 + 13 * 16]
-pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 5 + 16], 1
+ pinsrb m2, [r1 + 6 + 16], 0
-pslldq m2, 2
-pinsrb m2, [r2 + 6], 1
-pinsrb m2, [r2 + 8], 0
+ pmaddubsw m3, m2, [r3 + 13 * 16]
+ pmulhrsw m3, m7
-pmaddubsw m4, m2, [r5 + 24 * 16]
-pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r1 + 6 + 16], 1
+ pinsrb m2, [r1 + 8 + 16], 0
-packuswb m3, m4
-movu [r0 + 1200], m3
+ pmaddubsw m4, m2, [r3 + 24 * 16]
+ pmulhrsw m4, m7
-; mode 21 [row 0, 1]
+ packuswb m3, m4
+ movu [r0 + 1200], m3
-pmaddubsw m2, m0, [r5 + 15 * 16]
-pmulhrsw m2, m7
+ ; mode 21 [row 0, 1]
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m0, [r3 + 15 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1216], m2
+ pmaddubsw m3, m1, [r3 + 30 * 16]
+ pmulhrsw m3, m7
-; mode 21 [row 2, 3]
+ packuswb m2, m3
+ movu [r0 + 1216], m2
-pmaddubsw m2, m1, [r5 + 13 * 16]
-pmulhrsw m2, m7
+ ; mode 21 [row 2, 3]
-pslldq m3, m1, 2
-pinsrb m3, [r2 + 2], 1
-pinsrb m3, [r2 + 4], 0
+ pmaddubsw m2, m1, [r3 + 13 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m4, m3, [r5 + 28 * 16]
-pmulhrsw m4, m7
+ pslldq m3, m1, 2
+ pinsrb m3, [r1 + 2 + 16], 1
+ pinsrb m3, [r1 + 4 + 16], 0
-packuswb m2, m4
-movu [r0 + 1232], m2
+ pmaddubsw m4, m3, [r3 + 28 * 16]
+ pmulhrsw m4, m7
-; mode 21 [row 4, 5]
+ packuswb m2, m4
+ movu [r0 + 1232], m2
-pmaddubsw m2, m3, [r5 + 11 * 16]
-pmulhrsw m2, m7
+ ; mode 21 [row 4, 5]
-pslldq m3, 2
-pinsrb m3, [r2 + 4], 1
-pinsrb m3, [r2 + 6], 0
+ pmaddubsw m2, m3, [r3 + 11 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m4, m3, [r5 + 26 * 16]
-pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r1 + 4 + 16], 1
+ pinsrb m3, [r1 + 6 + 16], 0
-packuswb m2, m4
-movu [r0 + 1248], m2
+ pmaddubsw m4, m3, [r3 + 26 * 16]
+ pmulhrsw m4, m7
-; mode 21 [row 6, 7]
+ packuswb m2, m4
+ movu [r0 + 1248], m2
-pmaddubsw m2, m3, [r5 + 9 * 16]
-pmulhrsw m2, m7
+ ; mode 21 [row 6, 7]
-pslldq m3, 2
-pinsrb m3, [r2 + 6], 1
-pinsrb m3, [r2 + 8], 0
+ pmaddubsw m2, m3, [r3 + 9 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m4, m3, [r5 + 24 * 16]
-pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r1 + 6 + 16], 1
+ pinsrb m3, [r1 + 8 + 16], 0
-packuswb m2, m4
-movu [r0 + 1264], m2
+ pmaddubsw m4, m3, [r3 + 24 * 16]
+ pmulhrsw m4, m7
-; mode 22 [row 0, 1]
+ packuswb m2, m4
+ movu [r0 + 1264], m2
-pmaddubsw m2, m0, [r5 + 19 * 16]
-pmulhrsw m2, m7
+ ; mode 22 [row 0, 1]
-pmaddubsw m4, m0, [r5 + 6 * 16]
-pmulhrsw m4, m7
+ pmaddubsw m2, m0, [r3 + 19 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m4
-movu [r0 + 1280], m2
+ pmaddubsw m4, m0, [r3 + 6 * 16]
+ pmulhrsw m4, m7
-; mode 22 [row 2, 3]
+ packuswb m2, m4
+ movu [r0 + 1280], m2
-pmaddubsw m2, m1, [r5 + 25 * 16]
-pmulhrsw m2, m7
+ ; mode 22 [row 2, 3]
-pmaddubsw m3, m1, [r5 + 12 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m1, [r3 + 25 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1296], m2
+ pmaddubsw m3, m1, [r3 + 12 * 16]
+ pmulhrsw m3, m7
-; mode 22 [row 4, 5]
+ packuswb m2, m3
+ movu [r0 + 1296], m2
-pslldq m1, 2
-pinsrb m1, [r2 + 5], 0
-pinsrb m1, [r2 + 2], 1
+ ; mode 22 [row 4, 5]
-pmaddubsw m2, m1, [r5 + 31 * 16]
-pmulhrsw m2, m7
+ pslldq m1, 2
+ pinsrb m1, [r1 + 5 + 16], 0
+ pinsrb m1, [r1 + 2 + 16], 1
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m1, [r3 + 31 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1312], m2
+ pmaddubsw m3, m1, [r3 + 18 * 16]
+ pmulhrsw m3, m7
-; mode 22 [row 6, 7]
+ packuswb m2, m3
+ movu [r0 + 1312], m2
-pmaddubsw m2, m1, [r5 + 5 * 16]
-pmulhrsw m2, m7
+ ; mode 22 [row 6, 7]
-pslldq m1, 2
-pinsrb m1, [r2 + 5], 1
-pinsrb m1, [r2 + 7], 0
+ pmaddubsw m2, m1, [r3 + 5 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m1, [r5 + 24 * 16]
-pmulhrsw m1, m7
+ pslldq m1, 2
+ pinsrb m1, [r1 + 5 + 16], 1
+ pinsrb m1, [r1 + 7 + 16], 0
-packuswb m2, m1
-movu [r0 + 1328], m2
+ pmaddubsw m1, [r3 + 24 * 16]
+ pmulhrsw m1, m7
-; mode 23 [row 0, 1]
+ packuswb m2, m1
+ movu [r0 + 1328], m2
-pmaddubsw m2, m0, [r5 + 23 * 16]
-pmulhrsw m2, m7
+ ; mode 23 [row 0, 1]
-pmaddubsw m3, m0, [r5 + 14 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m0, [r3 + 23 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1344], m2
+ pmaddubsw m3, m0, [r3 + 14 * 16]
+ pmulhrsw m3, m7
-; mode 23 [row 2, 3]
+ packuswb m2, m3
+ movu [r0 + 1344], m2
-pmaddubsw m2, m0, [r5 + 5 * 16]
-pmulhrsw m2, m7
+ ; mode 23 [row 2, 3]
-pslldq m1, m0, 2
-pinsrb m1, [r2 + 0], 1
-pinsrb m1, [r2 + 4], 0
+ pmaddubsw m2, m0, [r3 + 5 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, m7
+ pslldq m1, m0, 2
+ pinsrb m1, [r1], 1
+ pinsrb m1, [r1 + 4 + 16], 0
-packuswb m2, m3
-movu [r0 + 1360], m2
+ pmaddubsw m3, m1, [r3 + 28 * 16]
+ pmulhrsw m3, m7
-; mode 23 [row 4, 5]
+ packuswb m2, m3
+ movu [r0 + 1360], m2
-pmaddubsw m2, m1, [r5 + 19 * 16]
-pmulhrsw m2, m7
+ ; mode 23 [row 4, 5]
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m1, [r3 + 19 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1376], m2
+ pmaddubsw m3, m1, [r3 + 10 * 16]
+ pmulhrsw m3, m7
-; mode 23 [row 6, 7]
+ packuswb m2, m3
+ movu [r0 + 1376], m2
-pmaddubsw m2, m1, [r5 + 1 * 16]
-pmulhrsw m2, m7
+ ; mode 23 [row 6, 7]
-pslldq m3, m1, 2
-pinsrb m3, [r2 + 4], 1
-pinsrb m3, [r2 + 7], 0
+ pmaddubsw m2, m1, [r3 + 1 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m3, [r5 + 24 * 16]
-pmulhrsw m3, m7
+ pslldq m3, m1, 2
+ pinsrb m3, [r1 + 4 + 16], 1
+ pinsrb m3, [r1 + 7 + 16], 0
-packuswb m2, m3
-movu [r0 + 1392], m2
+ pmaddubsw m3, [r3 + 24 * 16]
+ pmulhrsw m3, m7
-; mode 24 [row 0, 1]
+ packuswb m2, m3
+ movu [r0 + 1392], m2
-pmaddubsw m2, m0, [r5 + 27 * 16]
-pmulhrsw m2, m7
+ ; mode 24 [row 0, 1]
-pmaddubsw m5, m0, [r5 + 22 * 16]
-pmulhrsw m5, m7
+ pmaddubsw m2, m0, [r3 + 27 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m5
-movu [r0 + 1408], m2
+ pmaddubsw m5, m0, [r3 + 22 * 16]
+ pmulhrsw m5, m7
-; mode 24 [row 2, 3]
+ packuswb m2, m5
+ movu [r0 + 1408], m2
-pmaddubsw m2, m0, [r5 + 17 * 16]
-pmulhrsw m2, m7
+ ; mode 24 [row 2, 3]
-pmaddubsw m3, m0, [r5 + 12 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m0, [r3 + 17 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1424], m2
+ pmaddubsw m3, m0, [r3 + 12 * 16]
+ pmulhrsw m3, m7
-; mode 24 [row 4, 5]
+ packuswb m2, m3
+ movu [r0 + 1424], m2
-pmaddubsw m2, m0, [r5 + 7 * 16]
-pmulhrsw m2, m7
+ ; mode 24 [row 4, 5]
-pmaddubsw m3, m0, [r5 + 2 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m0, [r3 + 7 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1440], m2
+ pmaddubsw m3, m0, [r3 + 2 * 16]
+ pmulhrsw m3, m7
-; mode 24 [row 6, 7]
+ packuswb m2, m3
+ movu [r0 + 1440], m2
-pinsrb m1, [r2 + 6], 0
+ ; mode 24 [row 6, 7]
-pmaddubsw m2, m1, [r5 + 29 * 16]
-pmulhrsw m2, m7
+ pinsrb m1, [r1 + 6 + 16], 0
-pmaddubsw m1, [r5 + 24 * 16]
-pmulhrsw m1, m7
+ pmaddubsw m2, m1, [r3 + 29 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m1
-movu [r0 + 1456], m2
+ pmaddubsw m1, [r3 + 24 * 16]
+ pmulhrsw m1, m7
-; mode 25 [row 0, 1]
+ packuswb m2, m1
+ movu [r0 + 1456], m2
-pmaddubsw m2, m0, [r5 + 30 * 16]
-pmulhrsw m2, m7
+ ; mode 25 [row 0, 1]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m7
+ pmaddubsw m2, m0, [r3 + 30 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m1
-movu [r0 + 1472], m2
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m7
-; mode 25 [row 2, 3]
+ packuswb m2, m1
+ movu [r0 + 1472], m2
-pmaddubsw m2, m0, [r5 + 26 * 16]
-pmulhrsw m2, m7
+ ; mode 25 [row 2, 3]
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m7
+ pmaddubsw m2, m0, [r3 + 26 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m1
-movu [r0 + 1488], m2
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m7
-; mode 25 [row 4, 5]
+ packuswb m2, m1
+ movu [r0 + 1488], m2
-pmaddubsw m1, m0, [r5 + 20 * 16]
-pmulhrsw m1, m7
+ ; mode 25 [row 4, 5]
-packuswb m5, m1
-movu [r0 + 1504], m5
+ pmaddubsw m1, m0, [r3 + 20 * 16]
+ pmulhrsw m1, m7
-; mode 25 [row 6, 7]
+ packuswb m5, m1
+ movu [r0 + 1504], m5
-pmaddubsw m2, m0, [r5 + 18 * 16]
-pmulhrsw m2, m7
+ ; mode 25 [row 6, 7]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m7
+ pmaddubsw m2, m0, [r3 + 18 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m1
-movu [r0 + 1520], m2
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m7
-; mode 26
+ packuswb m2, m1
+ movu [r0 + 1520], m2
-movu m0, [r1 + 1]
+ ; mode 26
-pshufb m1, m0, [tab_Si]
-movu [r0 + 1536], m1
-movu [r0 + 1552], m1
-movu [r0 + 1568], m1
-movu [r0 + 1584], m1
+ movu m0, [r1 + 1]
-pxor m5, m5
+ pshufb m1, m0, [tab_Si]
+ movu [r0 + 1536], m1
+ movu [r0 + 1552], m1
+ movu [r0 + 1568], m1
+ movu [r0 + 1584], m1
-pshufb m1, m1, m5
-punpcklbw m1, m5
+ pxor m5, m5
-movu m2, [r2]
+ pshufb m1, m1, m5
+ punpcklbw m1, m5
-pshufb m3, m2, m5
-punpcklbw m3, m5
+ movu m2, [r1 + 16]
+ pinsrb m2, [r1], 0
-psrldq m4, m2, 1
-punpcklbw m4, m5
+ pshufb m3, m2, m5
+ punpcklbw m3, m5
-movu m2, [r2 + 9]
-punpcklbw m2, m5
+ psrldq m4, m2, 1
+ punpcklbw m4, m5
-psubw m4, m3
-psubw m2, m3
+ movu m2, [r1 + 9 + 16]
+ punpcklbw m2, m5
-psraw m4, 1
-psraw m2, 1
+ psubw m4, m3
+ psubw m2, m3
-paddw m4, m1
-paddw m2, m1
+ psraw m4, 1
+ psraw m2, 1
-packuswb m4, m2
+ paddw m4, m1
+ paddw m2, m1
-pextrb [r0 + 1536], m4, 0
-pextrb [r0 + 1544], m4, 1
-pextrb [r0 + 1552], m4, 2
-pextrb [r0 + 1560], m4, 3
-pextrb [r0 + 1568], m4, 4
-pextrb [r0 + 1576], m4, 5
-pextrb [r0 + 1584], m4, 6
-pextrb [r0 + 1592], m4, 7
+ packuswb m4, m2
-; mode 27 [row 0, 1]
+ pextrb [r0 + 1536], m4, 0
+ pextrb [r0 + 1544], m4, 1
+ pextrb [r0 + 1552], m4, 2
+ pextrb [r0 + 1560], m4, 3
+ pextrb [r0 + 1568], m4, 4
+ pextrb [r0 + 1576], m4, 5
+ pextrb [r0 + 1584], m4, 6
+ pextrb [r0 + 1592], m4, 7
-palignr m6, m0, 1
-punpcklbw m4, m0, m6
+ ; mode 27 [row 0, 1]
-pmaddubsw m1, m4, [r5 + 2 * 16]
-pmulhrsw m1, m7
+ palignr m6, m0, 1
+ punpcklbw m4, m0, m6
-pmaddubsw m2, m4, [r5 + 4 * 16]
-pmulhrsw m2, m7
+ pmaddubsw m1, m4, [r3 + 2 * 16]
+ pmulhrsw m1, m7
-packuswb m1, m2
-movu [r0 + 1600], m1
+ pmaddubsw m2, m4, [r3 + 4 * 16]
+ pmulhrsw m2, m7
-; mode 27 [row 2, 3]
+ packuswb m1, m2
+ movu [r0 + 1600], m1
-pmaddubsw m1, m4, [r5 + 6 * 16]
-pmulhrsw m1, m7
+ ; mode 27 [row 2, 3]
-pmaddubsw m2, m4, [r5 + 8 * 16]
-pmulhrsw m2, m7
+ pmaddubsw m1, m4, [r3 + 6 * 16]
+ pmulhrsw m1, m7
-packuswb m1, m2
-movu [r0 + 1616], m1
+ pmaddubsw m2, m4, [r3 + 8 * 16]
+ pmulhrsw m2, m7
-; mode 27 [row 4, 5]
+ packuswb m1, m2
+ movu [r0 + 1616], m1
-pmaddubsw m3, m4, [r5 + 10 * 16]
-pmulhrsw m3, m7
+ ; mode 27 [row 4, 5]
-pmaddubsw m2, m4, [r5 + 12 * 16]
-pmulhrsw m2, m7
+ pmaddubsw m3, m4, [r3 + 10 * 16]
+ pmulhrsw m3, m7
-packuswb m1, m3, m2
-movu [r0 + 1632], m1
+ pmaddubsw m2, m4, [r3 + 12 * 16]
+ pmulhrsw m2, m7
-; mode 27 [row 6, 7]
+ packuswb m1, m3, m2
+ movu [r0 + 1632], m1
-pmaddubsw m1, m4, [r5 + 14 * 16]
-pmulhrsw m1, m7
+ ; mode 27 [row 6, 7]
-pmaddubsw m2, m4, [r5 + 16 * 16]
-pmulhrsw m2, m7
+ pmaddubsw m1, m4, [r3 + 14 * 16]
+ pmulhrsw m1, m7
-packuswb m1, m2
-movu [r0 + 1648], m1
+ pmaddubsw m2, m4, [r3 + 16 * 16]
+ pmulhrsw m2, m7
-; mode 28 [row 0, 1]
+ packuswb m1, m2
+ movu [r0 + 1648], m1
-pmaddubsw m1, m4, [r5 + 5 * 16]
-pmulhrsw m1, m7
+ ; mode 28 [row 0, 1]
-packuswb m1, m3
-movu [r0 + 1664], m1
+ pmaddubsw m1, m4, [r3 + 5 * 16]
+ pmulhrsw m1, m7
-; mode 28 [row 2, 3]
+ packuswb m1, m3
+ movu [r0 + 1664], m1
-pmaddubsw m1, m4, [r5 + 15 * 16]
-pmulhrsw m1, m7
+ ; mode 28 [row 2, 3]
-pmaddubsw m2, m4, [r5 + 20 * 16]
-pmulhrsw m2, m7
+ pmaddubsw m1, m4, [r3 + 15 * 16]
+ pmulhrsw m1, m7
-packuswb m1, m2
-movu [r0 + 1680], m1
+ pmaddubsw m2, m4, [r3 + 20 * 16]
+ pmulhrsw m2, m7
-; mode 28 [row 4, 5]
+ packuswb m1, m2
+ movu [r0 + 1680], m1
-pmaddubsw m1, m4, [r5 + 25 * 16]
-pmulhrsw m1, m7
+ ; mode 28 [row 4, 5]
-pmaddubsw m2, m4, [r5 + 30 * 16]
-pmulhrsw m2, m7
+ pmaddubsw m1, m4, [r3 + 25 * 16]
+ pmulhrsw m1, m7
-packuswb m1, m2
-movu [r0 + 1696], m1
+ pmaddubsw m2, m4, [r3 + 30 * 16]
+ pmulhrsw m2, m7
-; mode 28 [row 6, 7]
+ packuswb m1, m2
+ movu [r0 + 1696], m1
-palignr m1, m0, 2
-punpcklbw m5, m6, m1
+ ; mode 28 [row 6, 7]
-pmaddubsw m2, m5, [r5 + 3 * 16]
-pmulhrsw m2, m7
+ palignr m1, m0, 2
+ punpcklbw m5, m6, m1
-pmaddubsw m3, m5, [r5 + 8 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m5, [r3 + 3 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1712], m2
+ pmaddubsw m3, m5, [r3 + 8 * 16]
+ pmulhrsw m3, m7
-; mode 29 [row 0, 1]
+ packuswb m2, m3
+ movu [r0 + 1712], m2
-pmaddubsw m2, m4, [r5 + 9 * 16]
-pmulhrsw m2, m7
+ ; mode 29 [row 0, 1]
-pmaddubsw m3, m4, [r5 + 18 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m4, [r3 + 9 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1728], m2
+ pmaddubsw m3, m4, [r3 + 18 * 16]
+ pmulhrsw m3, m7
-; mode 29 [row 2, 3]
+ packuswb m2, m3
+ movu [r0 + 1728], m2
-pmaddubsw m2, m4, [r5 + 27 * 16]
-pmulhrsw m2, m7
+ ; mode 29 [row 2, 3]
-pmaddubsw m3, m5, [r5 + 4 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m4, [r3 + 27 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1744], m2
+ pmaddubsw m3, m5, [r3 + 4 * 16]
+ pmulhrsw m3, m7
-; mode 29 [row 4, 5]
+ packuswb m2, m3
+ movu [r0 + 1744], m2
-pmaddubsw m2, m5, [r5 + 13 * 16]
-pmulhrsw m2, m7
+ ; mode 29 [row 4, 5]
-pmaddubsw m3, m5, [r5 + 22 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m5, [r3 + 13 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1760], m2
+ pmaddubsw m3, m5, [r3 + 22 * 16]
+ pmulhrsw m3, m7
-; mode 29 [row 6, 7]
+ packuswb m2, m3
+ movu [r0 + 1760], m2
-pmaddubsw m2, m5, [r5 + 31 * 16]
-pmulhrsw m2, m7
+ ; mode 29 [row 6, 7]
-palignr m6, m0, 3
-punpcklbw m1, m6
+ pmaddubsw m2, m5, [r3 + 31 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, m7
+ palignr m6, m0, 3
+ punpcklbw m1, m6
-packuswb m2, m3
-movu [r0 + 1776], m2
+ pmaddubsw m3, m1, [r3 + 8 * 16]
+ pmulhrsw m3, m7
-; mode 32 [row 2]
+ packuswb m2, m3
+ movu [r0 + 1776], m2
-movh [r0 + 1936], m2
+ ; mode 32 [row 2]
-; mode 30 [row 0, 1]
+ movh [r0 + 1936], m2
-pmaddubsw m2, m4, [r5 + 13 * 16]
-pmulhrsw m2, m7
+ ; mode 30 [row 0, 1]
-pmaddubsw m3, m4, [r5 + 26 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m4, [r3 + 13 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1792], m2
+ pmaddubsw m3, m4, [r3 + 26 * 16]
+ pmulhrsw m3, m7
-; mode 30 [row 2, 3]
+ packuswb m2, m3
+ movu [r0 + 1792], m2
-pmaddubsw m2, m5, [r5 + 7 * 16]
-pmulhrsw m2, m7
+ ; mode 30 [row 2, 3]
-pmaddubsw m3, m5, [r5 + 20 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m5, [r3 + 7 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1808], m2
+ pmaddubsw m3, m5, [r3 + 20 * 16]
+ pmulhrsw m3, m7
-; mode 33 [row 1]
+ packuswb m2, m3
+ movu [r0 + 1808], m2
-movhps [r0 + 1992], m2
+ ; mode 33 [row 1]
-; mode 30 [row 4, 5]
+ movhps [r0 + 1992], m2
-pmaddubsw m2, m1, [r5 + 1 * 16]
-pmulhrsw m2, m7
+ ; mode 30 [row 4, 5]
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m1, [r3 + 1 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1824], m2
+ pmaddubsw m3, m1, [r3 + 14 * 16]
+ pmulhrsw m3, m7
-; mode 33 [row 2]
+ packuswb m2, m3
+ movu [r0 + 1824], m2
-movhps [r0 + 2000], m2
+ ; mode 33 [row 2]
-; mode 30 [row 6, 7]
+ movhps [r0 + 2000], m2
-pmaddubsw m2, m1, [r5 + 27 * 16]
-pmulhrsw m2, m7
+ ; mode 30 [row 6, 7]
-psrldq m0, 4
-punpcklbw m6, m0
+ pmaddubsw m2, m1, [r3 + 27 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m3, m6, [r5 + 8 * 16]
-pmulhrsw m3, m7
+ psrldq m0, 4
+ punpcklbw m6, m0
-packuswb m2, m3
-movu [r0 + 1840], m2
+ pmaddubsw m3, m6, [r3 + 8 * 16]
+ pmulhrsw m3, m7
-; mode 33 [row 3]
+ packuswb m2, m3
+ movu [r0 + 1840], m2
-movhps [r0 + 2008], m2
+ ; mode 33 [row 3]
-; mode 31 [row 0, 1]
+ movhps [r0 + 2008], m2
-pmaddubsw m2, m4, [r5 + 17 * 16]
-pmulhrsw m2, m7
+ ; mode 31 [row 0, 1]
-pmaddubsw m3, m5, [r5 + 2 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m4, [r3 + 17 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1856], m2
+ pmaddubsw m3, m5, [r3 + 2 * 16]
+ pmulhrsw m3, m7
-; mode 31 [row 2, 3]
+ packuswb m2, m3
+ movu [r0 + 1856], m2
-pmaddubsw m2, m5, [r5 + 19 * 16]
-pmulhrsw m2, m7
+ ; mode 31 [row 2, 3]
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m5, [r3 + 19 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1872], m2
+ pmaddubsw m3, m1, [r3 + 4 * 16]
+ pmulhrsw m3, m7
-; mode 31 [row 4, 5]
+ packuswb m2, m3
+ movu [r0 + 1872], m2
-pmaddubsw m2, m1, [r5 + 21 * 16]
-pmulhrsw m2, m7
+ ; mode 31 [row 4, 5]
-pmaddubsw m3, m6, [r5 + 6 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m1, [r3 + 21 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1888], m2
+ pmaddubsw m3, m6, [r3 + 6 * 16]
+ pmulhrsw m3, m7
-; mode 31 [row 6, 7]
+ packuswb m2, m3
+ movu [r0 + 1888], m2
-pmaddubsw m2, m6, [r5 + 23 * 16]
-pmulhrsw m2, m7
+ ; mode 31 [row 6, 7]
-movu m3, [r1 + 6]
-punpcklbw m0, m3
+ pmaddubsw m2, m6, [r3 + 23 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m3, m0, [r5 + 8 * 16]
-pmulhrsw m3, m7
+ movu m3, [r1 + 6]
+ punpcklbw m0, m3
-packuswb m2, m3
-movu [r0 + 1904], m2
+ pmaddubsw m3, m0, [r3 + 8 * 16]
+ pmulhrsw m3, m7
-; mode 32 [row 0, 1]
+ packuswb m2, m3
+ movu [r0 + 1904], m2
-pmaddubsw m2, m4, [r5 + 21 * 16]
-pmulhrsw m2, m7
+ ; mode 32 [row 0, 1]
-pmaddubsw m3, m5, [r5 + 10 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m4, [r3 + 21 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1920], m2
+ pmaddubsw m3, m5, [r3 + 10 * 16]
+ pmulhrsw m3, m7
-; mode 32 [row 3]
+ packuswb m2, m3
+ movu [r0 + 1920], m2
-pmaddubsw m2, m1, [r5 + 20 * 16]
-pmulhrsw m2, m7
+ ; mode 32 [row 3]
-pxor m3, m3
+ pmaddubsw m2, m1, [r3 + 20 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movh [r0 + 1944], m2
+ pxor m3, m3
-; mode 32 [row 4, 5]
+ packuswb m2, m3
+ movh [r0 + 1944], m2
-pmaddubsw m2, m6, [r5 + 9 * 16]
-pmulhrsw m2, m7
+ ; mode 32 [row 4, 5]
-pmaddubsw m3, m6, [r5 + 30 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m6, [r3 + 9 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 1952], m2
+ pmaddubsw m3, m6, [r3 + 30 * 16]
+ pmulhrsw m3, m7
-; mode 33 [row 4, 5]
+ packuswb m2, m3
+ movu [r0 + 1952], m2
-pmaddubsw m2, m0, [r5 + 2 * 16]
-pmulhrsw m2, m7
+ ; mode 33 [row 4, 5]
-pmaddubsw m3, m0, [r5 + 28 * 16]
-pmulhrsw m3, m7
+ pmaddubsw m2, m0, [r3 + 2 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movu [r0 + 2016], m2
+ pmaddubsw m3, m0, [r3 + 28 * 16]
+ pmulhrsw m3, m7
-; mode 32 [row 6]
+ packuswb m2, m3
+ movu [r0 + 2016], m2
-pmaddubsw m2, m0, [r5 + 19 * 16]
-pmulhrsw m2, m7
+ ; mode 32 [row 6]
-; mode 32 [row 7]
+ pmaddubsw m2, m0, [r3 + 19 * 16]
+ pmulhrsw m2, m7
-movu m0, [r1 + 6]
-palignr m3, m0, 1
-punpcklbw m0, m3
+ ; mode 32 [row 7]
-pmaddubsw m3, m0, [r5 + 8 * 16]
-pmulhrsw m3, m7
+ movu m0, [r1 + 6]
+ palignr m3, m0, 1
+ punpcklbw m0, m3
-packuswb m2, m3
-movu [r0 + 1968], m2
+ pmaddubsw m3, m0, [r3 + 8 * 16]
+ pmulhrsw m3, m7
-; mode 33 [row 6, 7]
+ packuswb m2, m3
+ movu [r0 + 1968], m2
-pmaddubsw m2, m0, [r5 + 22 * 16]
-pmulhrsw m2, m7
+ ; mode 33 [row 6, 7]
-movu m0, [r1 + 7]
-palignr m3, m0, 1
-punpcklbw m0, m3
+ pmaddubsw m2, m0, [r3 + 22 * 16]
+ pmulhrsw m2, m7
-pmaddubsw m3, m0, [r5 + 16 * 16]
-pmulhrsw m3, m7
+ movu m0, [r1 + 7]
+ palignr m3, m0, 1
+ punpcklbw m0, m3
-packuswb m2, m3
-movu [r0 + 2032], m2
+ pmaddubsw m3, m0, [r3 + 16 * 16]
+ pmulhrsw m3, m7
-; mode 33 [row 0]
+ packuswb m2, m3
+ movu [r0 + 2032], m2
-pmaddubsw m2, m4, [r5 + 26 * 16]
-pmulhrsw m2, m7
+ ; mode 33 [row 0]
-pxor m3, m3
+ pmaddubsw m2, m4, [r3 + 26 * 16]
+ pmulhrsw m2, m7
-packuswb m2, m3
-movh [r0 + 1984], m2
+ pxor m3, m3
-; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
+ packuswb m2, m3
+ movh [r0 + 1984], m2
-movu m0, [r3 + 2]
-palignr m1, m0, 1
-punpcklqdq m2, m0, m1
-movu [r0 + 2048], m2
+ ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
-palignr m1, m0, 2
-palignr m2, m0, 3
-punpcklqdq m1, m2
-movu [r0 + 2064], m1
+ movu m0, [r2 + 2]
+ palignr m1, m0, 1
+ punpcklqdq m2, m0, m1
+ movu [r0 + 2048], m2
-palignr m1, m0, 4
-palignr m2, m0, 5
-punpcklqdq m1, m2
-movu [r0 + 2080], m1
+ palignr m1, m0, 2
+ palignr m2, m0, 3
+ punpcklqdq m1, m2
+ movu [r0 + 2064], m1
-palignr m1, m0, 6
-palignr m2, m0, 7
-punpcklqdq m1, m2
-movu [r0 + 2096], m1
+ palignr m1, m0, 4
+ palignr m2, m0, 5
+ punpcklqdq m1, m2
+ movu [r0 + 2080], m1
+ palignr m1, m0, 6
+ palignr m2, m0, 7
+ punpcklqdq m1, m2
+ movu [r0 + 2096], m1
RET
-;-----------------------------------------------------------------------------
-; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
-;-----------------------------------------------------------------------------
+;--------------------------------------------------------------------------------
+; void all_angs_pred_16x16(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;--------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
-
-movu m0, [r4 + 2]
-movu [r0 + 0 * 16], m0
-
-movu m1, m0
-
-movu m6, [r4 + 18]
-palignr m5, m6, m0, 1
-movu [r0 + 1 * 16], m5
-
-movu m4, m5
-
-palignr m5, m6, m0, 2
-movu [r0 + 2 * 16], m5
-palignr m5, m6, m0, 3
-movu [r0 + 3 * 16], m5
-palignr m5, m6, m0, 4
-movu [r0 + 4 * 16], m5
-palignr m5, m6, m0, 5
-movu [r0 + 5 * 16], m5
-palignr m5, m6, m0, 6
-movu [r0 + 6 * 16], m5
-palignr m5, m6, m0, 7
-movu [r0 + 7 * 16], m5
-
-movu m7, m5
-
-palignr m5, m6, m0, 8
-movu [r0 + 8 * 16], m5
-
-movu m2, m5
-
-palignr m5, m6, m0, 9
-movu [r0 + 9 * 16], m5
-
-palignr m3, m6, m0, 10
-movu [r0 + 10 * 16], m3
-palignr m3, m6, m0, 11
-movu [r0 + 11 * 16], m3
-palignr m3, m6, m0, 12
-movu [r0 + 12 * 16], m3
-
-; mode 3 [row 15]
-movu [r0 + (3-2)*16*16 + 15 * 16], m3
-
-palignr m3, m6, m0, 13
-movu [r0 + 13 * 16], m3
-palignr m3, m6, m0, 14
-movu [r0 + 14 * 16], m3
-palignr m3, m6, m0, 15
-movu [r0 + 15 * 16], m3
-
-; mode 3 [row 0]
-lea r5, [ang_table]
-movu m3, [pw_1024]
-movu m0, [r4 + 1]
-punpcklbw m0, m1
-
-; mode 17 [row 8 - second half]
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 248 * 16 + 8], m1
-; mode 17 [row 8 - second half] end
-
-pmaddubsw m1, m0, [r5 + 26 * 16]
-pmulhrsw m1, m3
-punpcklbw m7, m2
-pmaddubsw m2, m7, [r5 + 26 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 16 * 16], m1
-
-;mode 6 [row 1]
-movu [r0 + 65 * 16], m1
-
-; mode 4 [row 0]
-pmaddubsw m1, m0, [r5 + 21 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 21 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 32 * 16], m1
-
-; mode 5 [row 0]
-pmaddubsw m1, m0, [r5 + 17 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 17 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 48 * 16], m1
-
-; mode 6 [row 0]
-pmaddubsw m1, m0, [r5 + 13 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 13 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 64 * 16], m1
-
-; mode 7 [row 0]
-pmaddubsw m1, m0, [r5 + 9 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 9 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 80 * 16], m1
-
-; mode 7 [row 1]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 18 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 81 * 16], m1
-
-; mode 7 [row 2]
-pmaddubsw m1, m0, [r5 + 27 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 27 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 82 * 16], m1
-
-; mode 8 [row 0]
-pmaddubsw m1, m0, [r5 + 5 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 5 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 96 * 16], m1
-
-; mode 8 [row 1]
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 10 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 97 * 16], m1
-
-; mode 8 [row 2]
-pmaddubsw m1, m0, [r5 + 15 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 15 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 98 * 16], m1
-
-; mode 8 [row 3]
-pmaddubsw m1, m0, [r5 + 20 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 20 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 99 * 16], m1
-
-; mode 8 [row 4]
-pmaddubsw m1, m0, [r5 + 25 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 25 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 100 * 16], m1
-
-; mode 8 [row 5]
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 30 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 101 * 16], m1
-
-; mode 15 [row 13 - second half]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 221 * 16 + 8], m1
-; mode 15 [row 13 - second half] end
-
-; mode 15 [row 14 - second half]
-pmaddubsw m1, m0, [r5 + 1 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 222 * 16 + 8], m1
-; mode 15 [row 14 - second half] end
-
-; mode 16 [row 10 - second half]
-pmaddubsw m1, m0, [r5 + 25 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 234 * 16 + 8], m1
-; mode 16 [row 10 - second half] end
-
-; mode 16 [row 11 - second half]
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 235 * 16 + 8], m1
-; mode 16 [row 11 - second half] end
-
-; mode 3 [row 1]
-movu m6, [r5 + 20 * 16]
-movu m0, [r4 + 2]
-punpcklbw m0, m4
-
-; mode 17 [row 7 - second half]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 247 * 16 + 8], m1
-
-; mode 17 [row 7 - second half] end
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m3
-movu m2, [r4 + 10]
-punpcklbw m2, m5
-pmaddubsw m4, m2, m6
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 17 * 16], m1
-
-;mode 6 [row 3]
-movu [r0 + 67 * 16], m1
-
-; mode 4 row [row 1]
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 33 * 16], m1
-
-; mode 4 row [row 2]
-pmaddubsw m1, m0, [r5 + 31 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 31 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 34 * 16], m1
-
-; mode 7 [row 6]
-movu [r0 + 86 * 16], m1
-
-; mode 5 row [row 1]
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 2 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 49 * 16], m1
-
-; mode 5 row [row 2]
-pmaddubsw m1, m0, [r5 + 19 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 50 * 16], m1
-
-; mode 6 [row 2]
-pmaddubsw m1, m0, [r5 + 7 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 7 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 66 * 16], m1
-
-; mode 7 [row 3]
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 83 * 16], m1
-
-; mode 7 [row 4]
-pmaddubsw m1, m0, [r5 + 13 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 13 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 84 * 16], m1
-
-; mode 8 [row 8]
-movu [r0 + 104 * 16], m1
-
-; mode 7 [row 5]
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 85 * 16], m1
-
-; mode 8 [row 6]
-pmaddubsw m1, m0, [r5 + 3 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 102 * 16], m1
-
-; mode 8 [row 7]
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 8 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 103 * 16], m1
-
-; mode 8 [row 9]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 105 * 16], m1
-
-; mode 8 [row 10]
-pmaddubsw m1, m0, [r5 + 23 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 23 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 106 * 16], m1
-
-; mode 8 [row 11]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 107 * 16], m1
-
-; mode 3 [row 2]
-movu m0, [r4 + 3]
-movd m1, [r4 + 19]
-palignr m1, m0, 1
-punpcklbw m0, m1
-
-; mode 17 [row 6 - second half]
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 246 * 16 + 8], m1
-; mode 17 [row 6 - second half] end
-
-pmaddubsw m1, m0, [r5 + 14 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 11]
-movd m4, [r4 + 27]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 18 * 16], m1
-
-; mode 6 [row 5]
-movu [r0 + 69 * 16], m1
-
-; mode 4 row [row 3]
-pmaddubsw m1, m0, [r5 + 20 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 20 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 35 * 16], m1
-
-; mode 5 row [row 3]
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 51 * 16], m1
-
-; mode 5 row [row 4]
-pmaddubsw m1, m0, [r5 + 21 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 21 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 52 * 16], m1
-
-; mode 6 [row 4]
-pmaddubsw m1, m0, [r5 + 1 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 68 * 16], m1
-
-; mode 6 [row 6]
-pmaddubsw m1, m0, [r5 + 27 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 27 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 70 * 16], m1
-
-; mode 7 [row 7]
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 8 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 87 * 16], m1
-
-; mode 7 [row 8]
-pmaddubsw m1, m0, [r5 + 17 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 17 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 88 * 16], m1
-
-; mode 7 [row 9]
-pmaddubsw m1, m0, [r5 + 26 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 89 * 16], m1
-
-; mode 8 [row 12]
-pmaddubsw m1, m0, [r5 + 1 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 108 * 16], m1
-
-; mode 8 [row 13]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 6 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 109 * 16], m1
-
-; mode 8 [row 14]
-pmaddubsw m1, m0, [r5 + 11 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 110 * 16], m1
-
-; mode 8 [row 15]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 111 * 16], m1
-
-; mode 3 [row 3]
-movu m0, [r4 + 4]
-movd m1, [r4 + 20]
-palignr m1, m0, 1
-punpcklbw m0, m1
-
-; mode 17 [row 4 - second half]
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 244 * 16 + 8], m1
-; mode 17 [row 4 - second half] end
-
-; mode 17 [row 5 - second half]
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 245 * 16 + 8], m1
-; mode 17 [row 5 - second half] end
-
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 12]
-movd m4, [r4 + 28]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 8 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 19 * 16], m1
-
-; mode 6 [row 7]
-movu [r0 + 71 * 16], m1
-
-; mode 4 row [row 4]
-pmaddubsw m1, m0, [r5 + 9 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 36 * 16], m1
-
-; mode 4 row [row 5]
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 37 * 16], m1
-
-; mode 7 row [row 13]
-movu [r0 + 93 * 16], m1
-
-; mode 5 row [row 5]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 6 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 53 * 16], m1
-
-; mode 5 row [row 6]
-pmaddubsw m1, m0, [r5 + 23 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 23 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 54 * 16], m1
-
-; mode 6 [row 8]
-pmaddubsw m1, m0, [r5 + 21 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 21 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 72 * 16], m1
-
-; mode 7 [row 12]
-movu [r0 + 92 * 16], m1
-
-; mode 7 [row 10]
-pmaddubsw m1, m0, [r5 + 3 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 90 * 16], m1
-
-; mode 7 [row 11]
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 91 * 16], m1
-
-; mode 3 [row 4]
-movu m0, [r4 + 5]
-movd m1, [r4 + 20]
-palignr m1, m0, 1
-punpcklbw m0, m1
-
-; mode 17 [row 3 - second half]
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 243 * 16 + 8], m1
-
-; mode 17 [row 3 - second half] end
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 13]
-movd m4, [r4 + 29]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 2 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 20 * 16], m1
-
-;mode 6 [row 9]
-movu [r0 + 73 * 16], m1
-
-; mode 4 row [row 6]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m3
-pmaddubsw m4, m2, m6
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 38 * 16], m1
-
-; mode 3 [row 5]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 21 * 16], m1
-
-;mode 6 [row 11]
-movu [r0 + 75 * 16], m1
-
-; mode 5 row [row 7]
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 8 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 55 * 16], m1
-
-; mode 5 row [row 8]
-pmaddubsw m1, m0, [r5 + 25 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 56 * 16], m1
-
-; mode 6 [row 10]
-pmaddubsw m1, m0, [r5 + 15 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 15 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 74 * 16], m1
-
-; mode 7 [row 14]
-pmaddubsw m1, m0, [r5 + 7 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 7 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 94 * 16], m1
-
-; mode 7 [row 15]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 95 * 16], m1
-
-; mode 3 [row 6]
-movu m0, [r4 + 6]
-movd m1, [r4 + 22]
-palignr m1, m0, 1
-punpcklbw m0, m1
-
-; mode 17 [row 2 - second half]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 242 * 16 + 8], m1
-; mode 17 [row 2 - second half] end
-
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 14]
-movd m4, [r4 + 30]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 22 * 16], m1
-
-; mode 6 [row 13]
-movu [r0 + 77 * 16], m1
-
-; mode 4 row [row 7]
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 8 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 39 * 16], m1
-
-; mode 4 row [row 8]
-pmaddubsw m1, m0, [r5 + 29 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 40 * 16], m1
-
-; mode 5 row [row 9]
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 57 * 16], m1
-
-; mode 5 row [row 10]
-pmaddubsw m1, m0, [r5 + 27 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 27 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 58 * 16], m1
-
-; mode 6 [row 12]
-pmaddubsw m1, m0, [r5 + 9 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 76 * 16], m1
-
-; mode 3 [row 7]
-movu m0, [r4 + 7]
-movd m1, [r4 + 27]
-palignr m1, m0, 1
-punpcklbw m0, m1
-
-; mode 17 [row 1 - second half]
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 241 * 16 + 8], m1
-; mode 17 [row 1 - second half] end
-
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 15]
-movd m4, [r4 + 25]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 23 * 16], m1
-
-; mode 6 [row 15]
-movu [r0 + 79 * 16], m1
-
-; mode 4 row [row 9]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 41 * 16], m1
-
-; mode 5 row [row 11]
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 59 * 16], m1
-
-; mode 5 row [row 12]
-pmaddubsw m1, m0, [r5 + 29 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 60 * 16], m1
-
-; mode 6 [row 14]
-pmaddubsw m1, m0, [r5 + 3 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 78 * 16], m1
-
-; mode 3 [row 8]
-movu m0, [r4 + 8]
-movd m1, [r4 + 24]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 16]
-psrldq m4, m2, 1
-pinsrb m4, [r4 + 32], 15
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 24 * 16], m1
-
-; mode 4 row [row 10]
-pmaddubsw m1, m0, [r5 + 7 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 7 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 42 * 16], m1
-
-; mode 4 row [row 11]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 43 * 16], m1
-
-; mode 5 row [row 13]
-pmaddubsw m1, m0, [r5 + 14 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 61 * 16], m1
-
-; mode 5 row [row 14]
-pmaddubsw m1, m0, [r5 + 31 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 31 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 62 * 16], m1
-
-; mode 3 [row 9]
-movu m0, [r4 + 9]
-movd m1, [r4 + 16]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 17]
-movd m4, [r4 + 33]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 25 * 16], m1
-
-; mode 4 row [row 12]
-pmaddubsw m1, m0, [r5 + 17 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 17 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 44 * 16], m1
-
-; mode 3 [row 10]
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 26 * 16], m1
-
-; mode 5 row [row 15]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 63 * 16], m1
-
-; mode 3 [row 11]
-movu m0, [r4 + 10]
-movd m1, [r4 + 26]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 18]
-movd m4, [r4 + 34]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 27 * 16], m1
-
-; mode 4 row [row 13]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 6 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 45 * 16], m1
-
-; mode 4 row [row 14]
-pmaddubsw m1, m0, [r5 + 27 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 27 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 46 * 16], m1
-
-; mode 3 [row 12]
-movu m0, [r4 + 11]
-movd m1, [r4 + 27]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 19]
-movd m4, [r4 + 35]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 28 * 16], m1
-
-; mode 4 row [row 15]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m2, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 47 * 16], m1
-
-; mode 3 [row 13]
-movu m0, [r4 + 12]
-movd m1, [r4 + 28]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 20]
-movd m4, [r4 + 36]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 29 * 16], m1
-
-; mode 3 [row 14]
-movu m0, [r4 + 13]
-movd m1, [r4 + 29]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-movu m2, [r4 + 21]
-movd m4, [r4 + 37]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m2, [r5 + 6 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 30 * 16], m1
-
-; mode 9
-movu m0, [r2 + 1]
-movd m1, [r2 + 17]
-palignr m1, m0, 1
-
-; mode 9 [row 15]
-movu [r0 + 127 * 16], m1
-
-; mode 9 [row 0]
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-movu m7, [r2 + 9]
-movd m4, [r4 + 25]
-palignr m2, m7, 1
-punpcklbw m7, m2
-pmaddubsw m2, m7, [r5 + 2 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 112 * 16], m1
-
-; mode 9 [row 1]
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 4 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 113 * 16], m1
-
-; mode 9 [row 2]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 6 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 114 * 16], m1
-
-; mode 9 [row 3]
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 8 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 115 * 16], m1
-
-; mode 9 [row 4]
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 10 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 116 * 16], m1
-
-; mode 9 [row 5]
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 12 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 117 * 16], m1
-
-; mode 9 [row 6]
-pmaddubsw m1, m0, [r5 + 14 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 14 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 118 * 16], m1
-
-; mode 9 [row 7]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 16 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 119 * 16], m1
-
-; mode 9 [row 8]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 18 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 120 * 16], m1
-
-; mode 9 [row 9]
-pmaddubsw m1, m0, [r5 + 20 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 20 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 121 * 16], m1
-
-; mode 9 [row 10]
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 22 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 122 * 16], m1
-
-; mode 9 [row 11]
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 24 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 123 * 16], m1
-
-; mode 9 [row 12]
-pmaddubsw m1, m0, [r5 + 26 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 26 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 124 * 16], m1
-
-; mode 9 [row 13]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 28 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 125 * 16], m1
-
-; mode 9 [row 14]
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 30 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 126 * 16], m1
-
-; mode 10
-movu m1, [r2 + 1]
-movu [r0 + 128 * 16], m1
-movu [r0 + 129 * 16], m1
-movu [r0 + 130 * 16], m1
-movu [r0 + 131 * 16], m1
-movu [r0 + 132 * 16], m1
-movu [r0 + 133 * 16], m1
-movu [r0 + 134 * 16], m1
-movu [r0 + 135 * 16], m1
-movu [r0 + 136 * 16], m1
-movu [r0 + 137 * 16], m1
-movu [r0 + 138 * 16], m1
-movu [r0 + 139 * 16], m1
-movu [r0 + 140 * 16], m1
-movu [r0 + 141 * 16], m1
-movu [r0 + 142 * 16], m1
-movu [r0 + 143 * 16], m1
-
-pxor m0, m0
-pshufb m1, m1, m0
-punpcklbw m1, m0
-movu m2, [r1]
-pshufb m2, m2, m0
-punpcklbw m2, m0
-movu m4, [r1 + 1]
-punpcklbw m5, m4, m0
-punpckhbw m4, m0
-psubw m5, m2
-psubw m4, m2
-psraw m5, 1
-psraw m4, 1
-paddw m5, m1
-paddw m4, m1
-packuswb m5, m4
-
-pextrb [r0 + 128 * 16], m5, 0
-pextrb [r0 + 129 * 16], m5, 1
-pextrb [r0 + 130 * 16], m5, 2
-pextrb [r0 + 131 * 16], m5, 3
-pextrb [r0 + 132 * 16], m5, 4
-pextrb [r0 + 133 * 16], m5, 5
-pextrb [r0 + 134 * 16], m5, 6
-pextrb [r0 + 135 * 16], m5, 7
-pextrb [r0 + 136 * 16], m5, 8
-pextrb [r0 + 137 * 16], m5, 9
-pextrb [r0 + 138 * 16], m5, 10
-pextrb [r0 + 139 * 16], m5, 11
-pextrb [r0 + 140 * 16], m5, 12
-pextrb [r0 + 141 * 16], m5, 13
-pextrb [r0 + 142 * 16], m5, 14
-pextrb [r0 + 143 * 16], m5, 15
-
-; mode 11
-movu m0, [r2]
-
-; mode 11 [row 15]
-movu [r0 + 159 * 16], m0
-
-; mode 11 [row 0]
-movu m1, [r2 + 1]
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m3
-movu m7, [r2 + 8]
-movu m2, [r2 + 9]
-punpcklbw m7, m2
-pmaddubsw m2, m7, [r5 + 30 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 144 * 16], m1
-
-; mode 11 [row 1]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 28 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 145 * 16], m1
-
-; mode 11 [row 2]
-pmaddubsw m1, m0, [r5 + 26 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 26 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 146 * 16], m1
-
-; mode 11 [row 3]
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 24 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 147 * 16], m1
-
-; mode 11 [row 4]
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 22 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 148 * 16], m1
-
-; mode 11 [row 5]
-pmaddubsw m1, m0, [r5 + 20 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 20 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 149 * 16], m1
-
-; mode 11 [row 6]
-pmaddubsw m1, m0, [r5 + 18 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 18 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 150 * 16], m1
-
-; mode 11 [row 7]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 16 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 151 * 16], m1
-
-; mode 11 [row 8]
-pmaddubsw m1, m0, [r5 + 14 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 14 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 152 * 16], m1
-
-; mode 11 [row 9]
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 12 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 153 * 16], m1
-
-; mode 11 [row 10]
-pmaddubsw m1, m0, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 10 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 154 * 16], m1
-
-; mode 11 [row 11]
-pmaddubsw m1, m0, [r5 + 8 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 8 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 155 * 16], m1
-
-; mode 11 [row 12]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 6 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 156 * 16], m1
-
-; mode 11 [row 13]
-pmaddubsw m1, m0, [r5 + 4 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 4 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 157 * 16], m1
-
-; mode 11 [row 14]
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 2 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 158 * 16], m1
-
-; mode 12 [row 0]
-movu m0, [r4]
-movu m1, [r4 + 1]
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 27 * 16]
-pmulhrsw m1, m3
-movu m7, [r4 + 8]
-movd m2, [r4 + 24]
-palignr m2, m7, 1
-punpcklbw m7, m2
-pmaddubsw m2, m7, [r5 + 27 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 160 * 16], m1
-
-; mode 12 [row 1]
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 22 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 161 * 16], m1
-
-; mode 12 [row 2]
-pmaddubsw m1, m0, [r5 + 17 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 17 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 162 * 16], m1
-
-; mode 12 [row 3]
-pmaddubsw m1, m0, [r5 + 12 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 12 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 163 * 16], m1
-
-; mode 12 [row 4]
-pmaddubsw m1, m0, [r5 + 7 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 7 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 164 * 16], m1
-
-; mode 12 [row 5]
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 2 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 165 * 16], m1
-
-; mode 13 [row 0]
-pmaddubsw m1, m0, [r5 + 23 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 23 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 176 * 16], m1
-
-; mode 13 [row 1]
-pmaddubsw m1, m0, [r5 + 14 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 14 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 177 * 16], m1
-
-; mode 13 [row 2]
-pmaddubsw m1, m0, [r5 + 5 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 5 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 178 * 16], m1
-
-; mode 14 [row 0]
-pmaddubsw m1, m0, [r5 + 19 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 19 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 192 * 16], m1
-
-; mode 14 [row 1]
-pmaddubsw m1, m0, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 6 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 193 * 16], m1
-
-; mode 17 [row 0]
-movu [r0 + 240 * 16], m1
-
-; mode 15 [row 0]
-pmaddubsw m1, m0, [r5 + 15 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 15 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 208 * 16], m1
-
-; mode 15 [row 15 - second half]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 223 * 16 + 8], m1
-; mode 15 [row 15 - second half] end
-
-; mode 16 [row 0]
-pmaddubsw m1, m0, [r5 + 11 * 16]
-pmulhrsw m1, m3
-pmaddubsw m2, m7, [r5 + 11 * 16]
-pmulhrsw m2, m3
-packuswb m1, m2
-movu [r0 + 224 * 16], m1
-
-; mode 17 [row 9 - second half]
-pmaddubsw m1, m0, [r5 + 28 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 249 * 16 + 8], m1
-; mode 17 [row 9 - second half] end
-
-; mode 17 [row 10 - second half]
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 250 * 16 + 8], m1
-; mode 17 [row 10 - second half] end
-
-; mode 17 [row 1 - first half]
-pslldq m6, m0, 2
-pinsrb m6, [r3 + 0], 1
-pinsrb m6, [r3 + 1], 0
-pmaddubsw m1, m6, [r5 + 12 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 241 * 16], m1
-
-; mode 17 [row 11 - second half]
-pmaddubsw m1, m6, [r5 + 8 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 251 * 16 + 8], m1
-; mode 17 [row 11 - second half] end
-
-; mode 17 [row 2 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 1], 1
-pinsrb m6, [r3 + 2], 0
-pmaddubsw m1, m6, [r5 + 18 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 242 * 16], m1
-
-; mode 17 [row 12 - second half]
-pmaddubsw m1, m6, [r5 + 14 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 252 * 16 + 8], m1
-; mode 17 [row 12 - second half] end
-
-; mode 17 [row 3 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 2], 1
-pinsrb m6, [r3 + 4], 0
-pmaddubsw m1, m6, [r5 + 24 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 243 * 16], m1
-
-; mode 17 [row 13 - second half]
-pmaddubsw m1, m6, [r5 + 20 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 253 * 16 + 8], m1
-; mode 17 [row 13 - second half] end
-
-; mode 17 [row 4 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 4], 1
-pinsrb m6, [r3 + 5], 0
-pmaddubsw m1, m6, [r5 + 30 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 244 * 16], m1
-
-; mode 17 [row 5 - first half]
-pmaddubsw m1, m6, [r5 + 4 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 245 * 16], m1
-
-; mode 17 [row 14 - second half]
-pmaddubsw m1, m6, [r5 + 26 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 254 * 16 + 8], m1
-; mode 17 [row 14 - second half] end
-
-; mode 17 [row 6 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 5], 1
-pinsrb m6, [r3 + 6], 0
-pmaddubsw m1, m6, [r5 + 10 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 246 * 16], m1
-
-; mode 17 [row 7 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 6], 1
-pinsrb m6, [r3 + 7], 0
-pmaddubsw m1, m6, [r5 + 16 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 247 * 16], m1
-
-; mode 17 [row 8 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 7], 1
-pinsrb m6, [r3 + 9], 0
-pmaddubsw m1, m6, [r5 + 22 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 248 * 16], m1
-
-; mode 17 [row 9 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 9], 1
-pinsrb m6, [r3 + 10], 0
-pmaddubsw m1, m6, [r5 + 28 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 249 * 16], m1
-
-; mode 17 [row 10 - first half]
-pmaddubsw m1, m6, [r5 + 2 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 250 * 16], m1
-
-; mode 17 [row 11 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 10], 1
-pinsrb m6, [r3 + 11], 0
-pmaddubsw m1, m6, [r5 + 8 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 251 * 16], m1
-
-; mode 17 [row 12 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 11], 1
-pinsrb m6, [r3 + 12], 0
-pmaddubsw m1, m6, [r5 + 14 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 252 * 16], m1
-
-; mode 17 [row 13 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 12], 1
-pinsrb m6, [r3 + 14], 0
-pmaddubsw m1, m6, [r5 + 20 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 253 * 16], m1
-
-; mode 17 [row 14 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 14], 1
-pinsrb m6, [r3 + 15], 0
-pmaddubsw m1, m6, [r5 + 26 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 254 * 16], m1
-
-; mode 16 [row 12 - second half]
-pmaddubsw m1, m0, [r5 + 15 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 236 * 16 + 8], m1
-; mode 16 [row 12 - second half] end
-
-; mode 12 [row 6]
-pslldq m2, m0, 2
-pinsrb m2, [r3 + 0], 1
-pinsrb m2, [r3 + 6], 0
-pmaddubsw m1, m2, [r5 + 29 * 16]
-pmulhrsw m1, m3
-movu m0, [r4 + 7]
-psrldq m4, m0, 1
-punpcklbw m0, m4
-pmaddubsw m4, m0, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 166 * 16], m1
-
-; mode 12 [row 7]
-pmaddubsw m1, m2, [r5 + 24 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 167 * 16], m1
-
-; mode 12 [row 8]
-pmaddubsw m1, m2, [r5 + 19 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 168 * 16], m1
-
-; mode 12 [row 9]
-pmaddubsw m1, m2, [r5 + 14 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 169 * 16], m1
-
-; mode 12 [row 10]
-pmaddubsw m1, m2, [r5 + 9 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 170 * 16], m1
-
-; mode 12 [row 11]
-pmaddubsw m1, m2, [r5 + 4 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 171 * 16], m1
-
-; mode 13 [row 3]
-pinsrb m7, m2, [r3 + 4], 0
-pmaddubsw m1, m7, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 179 * 16], m1
-
-; mode 13 [row 4]
-pmaddubsw m1, m7, [r5 + 19 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 180 * 16], m1
-
-; mode 13 [row 5]
-pmaddubsw m1, m7, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 181 * 16], m1
-
-; mode 13 [row 6]
-pmaddubsw m1, m7, [r5 + 1 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 182 * 16], m1
-
-; mode 14 [row 2]
-pinsrb m5, m7, [r3 + 2], 0
-pmaddubsw m1, m5, [r5 + 25 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 194 * 16], m1
-
-; mode 14 [row 3]
-pmaddubsw m1, m5, [r5 + 12 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 195 * 16], m1
-
-; mode 15 [row 1]
-pmaddubsw m1, m5, [r5 + 30 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 209 * 16], m1
-
-; mode 15 [row 2]
-pmaddubsw m1, m5, [r5 + 13 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 13 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 210 * 16], m1
-
-; mode 16 [row 1]
-pmaddubsw m1, m5, [r5 + 22 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 225 * 16], m1
-
-; mode 16 [row 2]
-pmaddubsw m1, m5, [r5 + 1 * 16]
-pmulhrsw m1, m3
-pmaddubsw m4, m0, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m1, m4
-movu [r0 + 226 * 16], m1
-
-; mode 16 [row 13 - second half]
-pmaddubsw m1, m5, [r5 + 26 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 237 * 16 + 8], m1
-; mode 16 [row 13 - second half]
-
-; mode 16 [row 14 - second half]
-pmaddubsw m1, m5, [r5 + 5 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 238 * 16 + 8], m1
-; mode 16 [row 14 - second half]
-
-; mode 16 [row 3]
-pslldq m6, m5, 2
-pinsrb m6, [r3 + 2], 1
-pinsrb m6, [r3 + 3], 0
-pmaddubsw m1, m6, [r5 + 12 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 227 * 16], m1
-
-; mode 16 [row 15 - second half]
-pmaddubsw m1, m6, [r5 + 16 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 239 * 16 + 8], m1
-; mode 16 [row 15 - second half] end
-
-; mode 16 [row 4- first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 3], 1
-pinsrb m6, [r3 + 5], 0
-pmaddubsw m1, m6, [r5 + 23 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 228 * 16], m1
-
-; mode 16 [row 5- first half]
-pmaddubsw m1, m6, [r5 + 2 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 229 * 16], m1
-
-; mode 16 [row 6- first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 5], 1
-pinsrb m6, [r3 + 6], 0
-pmaddubsw m1, m6, [r5 + 13 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 230 * 16], m1
-
-; mode 16 [row 7- first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 6], 1
-pinsrb m6, [r3 + 8], 0
-pmaddubsw m1, m6, [r5 + 24 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 231 * 16], m1
-
-; mode 16 [row 8- first half]
-pmaddubsw m1, m6, [r5 + 3 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 232 * 16], m1
-; mode 19 [row 0 - second half] end
-
-; mode 16 [row 9- first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 8], 1
-pinsrb m6, [r3 + 9], 0
-pmaddubsw m1, m6, [r5 + 14 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 233 * 16], m1
-
-; mode 16 [row 10 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 9], 1
-pinsrb m6, [r3 + 11], 0
-pmaddubsw m1, m6, [r5 + 25 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 234 * 16], m1
-
-; mode 16 [row 11 - first half]
-pmaddubsw m1, m6, [r5 + 4 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 235 * 16], m1
-
-; mode 16 [row 12 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 11], 1
-pinsrb m6, [r3 + 12], 0
-pmaddubsw m1, m6, [r5 + 15 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 236 * 16], m1
-
-; mode 16 [row 13 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 12], 1
-pinsrb m6, [r3 + 14], 0
-pmaddubsw m1, m6, [r5 + 26 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 237 * 16], m1
-
-; mode 16 [row 14 - first half]
-pmaddubsw m1, m6, [r5 + 5 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 238 * 16], m1
-
-; mode 16 [row 15 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 14], 1
-pinsrb m6, [r3 + 15], 0
-pmaddubsw m1, m6, [r5 + 16 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 239 * 16], m1
-
-; mode 14 [row 4]
-pslldq m5, 2
-pinsrb m5, [r3 + 2], 1
-pinsrb m5, [r3 + 5], 0
-movu m4, [r4 + 6]
-psrldq m0, m4, 1
-punpcklbw m4, m0
-
-; mode 16 [row 3 - second half]
-pmaddubsw m1, m4, [r5 + 12 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 227 * 16 + 8], m1
-
-; mode 16 [row 3 - second half] end
-pmaddubsw m1, m5, [r5 + 31 * 16]
-pmulhrsw m1, m3
-pmaddubsw m0, m4, [r5 + 31 * 16]
-pmulhrsw m0, m3
-packuswb m1, m0
-movu [r0 + 196 * 16], m1
-
-; mode 14 [row 5]
-pmaddubsw m1, m5, [r5 + 18 * 16]
-pmulhrsw m1, m3
-pmaddubsw m0, m4, [r5 + 18 * 16]
-pmulhrsw m0, m3
-packuswb m1, m0
-movu [r0 + 197 * 16], m1
-
-; mode 14 [row 6]
-pmaddubsw m1, m5, [r5 + 5 * 16]
-pmulhrsw m1, m3
-pmaddubsw m0, m4, [r5 + 5 * 16]
-pmulhrsw m0, m3
-packuswb m1, m0
-movu [r0 + 198 * 16], m1
-
-; mode 15 [row 3]
-movu m6, m5
-pinsrb m6, [r3 + 4], 0
-pmaddubsw m1, m6, [r5 + 28 * 16]
-pmulhrsw m1, m3
-pmaddubsw m0, m4, [r5 + 28 * 16]
-pmulhrsw m0, m3
-packuswb m1, m0
-movu [r0 + 211 * 16], m1
-
-; mode 15 [row 4]
-pmaddubsw m1, m6, [r5 + 11 * 16]
-pmulhrsw m1, m3
-pmaddubsw m0, m4, [r5 + 11 * 16]
-pmulhrsw m0, m3
-packuswb m1, m0
-movu [r0 + 212 * 16], m1
-
-; mode 15 [row 5 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 4], 1
-pinsrb m6, [r3 + 6], 0
-pmaddubsw m1, m6, [r5 + 26 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 213 * 16], m1
-
-; mode 15 [row 6 - first half]
-pmaddubsw m1, m6, [r5 + 9 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 214 * 16], m1
-
-; mode 15 [row 7 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 6], 1
-pinsrb m6, [r3 + 8], 0
-pmaddubsw m1, m6, [r5 + 24 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 215 * 16], m1
-
-; mode 15 [row 8 - first half]
-pmaddubsw m1, m6, [r5 + 7 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 216 * 16], m1
-
-; mode 15 [row 9 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 8], 1
-pinsrb m6, [r3 + 9], 0
-pmaddubsw m1, m6, [r5 + 22 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 217 * 16], m1
-
-; mode 15 [row 10 - first half]
-pmaddubsw m1, m6, [r5 + 5 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 218 * 16], m1
-
-; mode 15 [row 11 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 9], 1
-pinsrb m6, [r3 + 11], 0
-pmaddubsw m1, m6, [r5 + 20 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 219 * 16], m1
-
-; mode 15 [row 12 - first half]
-pmaddubsw m1, m6, [r5 + 3 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 220 * 16], m1
-
-; mode 15 [row 13 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 11], 1
-pinsrb m6, [r3 + 13], 0
-pmaddubsw m1, m6, [r5 + 18 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 221 * 16], m1
-
-; mode 15 [row 14 - first half]
-pmaddubsw m1, m6, [r5 + 1 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 222 * 16], m1
-
-; mode 15 [row 15 - first half]
-pslldq m6, 2
-pinsrb m6, [r3 + 13], 1
-pinsrb m6, [r3 + 15], 0
-pmaddubsw m1, m6, [r5 + 16 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 223 * 16], m1
-
-; mode 14 [row 7]
-pslldq m5, 2
-pinsrb m5, [r3 + 5], 1
-pinsrb m5, [r3 + 7], 0
-movu m0, [r4 + 5]
-psrldq m6, m0, 1
-punpcklbw m0, m6
-
-; mode 15 [row 5 - second half]
-pmaddubsw m1, m0, [r5 + 26 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 213 * 16 + 8], m1
-; mode 15 [row 5 - second half] end
-
-; mode 15 [row 6 - second half]
-pmaddubsw m1, m0, [r5 + 9 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 214 * 16 + 8], m1
-; mode 15 [row 6 - second half] end
-
-; mode 16 [row 4 - second half]
-pmaddubsw m1, m0, [r5 + 23 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 228 * 16 + 8], m1
-; mode 16 [row 4 - second half] end
-
-; mode 16 [row 5 - second half]
-pmaddubsw m1, m0, [r5 + 2 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 229 * 16 + 8], m1
-
-; mode 16 [row 5 - second half] end
-pmaddubsw m1, m5, [r5 + 24 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 24 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 199 * 16], m1
-
-; mode 14 [row 8]
-pmaddubsw m1, m5, [r5 + 11 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 11 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 200 * 16], m1
-
-; mode 14 [row 9]
-pslldq m5, 2
-pinsrb m5, [r3 + 7], 1
-pinsrb m5, [r3 + 10], 0
-movu m0, [r4 + 4]
-psrldq m6, m0, 1
-punpcklbw m0, m6
-
-; mode 15 [row 7 - second half]
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 215 * 16 + 8], m1
-; mode 15 [row 7 - second half] end
-
-; mode 15 [row 8 - second half]
-pmaddubsw m1, m0, [r5 + 7 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 216 * 16 + 8], m1
-; mode 15 [row 8 - second half] end
-
-; mode 16 [row 6 - second half]
-pmaddubsw m1, m0, [r5 + 13 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 230 * 16 + 8], m1
-; mode 16 [row 6 - second half] end
-
-; mode 15 [row 6 - second half] end
-pmaddubsw m1, m5, [r5 + 30 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 30 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 201 * 16], m1
-
-; mode 14 [row 10]
-pmaddubsw m1, m5, [r5 + 17 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 17 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 202 * 16], m1
-
-; mode 14 [row 11]
-pmaddubsw m1, m5, [r5 + 4 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 4 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 203 * 16], m1
-
-; mode 14 [row 12]
-pslldq m5, 2
-pinsrb m5, [r3 + 10], 1
-pinsrb m5, [r3 + 12], 0
-movu m0, [r4 + 3]
-psrldq m6, m0, 1
-punpcklbw m0, m6
-
-; mode 15 [row 9 - second half]
-pmaddubsw m1, m0, [r5 + 22 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 217 * 16 + 8], m1
-; mode 15 [row 9 - second half] end
-
-; mode 15 [row 10 - second half]
-pmaddubsw m1, m0, [r5 + 5 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 218 * 16 + 8], m1
-; mode 15 [row 10 - second half] end
-
-; mode 16 [row 7 - second half]
-pmaddubsw m1, m0, [r5 + 24 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 231 * 16 + 8], m1
-; mode 16 [row 7 - second half] end
-
-; mode 16 [row 8 - second half]
-pmaddubsw m1, m0, [r5 + 3 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 232 * 16 + 8], m1
-; mode 16 [row 8 - second half] end
-
-pmaddubsw m1, m5, [r5 + 23 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 23 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 204 * 16], m1
-
-; mode 14 [row 13]
-pmaddubsw m1, m5, [r5 + 10 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 10 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 205 * 16], m1
-
-; mode 14 [row 14]
-pslldq m5, 2
-pinsrb m5, [r3 + 12], 1
-pinsrb m5, [r3 + 15], 0
-movu m0, [r4 + 2]
-psrldq m6, m0, 1
-punpcklbw m0, m6
-
-; mode 15 [row 11 - second half]
-pmaddubsw m1, m0, [r5 + 20 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 219 * 16 + 8], m1
-; mode 15 [row 11 - second half] end
-
-; mode 15 [row 12 - second half]
-pmaddubsw m1, m0, [r5 + 3 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 220 * 16 + 8], m1
-; mode 15 [row 12 - second half] end
-
-; mode 16 [row 9 - second half]
-pmaddubsw m1, m0, [r5 + 14 * 16]
-pmulhrsw m1, m3
-packuswb m1, m1
-movh [r0 + 233 * 16 + 8], m1
-
-; mode 16 [row 9 - second half] end
-pmaddubsw m1, m5, [r5 + 29 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 29 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 206 * 16], m1
-
-; mode 14 [row 15]
-pmaddubsw m1, m5, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m6, m0, [r5 + 16 * 16]
-pmulhrsw m6, m3
-packuswb m1, m6
-movu [r0 + 207 * 16], m1
-
-; mode 12 [row 12]
-pslldq m0, m2, 2
-pinsrb m0, [r3 + 6], 1
-pinsrb m0, [r3 + 13], 0
-pmaddubsw m1, m0, [r5 + 31 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 31 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 172 * 16], m1
-
-; mode 12 [row 13]
-pmaddubsw m1, m0, [r5 + 26 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 173 * 16], m1
-
-; mode 12 [row 14]
-pmaddubsw m1, m0, [r5 + 21 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 174 * 16], m1
-
-; mode 12 [row 15]
-pmaddubsw m1, m0, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 175 * 16], m1
-
-; mode 13 [row 7]
-pslldq m7, 2
-pinsrb m7, [r3 + 4], 1
-pinsrb m7, [r3 + 7], 0
-pmaddubsw m1, m7, [r5 + 24 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 183 * 16], m1
-
-; mode 13 [row 8]
-pmaddubsw m1, m7, [r5 + 15 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 184 * 16], m1
-
-; mode 13 [row 9]
-pmaddubsw m1, m7, [r5 + 6 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 185 * 16], m1
-
-; mode 13 [row 10]
-pslldq m7, 2
-pinsrb m7, [r3 + 7], 1
-pinsrb m7, [r3 + 11], 0
-pmaddubsw m1, m7, [r5 + 29 * 16]
-pmulhrsw m1, m3
-movu m4, [r4 + 5]
-psrldq m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, [r5 + 29 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 186 * 16], m1
-
-; mode 13 [row 11]
-pmaddubsw m1, m7, [r5 + 20 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 187 * 16], m1
-
-; mode 13 [row 12]
-pmaddubsw m1, m7, [r5 + 11 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 11 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 188 * 16], m1
-
-; mode 13 [row 13]
-pmaddubsw m1, m7, [r5 + 2 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 189 * 16], m1
-
-; mode 13 [row 14]
-pslldq m7, 2
-pinsrb m7, [r3 + 11], 1
-pinsrb m7, [r3 + 14], 0
-pmaddubsw m1, m7, [r5 + 25 * 16]
-pmulhrsw m1, m3
-movu m4, [r4 + 4]
-psrldq m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 190 * 16], m1
-
-; mode 13 [row 15]
-pmaddubsw m1, m7, [r5 + 16 * 16]
-pmulhrsw m1, m3
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m1, m5
-movu [r0 + 191 * 16], m1
-
-; mode 17 [row 15]
-movu m0, [r3]
-pshufb m1, m0, [tab_S1]
-movu [r0 + 255 * 16], m1
-movu m2, [r4]
-movd [r0 + 255 * 16 + 12], m2
-
-; mode 18 [row 0]
-movu [r0 + 256 * 16], m0
-
-; mode 18 [row 1]
-pslldq m4, m0, 1
-pinsrb m4, [r4 + 1], 0
-movu [r0 + 257 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 2], 0
-movu [r0 + 258 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 3], 0
-movu [r0 + 259 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 4], 0
-movu [r0 + 260 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 5], 0
-movu [r0 + 261 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 6], 0
-movu [r0 + 262 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 7], 0
-movu [r0 + 263 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 8], 0
-movu [r0 + 264 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 9], 0
-movu [r0 + 265 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 10], 0
-movu [r0 + 266 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 11], 0
-movu [r0 + 267 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 12], 0
-movu [r0 + 268 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 13], 0
-movu [r0 + 269 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 14], 0
-movu [r0 + 270 * 16], m4
-pslldq m4, 1
-pinsrb m4, [r4 + 15], 0
-movu [r0 + 271 * 16], m4
-
-; mode 19 [row 0]
-psrldq m2, m0, 1
-punpcklbw m0, m2
-movu m5, [r3 + 8]
-psrldq m6, m5, 1
-punpcklbw m5, m6
-pmaddubsw m4, m0, [r5 + 6 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 6 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 272 * 16], m4
-
-; mode 20 [row 0]
-pmaddubsw m4, m0, [r5 + 11 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 11 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 288 * 16], m4
-
-; mode 21 [row 0]
-pmaddubsw m4, m0, [r5 + 15 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 15 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 304 * 16], m4
-
-; mode 22 [row 0]
-pmaddubsw m4, m0, [r5 + 19 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 19 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 320 * 16], m4
-
-; mode 22 [row 1]
-pmaddubsw m4, m0, [r5 + 6 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 6 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 321 * 16], m4
-
-; mode 23 [row 0]
-pmaddubsw m4, m0, [r5 + 23 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 23 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 336 * 16], m4
-
-; mode 23 [row 1]
-pmaddubsw m4, m0, [r5 + 14 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 14 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 337 * 16], m4
-
-; mode 23 [row 2]
-pmaddubsw m4, m0, [r5 + 5 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 5 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 338 * 16], m4
-
-; mode 24 [row 0]
-pmaddubsw m4, m0, [r5 + 27 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 27 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 352 * 16], m4
-
-; mode 24 [row 1]
-pmaddubsw m4, m0, [r5 + 22 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 22 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 353 * 16], m4
-
-; mode 24 [row 2]
-pmaddubsw m4, m0, [r5 + 17 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 17 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 354 * 16], m4
-
-; mode 24 [row 3]
-pmaddubsw m4, m0, [r5 + 12 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 12 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 355 * 16], m4
-
-; mode 24 [row 4]
-pmaddubsw m4, m0, [r5 + 7 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 7 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 356 * 16], m4
-
-; mode 24 [row 5]
-pmaddubsw m4, m0, [r5 + 2 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 2 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 357 * 16], m4
-
-; mode 24 [row 6 - first half]
-pslldq m7, m0, 2
-pinsrb m7, [r4 + 0], 1
-pinsrb m7, [r4 + 6], 0
-pmaddubsw m4, m7, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 358 * 16], m4
-
-; mode 24 [row 7 - first half]
-pmaddubsw m4, m7, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 359 * 16], m4
-
-; mode 24 [row 8 - first half]
-pmaddubsw m4, m7, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 360 * 16], m4
-
-; mode 24 [row 9 - first half]
-pmaddubsw m4, m7, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 361 * 16], m4
-
-; mode 24 [row 10 - first half]
-pmaddubsw m4, m7, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 362 * 16], m4
-
-; mode 24 [row 11 - first half]
-pmaddubsw m4, m7, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 363 * 16], m4
-
-; mode 24 [row 12 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 6], 1
-pinsrb m7, [r4 + 13], 0
-pmaddubsw m4, m7, [r5 + 31 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 364 * 16], m4
-
-; mode 24 [row 13 - first half]
-pmaddubsw m4, m7, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 365 * 16], m4
-
-; mode 24 [row 14 - first half]
-pmaddubsw m4, m7, [r5 + 21 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 366 * 16], m4
-
-; mode 24 [row 15 - first half]
-pmaddubsw m4, m7, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 367 * 16], m4
-
-; mode 23 [row 3 - first half]
-pslldq m7, m0, 2
-pinsrb m7, [r4 + 0], 1
-pinsrb m7, [r4 + 4], 0
-pmaddubsw m4, m7, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 339 * 16], m4
-
-; mode 23 [row 4 - first half]
-pmaddubsw m4, m7, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 340 * 16], m4
-
-; mode 23 [row 5 - first half]
-pmaddubsw m4, m7, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 341 * 16], m4
-
-; mode 23 [row 6 - first half]
-pmaddubsw m4, m7, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 342 * 16], m4
-
-; mode 23 [row 7 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 4], 1
-pinsrb m7, [r4 + 7], 0
-pmaddubsw m4, m7, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 343 * 16], m4
-
-; mode 23 [row 8 - first half]
-pmaddubsw m4, m7, [r5 + 15 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 344 * 16], m4
-
-; mode 23 [row 9 - first half]
-pmaddubsw m4, m7, [r5 + 6 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 345 * 16], m4
-
-; mode 23 [row 10 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 7], 1
-pinsrb m7, [r4 + 11], 0
-pmaddubsw m4, m7, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 346 * 16], m4
-
-; mode 23 [row 11 - first half]
-pmaddubsw m4, m7, [r5 + 20 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 347 * 16], m4
-
-; mode 23 [row 12 - first half]
-pmaddubsw m4, m7, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 348 * 16], m4
-
-; mode 23 [row 13 - first half]
-pmaddubsw m4, m7, [r5 + 2 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 349 * 16], m4
-
-; mode 23 [row 14 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 11], 1
-pinsrb m7, [r4 + 14], 0
-pmaddubsw m4, m7, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 350 * 16], m4
-
-; mode 23 [row 15 - first half]
-pmaddubsw m4, m7, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 351 * 16], m4
-
-; mode 21 [row 15 - first half]
-pmaddubsw m4, m0, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 319 * 16 + 8], m4
-; mode 21 [row 15 - second half] end
-
-; mode 20 [row 1 - first half]
-pslldq m7, m0, 2
-pinsrb m7, [r4 + 0], 1
-pinsrb m7, [r4 + 2], 0
-pmaddubsw m4, m7, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 289 * 16], m4
-
-; mode 20 [row 2 - first half]
-pmaddubsw m4, m7, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 290 * 16], m4
-
-; mode 21 [row 1 - first half]
-pmaddubsw m4, m7, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 305 * 16], m4
-
-; mode 21 [row 2 - first half]
-pmaddubsw m4, m7, [r5 + 13 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 306 * 16], m4
-
-; mode 22 [row 2 - first half]
-pmaddubsw m4, m7, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 322 * 16], m4
-
-; mode 22 [row 3 - first half]
-pmaddubsw m4, m7, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 323 * 16], m4
-
-; mode 22 [row 4 - first half]
-pslldq m1, m7, 2
-pinsrb m1, [r4 + 2], 1
-pinsrb m1, [r4 + 5], 0
-pmaddubsw m4, m1, [r5 + 31 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 324 * 16], m4
-
-; mode 22 [row 5 - first half]
-pmaddubsw m4, m1, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 325 * 16], m4
-
-; mode 22 [row 6 - first half]
-pmaddubsw m4, m1, [r5 + 5 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 326 * 16], m4
-
-; mode 22 [row 7 - first half]
-pslldq m1, 2
-pinsrb m1, [r4 + 5], 1
-pinsrb m1, [r4 + 7], 0
-pmaddubsw m4, m1, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 327 * 16], m4
-
-; mode 22 [row 8 - first half]
-pmaddubsw m4, m1, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 328 * 16], m4
-
-; mode 22 [row 9 - first half]
-pslldq m1, 2
-pinsrb m1, [r4 + 7], 1
-pinsrb m1, [r4 + 10], 0
-pmaddubsw m4, m1, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 329 * 16], m4
-
-; mode 22 [row 10 - first half]
-pmaddubsw m4, m1, [r5 + 17 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 330 * 16], m4
-
-; mode 22 [row 11 - first half]
-pmaddubsw m4, m1, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 331 * 16], m4
-
-; mode 22 [row 12 - first half]
-pslldq m1, 2
-pinsrb m1, [r4 + 10], 1
-pinsrb m1, [r4 + 12], 0
-pmaddubsw m4, m1, [r5 + 23 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 332 * 16], m4
-
-; mode 22 [row 13 - first half]
-pmaddubsw m4, m1, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 333 * 16], m4
-
-; mode 22 [row 14 - first half]
-pslldq m1, 2
-pinsrb m1, [r4 + 12], 1
-pinsrb m1, [r4 + 15], 0
-pmaddubsw m4, m1, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 334 * 16], m4
-
-; mode 22 [row 15 - first half]
-pmaddubsw m4, m1, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 335 * 16], m4
-
-; mode 21 [row 3 - first half]
-pslldq m6, m7, 2
-pinsrb m6, [r4 + 2], 1
-pinsrb m6, [r4 + 4], 0
-pmaddubsw m4, m6, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 307 * 16], m4
-
-; mode 21 [row 4 - first half]
-pmaddubsw m4, m6, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 308 * 16], m4
-
-; mode 21 [row 5 - first half]
-pslldq m6, 2
-pinsrb m6, [r4 + 4], 1
-pinsrb m6, [r4 + 6], 0
-pmaddubsw m4, m6, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 309 * 16], m4
-
-; mode 21 [row 6 - first half]
-pmaddubsw m4, m6, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 310 * 16], m4
-
-; mode 21 [row 7 - first half]
-pslldq m6, 2
-pinsrb m6, [r4 + 6], 1
-pinsrb m6, [r4 + 8], 0
-pmaddubsw m4, m6, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 311 * 16], m4
-
-; mode 21 [row 8 - first half]
-pmaddubsw m4, m6, [r5 + 7 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 312 * 16], m4
-
-; mode 21 [row 9 - first half]
-pslldq m6, 2
-pinsrb m6, [r4 + 8], 1
-pinsrb m6, [r4 + 9], 0
-pmaddubsw m4, m6, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 313 * 16], m4
-
-; mode 21 [row 10 - first half]
-pmaddubsw m4, m6, [r5 + 5 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 314 * 16], m4
-
-; mode 21 [row 11 - first half]
-pslldq m6, 2
-pinsrb m6, [r4 + 9], 1
-pinsrb m6, [r4 + 11], 0
-pmaddubsw m4, m6, [r5 + 20 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 315 * 16], m4
-
-; mode 21 [row 12 - first half]
-pmaddubsw m4, m6, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 316 * 16], m4
-
-; mode 21 [row 13 - first half]
-pslldq m6, 2
-pinsrb m6, [r4 + 11], 1
-pinsrb m6, [r4 + 13], 0
-pmaddubsw m4, m6, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 317 * 16], m4
-
-; mode 21 [row 14 - first half]
-pmaddubsw m4, m6, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 318 * 16], m4
-
-; mode 21 [row 15 - first half]
-pslldq m6, 2
-pinsrb m6, [r4 + 13], 1
-pinsrb m6, [r4 + 15], 0
-pmaddubsw m4, m6, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 319 * 16], m4
-
-; mode 20 [row 13 - second half]
-pmaddubsw m4, m7, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 301 * 16 + 8], m4
-; mode 20 [row 13 - second half]
-
-; mode 20 [row 14 - second half]
-pmaddubsw m4, m7, [r5 + 5 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 302 * 16 + 8], m4
-; mode 20 [row 14 - second half]
-
-; mode 20 [row 3 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 2], 1
-pinsrb m7, [r4 + 3], 0
-pmaddubsw m4, m7, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 291 * 16], m4
-
-; mode 20 [row 15 - second half]
-pmaddubsw m4, m7, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 303 * 16 + 8], m4
-; mode 20 [row 15 - second half]
-
-; mode 20 [row 4 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 3], 1
-pinsrb m7, [r4 + 5], 0
-pmaddubsw m4, m7, [r5 + 23 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 292 * 16], m4
-
-; mode 20 [row 5 - first half]
-pmaddubsw m4, m7, [r5 + 2 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 293 * 16], m4
-
-; mode 20 [row 6 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 5], 1
-pinsrb m7, [r4 + 6], 0
-pmaddubsw m4, m7, [r5 + 13 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 294 * 16], m4
-
-; mode 20 [row 7 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 6], 1
-pinsrb m7, [r4 + 8], 0
-pmaddubsw m4, m7, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 295 * 16], m4
-
-; mode 20 [row 8 - first half]
-pmaddubsw m4, m7, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 296 * 16], m4
-
-; mode 20 [row 9 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 8], 1
-pinsrb m7, [r4 + 9], 0
-pmaddubsw m4, m7, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 297 * 16], m4
-
-; mode 20 [row 10 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 9], 1
-pinsrb m7, [r4 + 11], 0
-pmaddubsw m4, m7, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 298 * 16], m4
-
-; mode 20 [row 11 - first half]
-pmaddubsw m4, m7, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 299 * 16], m4
-
-; mode 20 [row 12 - first half]
-movu m1, [r5 + 15 * 16]
-pslldq m7, 2
-pinsrb m7, [r4 + 11], 1
-pinsrb m7, [r4 + 12], 0
-pmaddubsw m4, m7, [r5 + 15 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 300 * 16], m4
-
-; mode 20 [row 13 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 12], 1
-pinsrb m7, [r4 + 14], 0
-pmaddubsw m4, m7, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 301 * 16], m4
-
-; mode 20 [row 14 - first half]
-pmaddubsw m4, m7, [r5 + 5 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 302 * 16], m4
-
-; mode 20 [row 15 - first half]
-pslldq m7, 2
-pinsrb m7, [r4 + 14], 1
-pinsrb m7, [r4 + 15], 0
-pmaddubsw m4, m7, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 303 * 16], m4
-
-; mode 19 [row 1]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 1], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 8], 1
-pinsrb m5, [r3 + 7], 0
-
-; mode 20 [row 1 - second half]
-pmaddubsw m4, m5, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 289 * 16 + 8], m4
-; mode 20 [row 1 - second half] end
-
-; mode 20 [row 2 - second half]
-pmaddubsw m4, m5, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 290 * 16 + 8], m4
-; mode 20 [row 2 - second half] end
-
-; mode 21 [row 2 - second half]
-pmaddubsw m4, m5, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 305 * 16 + 8], m4
-; mode 21 [row 2 - second half] end
-
-; mode 21 [row 3 - second half]
-pmaddubsw m4, m5, [r5 + 13 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 306 * 16 + 8], m4
-; mode 21 [row 3 - second half] end
-
-; mode 21 [row 4 - second half]
-pmaddubsw m4, m5, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 307 * 16 + 8], m4
-; mode 21 [row 4 - second half] end
-
-; mode 22 [row 2 - second half]
-pmaddubsw m4, m5, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 322 * 16 + 8], m4
-; mode 22 [row 2 - second half] end
-
-; mode 22 [row 3 - second half]
-pmaddubsw m4, m5, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 323 * 16 + 8], m4
-; mode 22 [row 3 - second half] end
-
-; mode 23 [row 3 - second half]
-pmaddubsw m4, m5, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 339 * 16 + 8], m4
-; mode 23 [row 3 - second half] end
-
-; mode 23 [row 4 - second half]
-pmaddubsw m4, m5, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 340 * 16 + 8], m4
-; mode 23 [row 4 - second half] end
-
-; mode 23 [row 5 - second half]
-pmaddubsw m4, m5, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 341 * 16 + 8], m4
-; mode 23 [row 5 - second half] end
-
-; mode 23 [row 6 - second half]
-pmaddubsw m4, m5, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 342 * 16 + 8], m4
-; mode 23 [row 6 - second half] end
-
-; mode 24 [row 6 - second half]
-pmaddubsw m4, m5, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 358 * 16 + 8], m4
-; mode 24 [row 6 - second half] end
-
-; mode 24 [row 7 - second half]
-pmaddubsw m4, m5, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 359 * 16 + 8], m4
-; mode 24 [row 7 - second half] end
-
-; mode 24 [row 8 - second half]
-pmaddubsw m4, m5, [r5 + 19 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 360 * 16 + 8], m4
-; mode 24 [row 8 - second half] end
-
-; mode 24 [row 9 - second half]
-pmaddubsw m4, m5, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 361 * 16 + 8], m4
-; mode 24 [row 9 - second half] end
-
-; mode 24 [row 10 - second half]
-pmaddubsw m4, m5, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 362 * 16 + 8], m4
-; mode 24 [row 10 - second half] end
-
-; mode 24 [row 11 - second half]
-pmaddubsw m4, m5, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 363 * 16 + 8], m4
-; mode 24 [row 11 - second half] end
-
-pmaddubsw m4, m0, [r5 + 12 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 12 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 273 * 16], m4
-
-; mode 19 [row 2]
-pslldq m0, 2
-pinsrb m0, [r4 + 1], 1
-pinsrb m0, [r4 + 2], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 7], 1
-pinsrb m5, [r3 + 6], 0
-
-; mode 20 [row 3 - second half]
-pmaddubsw m4, m5, [r5 + 12 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 291 * 16 + 8], m4
-; mode 20 [row 3 - second half] end
-
-; mode 21 [row 3 - second half]
-pmaddubsw m4, m5, [r5 + 28 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 307 * 16 + 8], m4
-; mode 21 [row 3 - second half] end
-
-; mode 21 [row 4 - second half]
-pmaddubsw m4, m5, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 308 * 16 + 8], m4
-; mode 21 [row 4 - second half] end
-
-; mode 22 [row 4 - second half]
-pmaddubsw m4, m5, [r5 + 31 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 324 * 16 + 8], m4
-; mode 22 [row 4 - second half] end
-
-; mode 22 [row 5 - second half]
-pmaddubsw m4, m5, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 325 * 16 + 8], m4
-; mode 22 [row 5 - second half] end
-
-; mode 22 [row 6 - second half]
-pmaddubsw m4, m5, [r5 + 5 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 326 * 16 + 8], m4
-; mode 22 [row 6 - second half] end
-
-; mode 23 [row 7 - second half]
-pmaddubsw m4, m5, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 343 * 16 + 8], m4
-; mode 23 [row 7 - second half] end
-
-; mode 23 [row 8 - second half]
-pmaddubsw m4, m5, [r5 + 15 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 344 * 16 + 8], m4
-; mode 23 [row 8 - second half] end
-
-; mode 23 [row 9 - second half]
-pmaddubsw m4, m5, [r5 + 6 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 345 * 16 + 8], m4
-; mode 23 [row 9 - second half] end
-
-; mode 24 [row 12 - second half]
-pmaddubsw m4, m5, [r5 + 31 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 364 * 16 + 8], m4
-; mode 24 [row 12 - second half] end
-
-; mode 24 [row 13 - second half]
-pmaddubsw m4, m5, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 365 * 16 + 8], m4
-; mode 24 [row 13 - second half] end
-
-; mode 24 [row 14 - second half]
-pmaddubsw m4, m5, [r5 + 21 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 366 * 16 + 8], m4
-; mode 24 [row 14 - second half] end
-
-; mode 24 [row 15 - second half]
-pmaddubsw m4, m5, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 367 * 16 + 8], m4
-; mode 24 [row 15 - second half] end
-
-pmaddubsw m4, m0, [r5 + 18 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 18 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 274 * 16], m4
-
-; mode 19 [row 3]
-pslldq m0, 2
-pinsrb m0, [r4 + 2], 1
-pinsrb m0, [r4 + 4], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 6], 1
-pinsrb m5, [r3 + 5], 0
-
-; mode 20 [row 4 - second half]
-pmaddubsw m4, m5, [r5 + 23 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 292 * 16 + 8], m4
-; mode 20 [row 4 - second half] end
-
-; mode 20 [row 5 - second half]
-pmaddubsw m4, m5, [r5 + 2 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 293 * 16 + 8], m4
-; mode 20 [row 5 - second half] end
-
-; mode 21 [row 5 - second half]
-pmaddubsw m4, m5, [r5 + 26 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 309 * 16 + 8], m4
-; mode 21 [row 5 - second half] end
-
-; mode 21 [row 6 - second half]
-pmaddubsw m4, m5, [r5 + 9 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 310 * 16 + 8], m4
-; mode 21 [row 6 - second half] end
-
-; mode 22 [row 7 - second half]
-pmaddubsw m4, m5, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 327 * 16 + 8], m4
-; mode 22 [row 7 - second half] end
-
-; mode 22 [row 8 - second half]
-pmaddubsw m4, m5, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 328 * 16 + 8], m4
-; mode 22 [row 7 - second half] end
-
-; mode 23 [row 10 - second half]
-pmaddubsw m4, m5, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 346 * 16 + 8], m4
-; mode 23 [row 10 - second half] end
-
-; mode 23 [row 11 - second half]
-pmaddubsw m4, m5, [r5 + 20 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 347 * 16 + 8], m4
-; mode 23 [row 11 - second half] end
-
-; mode 23 [row 12 - second half]
-pmaddubsw m4, m5, [r5 + 11 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 348 * 16 + 8], m4
-; mode 23 [row 12 - second half] end
-
-; mode 23 [row 13 - second half]
-pmaddubsw m4, m5, [r5 + 2 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 349 * 16 + 8], m4
-; mode 23 [row 13 - second half] end
-
-pmaddubsw m4, m0, [r5 + 24 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 24 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 275 * 16], m4
-
-; mode 19 [row 4]
-pslldq m0, 2
-pinsrb m0, [r4 + 4], 1
-pinsrb m0, [r4 + 5], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 5], 1
-pinsrb m5, [r3 + 4], 0
-
-; mode 20 [row 6 - second half]
-pmaddubsw m4, m5, [r5 + 13 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 294 * 16 + 8], m4
-; mode 20 [row 6 - second half] end
-
-; mode 21 [row 7 - second half]
-pmaddubsw m4, m5, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 311 * 16 + 8], m4
-; mode 21 [row 7 - second half] end
-
-; mode 21 [row 8 - second half]
-pmaddubsw m4, m5, [r5 + 7 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 312 * 16 + 8], m4
-; mode 21 [row 8 - second half] end
-
-; mode 22 [row 9 - second half]
-pmaddubsw m4, m5, [r5 + 30 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 329 * 16 + 8], m4
-; mode 22 [row 9 - second half] end
-
-; mode 22 [row 10 - second half]
-pmaddubsw m4, m5, [r5 + 17 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 330 * 16 + 8], m4
-; mode 22 [row 10 - second half] end
-
-; mode 22 [row 11 - second half]
-pmaddubsw m4, m5, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 331 * 16 + 8], m4
-; mode 22 [row 11 - second half] end
-
-; mode 23 [row 14 - second half]
-pmaddubsw m4, m5, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 350 * 16 + 8], m4
-; mode 23 [row 14 - second half] end
-
-; mode 23 [row 15 - second half]
-pmaddubsw m4, m5, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 351 * 16 + 8], m4
-
-; mode 23 [row 15 - second half] end
-pmaddubsw m4, m0, [r5 + 30 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 30 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 276 * 16], m4
-
-; mode 19 [row 5]
-pmaddubsw m4, m0, [r5 + 4 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 4 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 277 * 16], m4
-
-; mode 19 [row 6]
-pslldq m0, 2
-pinsrb m0, [r4 + 5], 1
-pinsrb m0, [r4 + 6], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 4], 1
-pinsrb m5, [r3 + 3], 0
-
-; mode 20 [row 7 - second half]
-pmaddubsw m4, m5, [r5 + 24 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 295 * 16 + 8], m4
-; mode 20 [row 7 - second half] end
-
-; mode 20 [row 8 - second half]
-pmaddubsw m4, m5, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 296 * 16 + 8], m4
-; mode 20 [row 8 - second half] end
-
-; mode 21 [row 9 - second half]
-pmaddubsw m4, m5, [r5 + 22 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 313 * 16 + 8], m4
-; mode 21 [row 9 - second half] end
-
-; mode 21 [row 10 - second half]
-pmaddubsw m4, m5, [r5 + 5 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 314 * 16 + 8], m4
-; mode 21 [row 10 - second half] end
-
-; mode 22 [row 12 - second half]
-pmaddubsw m4, m5, [r5 + 23 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 332 * 16 + 8], m4
-; mode 22 [row 12 - second half] end
-
-; mode 22 [row 12 - second half]
-pmaddubsw m4, m5, [r5 + 10 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 333 * 16 + 8], m4
-; mode 22 [row 12 - second half] end
-
-pmaddubsw m4, m0, [r5 + 10 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 10 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 278 * 16], m4
-
-; mode 19 [row 7]
-pslldq m0, 2
-pinsrb m0, [r4 + 6], 1
-pinsrb m0, [r4 + 7], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 3], 1
-pinsrb m5, [r3 + 2], 0
-
-; mode 20 [row 9 - second half]
-pmaddubsw m4, m5, [r5 + 14 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 297 * 16 + 8], m4
-; mode 20 [row 9 - second half]
-
-; mode 21 [row 11 - second half]
-pmaddubsw m4, m5, [r5 + 20 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 315 * 16 + 8], m4
-; mode 21 [row 11 - second half] end
-
-; mode 21 [row 12 - second half]
-pmaddubsw m4, m5, [r5 + 3 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 316 * 16 + 8], m4
-; mode 21 [row 12 - second half] end
-
-; mode 22 [row 14 - second half]
-pmaddubsw m4, m5, [r5 + 29 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 334 * 16 + 8], m4
-; mode 22 [row 14 - second half] end
-
-; mode 22 [row 15 - second half]
-pmaddubsw m4, m5, [r5 + 16 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 335 * 16 + 8], m4
-; mode 22 [row 15 - second half] end
-
-pmaddubsw m4, m0, [r5 + 16 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 16 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 279 * 16], m4
-
-; mode 19 [row 8]
-pslldq m0, 2
-pinsrb m0, [r4 + 7], 1
-pinsrb m0, [r4 + 9], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 2], 1
-pinsrb m5, [r3 + 1], 0
-
-; mode 20 [row 10 - second half]
-pmaddubsw m4, m5, [r5 + 25 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 298 * 16 + 8], m4
-; mode 20 [row 10 - second half] end
-
-; mode 20 [row 11 - second half]
-pmaddubsw m4, m5, [r5 + 4 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 299 * 16 + 8], m4
-; mode 20 [row 11 - second half] end
-
-; mode 21 [row 13 - second half]
-pmaddubsw m4, m5, [r5 + 18 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 317 * 16 + 8], m4
-; mode 21 [row 13 - second half] end
-
-; mode 21 [row 14 - second half]
-pmaddubsw m4, m5, [r5 + 1 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 318 * 16 + 8], m4
-; mode 21 [row 14 - second half] end
-
-pmaddubsw m4, m0, [r5 + 22 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 22 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 280 * 16], m4
-
-; mode 19 [row 9]
-pslldq m0, 2
-pinsrb m0, [r4 + 9], 1
-pinsrb m0, [r4 + 10], 0
-pslldq m5, 2
-pinsrb m5, [r3 + 1], 1
-pinsrb m5, [r3 + 0], 0
-
-; mode 20 [row 12 - second half]
-pmaddubsw m4, m5, [r5 + 15 * 16]
-pmulhrsw m4, m3
-packuswb m4, m4
-movh [r0 + 300 * 16 + 8], m4
-
-; mode 20 [row 12 - second half] end
-pmaddubsw m4, m0, [r5 + 28 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 28 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 281 * 16], m4
-
-; mode 19 [row 10]
-pmaddubsw m4, m0, [r5 + 2 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 2 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 282 * 16], m4
-
-; mode 19 [row 11]
-pslldq m0, 2
-pinsrb m0, [r4 + 10], 1
-pinsrb m0, [r4 + 11], 0
-pmaddubsw m4, m0, [r5 + 8 * 16]
-pmulhrsw m4, m3
-pslldq m5, 2
-pinsrb m5, [r4 + 0], 1
-pinsrb m5, [r4 + 1], 0
-pmaddubsw m6, m5, [r5 + 8 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 283 * 16], m4
-
-; mode 19 [row 12]
-pslldq m0, 2
-pinsrb m0, [r4 + 11], 1
-pinsrb m0, [r4 + 12], 0
-pslldq m5, 2
-pinsrb m5, [r4 + 1], 1
-pinsrb m5, [r4 + 2], 0
-pmaddubsw m4, m0, [r5 + 14 * 16]
-pmulhrsw m4, m3
-pmaddubsw m6, m5, [r5 + 14 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 284 * 16], m4
-
-; mode 19 [row 13]
-pslldq m0, 2
-pinsrb m0, [r4 + 12], 1
-pinsrb m0, [r4 + 14], 0
-pmaddubsw m4, m0, [r5 + 20 * 16]
-pmulhrsw m4, m3
-pslldq m5, 2
-pinsrb m5, [r4 + 2], 1
-pinsrb m5, [r4 + 4], 0
-pmaddubsw m6, m5, [r5 + 20 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 285 * 16], m4
-
-; mode 19 [row 14]
-pslldq m0, 2
-pinsrb m0, [r4 + 14], 1
-pinsrb m0, [r4 + 15], 0
-pmaddubsw m4, m0, [r5 + 26 * 16]
-pmulhrsw m4, m3
-pslldq m5, 2
-pinsrb m5, [r4 + 4], 1
-pinsrb m5, [r4 + 5], 0
-pmaddubsw m6, m5, [r5 + 26 * 16]
-pmulhrsw m6, m3
-packuswb m4, m6
-movu [r0 + 286 * 16], m4
-
-; mode 19 [row 15]
-movu m0, [r4]
-pshufb m0, [tab_S1]
-movu [r0 + 287 * 16], m0
-movd m1, [r3]
-movd [r0 + 287 * 16 + 12], m1
-
-; mode 25
-movu m1, [r1]
-
-; mode 26 [all rows]
-psrldq m6, m1, 1
-pinsrb m6, [r1 + 16], 15
-movu m7, m6
-movu [r0 + 384 * 16], m6
-movu [r0 + 385 * 16], m6
-movu [r0 + 386 * 16], m6
-movu [r0 + 387 * 16], m6
-movu [r0 + 388 * 16], m6
-movu [r0 + 389 * 16], m6
-movu [r0 + 390 * 16], m6
-movu [r0 + 391 * 16], m6
-movu [r0 + 392 * 16], m6
-movu [r0 + 393 * 16], m6
-movu [r0 + 394 * 16], m6
-movu [r0 + 395 * 16], m6
-movu [r0 + 396 * 16], m6
-movu [r0 + 397 * 16], m6
-movu [r0 + 398 * 16], m6
-movu [r0 + 399 * 16], m6
-
-pxor m0, m0
-pshufb m6, m6, m0
-punpcklbw m6, m0
-movu m2, [r2]
-pshufb m2, m2, m0
-punpcklbw m2, m0
-movu m4, [r2 + 1]
-punpcklbw m5, m4, m0
-punpckhbw m4, m0
-psubw m5, m2
-psubw m4, m2
-psraw m5, 1
-psraw m4, 1
-paddw m5, m6
-paddw m4, m6
-packuswb m5, m4
-
-pextrb [r0 + 384 * 16], m5, 0
-pextrb [r0 + 385 * 16], m5, 1
-pextrb [r0 + 386 * 16], m5, 2
-pextrb [r0 + 387 * 16], m5, 3
-pextrb [r0 + 388 * 16], m5, 4
-pextrb [r0 + 389 * 16], m5, 5
-pextrb [r0 + 390 * 16], m5, 6
-pextrb [r0 + 391 * 16], m5, 7
-pextrb [r0 + 392 * 16], m5, 8
-pextrb [r0 + 393 * 16], m5, 9
-pextrb [r0 + 394 * 16], m5, 10
-pextrb [r0 + 395 * 16], m5, 11
-pextrb [r0 + 396 * 16], m5, 12
-pextrb [r0 + 397 * 16], m5, 13
-pextrb [r0 + 398 * 16], m5, 14
-pextrb [r0 + 399 * 16], m5, 15
-
-; mode 25 [row 15]
-movu [r0 + 383 * 16], m1
-
-; mode 25 [row 0]
-psrldq m2, m1, 1
-punpcklbw m1, m2
-movu m2, [r1 + 8]
-psrldq m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m4, m1, [r5 + 30 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 30 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 368 * 16], m4
-
-; mode 25 [row 1]
-pmaddubsw m4, m1, [r5 + 28 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 369 * 16], m4
-
-; mode 25 [row 2]
-pmaddubsw m4, m1, [r5 + 26 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 370 * 16], m4
-
-; mode 25 [row 3]
-pmaddubsw m4, m1, [r5 + 24 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 371 * 16], m4
-
-; mode 25 [row 4]
-pmaddubsw m4, m1, [r5 + 22 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 22 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 372 * 16], m4
-
-; mode 25 [row 5]
-pmaddubsw m4, m1, [r5 + 20 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 373 * 16], m4
-
-; mode 25 [row 6]
-pmaddubsw m4, m1, [r5 + 18 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 374 * 16], m4
-
-; mode 25 [row 7]
-pmaddubsw m4, m1, [r5 + 16 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 375 * 16], m4
-
-; mode 25 [row 8]
-pmaddubsw m4, m1, [r5 + 14 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 376 * 16], m4
-
-; mode 25 [row 9]
-pmaddubsw m4, m1, [r5 + 12 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 12 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 377 * 16], m4
-
-; mode 25 [row 10]
-pmaddubsw m4, m1, [r5 + 10 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 10 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 378 * 16], m4
-
-; mode 25 [row 11]
-pmaddubsw m4, m1, [r5 + 8 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 379 * 16], m4
-
-; mode 25 [row 12]
-pmaddubsw m4, m1, [r5 + 6 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 380 * 16], m4
-
-; mode 25 [row 13]
-pmaddubsw m4, m1, [r5 + 4 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 381 * 16], m4
-
-; mode 25 [row 14]
-pmaddubsw m4, m1, [r5 + 2 * 16]
-pmulhrsw m4, m3
-pmaddubsw m5, m2, [r5 + 2 * 16]
-pmulhrsw m5, m3
-packuswb m4, m5
-movu [r0 + 382 * 16], m4
-
-; mode 27 [row 15]
-psrldq m6, m7, 1
-punpcklbw m7, m6
-pinsrb m6, [r1 + 17], 15
-movu [r0 + 415 * 16], m6
-
-; mode 27 [row 0]
-movu m4, [r1 + 9]
-psrldq m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m6, m7, [r5 + 2 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 400 * 16], m6
-
-; mode 27 [row 1]
-pmaddubsw m6, m7, [r5 + 4 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 401 * 16], m6
-
-; mode 27 [row 2]
-pmaddubsw m6, m7, [r5 + 6 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 402 * 16], m6
-
-; mode 27 [row 3]
-pmaddubsw m6, m7, [r5 + 8 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 403 * 16], m6
-
-; mode 27 [row 4]
-pmaddubsw m6, m7, [r5 + 10 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 404 * 16], m6
-
-; mode 27 [row 5]
-pmaddubsw m6, m7, [r5 + 12 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 405 * 16], m6
-
-; mode 27 [row 6]
-pmaddubsw m6, m7, [r5 + 14 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 406 * 16], m6
-
-; mode 27 [row 7]
-pmaddubsw m6, m7, [r5 + 16 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 407 * 16], m6
-
-; mode 27 [row 8]
-pmaddubsw m6, m7, [r5 + 18 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 408 * 16], m6
-
-; mode 27 [row 9]
-pmaddubsw m6, m7, [r5 + 20 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 409 * 16], m6
-
-; mode 27 [row 10]
-pmaddubsw m6, m7, [r5 + 22 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 22 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 410 * 16], m6
-
-; mode 27 [row 11]
-pmaddubsw m6, m7, [r5 + 24 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 411 * 16], m6
-
-; mode 27 [row 12]
-pmaddubsw m6, m7, [r5 + 26 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 412 * 16], m6
-
-; mode 27 [row 13]
-pmaddubsw m6, m7, [r5 + 28 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 413 * 16], m6
-
-; mode 27 [row 14]
-pmaddubsw m6, m7, [r5 + 30 * 16]
-pmulhrsw m6, m3
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, m3
-packuswb m6, m5
-movu [r0 + 414 * 16], m6
-
-; mode 28 [row 0]
-movu m1, [r3 + 1]
-psrldq m2, m1, 1
-punpcklbw m1, m2
-movu m4, [r3 + 9]
-psrldq m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m2, m1, [r5 + 5 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 5 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 416 * 16], m2
-
-; mode 28 [row 0]
-pmaddubsw m2, m1, [r5 + 5 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 5 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 416 * 16], m2
-
-; mode 28 [row 1]
-pmaddubsw m2, m1, [r5 + 10 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 417 * 16], m2
-
-; mode 28 [row 2]
-pmaddubsw m2, m1, [r5 + 15 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 418 * 16], m2
-
-; mode 28 [row 3]
-pmaddubsw m2, m1, [r5 + 20 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 419 * 16], m2
-
-; mode 28 [row 4]
-pmaddubsw m2, m1, [r5 + 25 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 420 * 16], m2
-
-; mode 28 [row 5]
-pmaddubsw m2, m1, [r5 + 30 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 421 * 16], m2
-
-; mode 29 [row 0]
-pmaddubsw m2, m1, [r5 + 9 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 9 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 432 * 16], m2
-
-; mode 29 [row 1]
-pmaddubsw m2, m1, [r5 + 18 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 433 * 16], m2
-
-; mode 29 [row 2]
-pmaddubsw m2, m1, [r5 + 27 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 27 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 434 * 16], m2
-
-; mode 30 [row 0]
-pmaddubsw m2, m1, [r5 + 13 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 13 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 448 * 16], m2
-
-; mode 30 [row 1]
-pmaddubsw m2, m1, [r5 + 26 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 449 * 16], m2
-
-; mode 33 [row 0]
-movu [r0 + 496 * 16], m2
-
-; mode 31 [row 0]
-pmaddubsw m2, m1, [r5 + 17 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 464 * 16], m2
-
-; mode 32 [row 0]
-pmaddubsw m2, m1, [r5 + 21 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 480 * 16], m2
-
-; mode 28 [row 6]
-movd m7, [r3 + 9]
-palignr m7, m1, 2
-pmaddubsw m2, m7, [r5 + 3 * 16]
-pmulhrsw m2, m3
-movd m6, [r3 + 17]
-palignr m6, m4, 2
-pmaddubsw m5, m6, [r5 + 3 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 422 * 16], m2
-
-; mode 28 [row 7]
-pmaddubsw m2, m7, [r5 + 8 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 423 * 16], m2
-
-; mode 28 [row 8]
-pmaddubsw m2, m7, [r5 + 13 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 13 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 424 * 16], m2
-
-; mode 28 [row 9]
-pmaddubsw m2, m7, [r5 + 18 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 18 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 425 * 16], m2
-
-; mode 28 [row 10]
-pmaddubsw m2, m7, [r5 + 23 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 23 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 426 * 16], m2
-
-; mode 29 [row 3]
-pmaddubsw m2, m7, [r5 + 4 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 4 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 435 * 16], m2
-
-; mode 29 [row 4]
-pmaddubsw m2, m7, [r5 + 13 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 13 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 436 * 16], m2
-
-; mode 29 [row 5]
-pmaddubsw m2, m7, [r5 + 22 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 22 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 437 * 16], m2
-
-; mode 29 [row 6]
-pmaddubsw m2, m7, [r5 + 31 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 31 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 438 * 16], m2
-
-; mode 32 [row 2]
-movu [r0 + 482 * 16], m2
-
-; mode 30 [row 2]
-pmaddubsw m2, m7, [r5 + 7 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 7 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 450 * 16], m2
-
-; mode 30 [row 3]
-pmaddubsw m2, m7, [r5 + 20 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 20 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 451 * 16], m2
-
-; mode 33 [row 1]
-movu [r0 + 497 * 16], m2
-
-; mode 31 [row 1]
-pmaddubsw m2, m7, [r5 + 2 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 2 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 465 * 16], m2
-
-; mode 31 [row 2]
-pmaddubsw m2, m7, [r5 + 19 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 19 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 466 * 16], m2
-
-; mode 32 [row 1]
-pmaddubsw m2, m7, [r5 + 10 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 10 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 481 * 16], m2
-
-; mode 28 [row 11]
-pmaddubsw m2, m7, [r5 + 28 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 28 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 427 * 16], m2
-
-; mode 28 [row 12]
-movd m1, [r3 + 10]
-palignr m1, m7, 2
-pmaddubsw m2, m1, [r5 + 1 * 16]
-pmulhrsw m2, m3
-movd m4, [r3 + 18]
-palignr m4, m6, 2
-pmaddubsw m5, m4, [r5 + 1 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 428 * 16], m2
-
-; mode 30 [row 4]
-movu [r0 + 452 * 16], m2
-
-; mode 28 [row 13]
-pmaddubsw m2, m1, [r5 + 6 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 429 * 16], m2
-
-; mode 28 [row 14]
-pmaddubsw m2, m1, [r5 + 11 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 11 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 430 * 16], m2
-
-; mode 28 [row 15]
-pmaddubsw m2, m1, [r5 + 16 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 431 * 16], m2
-
-; mode 29 [row 7]
-pmaddubsw m2, m1, [r5 + 8 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 439 * 16], m2
-
-; mode 29 [row 8]
-pmaddubsw m2, m1, [r5 + 17 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 440 * 16], m2
-
-; mode 29 [row 9]
-pmaddubsw m2, m1, [r5 + 26 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 441 * 16], m2
-
-; mode 30 [row 5]
-pmaddubsw m2, m1, [r5 + 14 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 453 * 16], m2
-
-; mode 33 [row 2]
-movu [r0 + 498 * 16], m2
-
-; mode 30 [row 6]
-pmaddubsw m2, m1, [r5 + 27 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 27 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 454 * 16], m2
-
-; mode 31 [row 3]
-pmaddubsw m2, m1, [r5 + 4 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 467 * 16], m2
-
-; mode 31 [row 4]
-pmaddubsw m2, m1, [r5 + 21 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 468 * 16], m2
-
-; mode 32 [row 3]
-pmaddubsw m2, m1, [r5 + 20 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 483 * 16], m2
-
-; mode 29 [row 10]
-movd m7, [r3 + 11]
-palignr m7, m1, 2
-pmaddubsw m2, m7, [r5 + 3 * 16]
-pmulhrsw m2, m3
-movd m6, [r3 + 19]
-palignr m6, m4, 2
-pmaddubsw m5, m6, [r5 + 3 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 442 * 16], m2
-
-; mode 29 [row 11]
-pmaddubsw m2, m7, [r5 + 12 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 12 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 443 * 16], m2
-
-; mode 29 [row 12]
-pmaddubsw m2, m7, [r5 + 21 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 21 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 444 * 16], m2
-
-; mode 30 [row 8]
-movu [r0 + 456 * 16], m2
-
-; mode 29 [row 13]
-pmaddubsw m2, m7, [r5 + 30 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 30 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 445 * 16], m2
-
-; mode 32 [row 5]
-movu [r0 + 485 * 16], m2
-
-; mode 30 [row 7]
-pmaddubsw m2, m7, [r5 + 8 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 455 * 16], m2
-
-; mode 33 [row 3]
-movu [r0 + 499 * 16], m2
-
-; mode 31 [row 5]
-pmaddubsw m2, m7, [r5 + 6 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 469 * 16], m2
-
-; mode 31 [row 6]
-pmaddubsw m2, m7, [r5 + 23 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 23 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 470 * 16], m2
-
-; mode 32 [row 4]
-pmaddubsw m2, m7, [r5 + 9 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 9 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 484 * 16], m2
-
-movu m1, m7
-movu m4, m6
-
-; mode 29 [row 14]
-movu m1, [r3 + 12]
-palignr m1, m7, 2
-pmaddubsw m2, m1, [r5 + 7 * 16]
-pmulhrsw m2, m3
-movd m4, [r3 + 20]
-palignr m4, m6, 2
-pmaddubsw m5, m4, [r5 + 7 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 446 * 16], m2
-
-; mode 29 [row 15]
-pmaddubsw m2, m1, [r5 + 16 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 447 * 16], m2
-
-; mode 30 [row 9]
-pmaddubsw m2, m1, [r5 + 2 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 457 * 16], m2
-
-; mode 33 [row 4]
-movu [r0 + 500 * 16], m2
-
-; mode 30 [row 10]
-pmaddubsw m2, m1, [r5 + 15 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 458 * 16], m2
-
-; mode 30 [row 11]
-pmaddubsw m2, m1, [r5 + 28 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 459 * 16], m2
-
-; mode 33 [row 5]
-movu [r0 + 501 * 16], m2
-
-; mode 31 [row 7]
-pmaddubsw m2, m1, [r5 + 8 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 471 * 16], m2
-
-; mode 31 [row 8]
-pmaddubsw m2, m1, [r5 + 25 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 472 * 16], m2
-
-; mode 32 [row 6]
-pmaddubsw m2, m1, [r5 + 19 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 19 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 486 * 16], m2
-
-; mode 30 [row 12]
-movd m7, [r3 + 13]
-palignr m7, m1, 2
-pmaddubsw m2, m7, [r5 + 9 * 16]
-pmulhrsw m2, m3
-movd m6, [r3 + 21]
-palignr m6, m4, 2
-pmaddubsw m5, m6, [r5 + 9 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 460 * 16], m2
-
-; mode 30 [row 13]
-pmaddubsw m2, m7, [r5 + 22 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 22 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 461 * 16], m2
-
-; mode 33 [row 6]
-movu [r0 + 502 * 16], m2
-
-; mode 31 [row 9]
-pmaddubsw m2, m7, [r5 + 10 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 10 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 473 * 16], m2
-
-; mode 31 [row 10]
-pmaddubsw m2, m7, [r5 + 27 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 27 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 474 * 16], m2
-
-; mode 32 [row 7]
-pmaddubsw m2, m7, [r5 + 8 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 8 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 487 * 16], m2
-
-; mode 32 [row 8]
-pmaddubsw m2, m7, [r5 + 29 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 29 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 488 * 16], m2
-
-
-movu m1, m7
-movu m4, m6
-
-; mode 30 [row 14]
-movd m1, [r3 + 14]
-palignr m1, m7, 2
-pmaddubsw m2, m1, [r5 + 3 * 16]
-pmulhrsw m2, m3
-movd m4, [r3 + 22]
-palignr m4, m6, 2
-pmaddubsw m5, m4, [r5 + 3 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 462 * 16], m2
-
-; mode 30 [row 15]
-pmaddubsw m2, m1, [r5 + 16 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 463 * 16], m2
-
-; mode 33 [row 7]
-movu [r0 + 503 * 16], m2
-
-; mode 31 [row 11]
-pmaddubsw m2, m1, [r5 + 12 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 475 * 16], m2
-
-; mode 31 [row 12]
-pmaddubsw m2, m1, [r5 + 29 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 29 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 476 * 16], m2
-
-; mode 32 [row 9]
-pmaddubsw m2, m1, [r5 + 18 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 489 * 16], m2
-
-; mode 31 [row 13]
-movd m7, [r3 + 15]
-palignr m7, m1, 2
-pmaddubsw m2, m7, [r5 + 14 * 16]
-pmulhrsw m2, m3
-movd m6, [r3 + 23]
-palignr m6, m4, 2
-pmaddubsw m5, m6, [r5 + 14 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 477 * 16], m2
-
-; mode 31 [row 14]
-pmaddubsw m2, m7, [r5 + 31 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 31 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 478 * 16], m2
-
-; mode 32 [row 10]
-pmaddubsw m2, m7, [r5 + 7 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 7 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 490 * 16], m2
-
-; mode 32 [row 11]
-pmaddubsw m2, m7, [r5 + 28 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 28 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 491 * 16], m2
-
-; mode 33 [row 8]
-pmaddubsw m2, m7, [r5 + 10 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 10 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 504 * 16], m2
-
-; mode 31 [row 15]
-movd m1, [r3 + 16]
-palignr m1, m7, 2
-pmaddubsw m2, m1, [r5 + 16 * 16]
-pmulhrsw m2, m3
-movd m4, [r3 + 24]
-palignr m4, m6, 2
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 479 * 16], m2
-
-; mode 32 [row 12]
-pmaddubsw m2, m1, [r5 + 17 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 492 * 16], m2
-
-; mode 33 [row 9]
-pmaddubsw m2, m1, [r5 + 4 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 505 * 16], m2
-
-; mode 33 [row 10]
-pmaddubsw m2, m1, [r5 + 30 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 506 * 16], m2
-
-; mode 33 [row 10]
-pmaddubsw m2, m1, [r5 + 4 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 505 * 16], m2
-
-; mode 32 [row 13]
-movd m7, [r3 + 17]
-palignr m7, m1, 2
-pmaddubsw m2, m7, [r5 + 6 * 16]
-pmulhrsw m2, m3
-
-movd m6, [r3 + 25]
-palignr m6, m4, 2
-pmaddubsw m5, m6, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 493 * 16], m2
-
-; mode 32 [row 14]
-pmaddubsw m2, m7, [r5 + 27 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 27 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 494 * 16], m2
-
-; mode 33 [row 11]
-pmaddubsw m2, m7, [r5 + 24 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m6, [r5 + 24 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 507 * 16], m2
-
-; mode 32 [row 15]
-movd m1, [r3 + 18]
-palignr m1, m7, 2
-pmaddubsw m2, m1, [r5 + 16 * 16]
-pmulhrsw m2, m3
-psrldq m4, 2
-pinsrb m4, [r3 + 26], 14
-pinsrb m4, [r3 + 27], 15
-movd m4, [r3 + 26]
-palignr m4, m6, 2
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 495 * 16], m2
-
-; mode 33 [row 12]
-pmaddubsw m2, m1, [r5 + 18 * 16]
-pmulhrsw m2, m3
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 508 * 16], m2
-
-; mode 33 [row 13]
-movd m7, [r3 + 19]
-palignr m7, m1, 2
-pmaddubsw m2, m7, [r5 + 12 * 16]
-pmulhrsw m2, m3
-movd m6, [r3 + 27]
-palignr m6, m4, 2
-pmaddubsw m5, m6, [r5 + 12 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 509 * 16], m2
-
-; mode 33 [row 14]
-movd m1, [r3 + 20]
-palignr m1, m7, 2
-pmaddubsw m2, m1, [r5 + 6 * 16]
-pmulhrsw m2, m3
-movd m4, [r3 + 28]
-palignr m4, m6, 2
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m3
-packuswb m2, m5
-movu [r0 + 510 * 16], m2
-
-; mode 34 [row 0]
-movu m1, [r3 + 2]
-movu [r0 + 512 * 16], m1
-movu m2, [r3 + 18]
-palignr m3, m2, m1, 1
-movu [r0 + 513 * 16], m3
-palignr m3, m2, m1, 2
-movu [r0 + 514 * 16], m3
-palignr m3, m2, m1, 3
-movu [r0 + 515 * 16], m3
-palignr m3, m2, m1, 4
-movu [r0 + 516 * 16], m3
-palignr m3, m2, m1, 5
-movu [r0 + 517 * 16], m3
-palignr m3, m2, m1, 6
-movu [r0 + 518 * 16], m3
-palignr m3, m2, m1, 7
-movu [r0 + 519 * 16], m3
-palignr m3, m2, m1, 8
-movu [r0 + 520 * 16], m3
-palignr m3, m2, m1, 9
-movu [r0 + 521 * 16], m3
-palignr m3, m2, m1, 10
-movu [r0 + 522 * 16], m3
-palignr m3, m2, m1, 11
-movu [r0 + 523 * 16], m3
-palignr m3, m2, m1, 12
-movu [r0 + 524 * 16], m3
-
-; mode 33 [row 15]
-movu [r0 + 511 * 16], m3
-
-; mode 34
-palignr m3, m2, m1, 13
-movu [r0 + 525 * 16], m3
-palignr m3, m2, m1, 14
-movu [r0 + 526 * 16], m3
-palignr m3, m2, m1, 15
-movu [r0 + 527 * 16], m3
-
-RET
+cglobal all_angs_pred_16x16, 3,4,8
+ ; mode 2
+
+ movu m0, [r2 + 2 + 32]
+ movu [r0 + 0 * 16], m0
+
+ movu m1, m0
+
+ movu m6, [r2 + 18 + 32]
+ palignr m5, m6, m0, 1
+ movu [r0 + 1 * 16], m5
+
+ movu m4, m5
+
+ palignr m5, m6, m0, 2
+ movu [r0 + 2 * 16], m5
+ palignr m5, m6, m0, 3
+ movu [r0 + 3 * 16], m5
+ palignr m5, m6, m0, 4
+ movu [r0 + 4 * 16], m5
+ palignr m5, m6, m0, 5
+ movu [r0 + 5 * 16], m5
+ palignr m5, m6, m0, 6
+ movu [r0 + 6 * 16], m5
+ palignr m5, m6, m0, 7
+ movu [r0 + 7 * 16], m5
+
+ movu m7, m5
+
+ palignr m5, m6, m0, 8
+ movu [r0 + 8 * 16], m5
+
+ movu m2, m5
+
+ palignr m5, m6, m0, 9
+ movu [r0 + 9 * 16], m5
+
+ palignr m3, m6, m0, 10
+ movu [r0 + 10 * 16], m3
+ palignr m3, m6, m0, 11
+ movu [r0 + 11 * 16], m3
+ palignr m3, m6, m0, 12
+ movu [r0 + 12 * 16], m3
+
+ ; mode 3 [row 15]
+ movu [r0 + (3-2)*16*16 + 15 * 16], m3
+
+ palignr m3, m6, m0, 13
+ movu [r0 + 13 * 16], m3
+ palignr m3, m6, m0, 14
+ movu [r0 + 14 * 16], m3
+ palignr m3, m6, m0, 15
+ movu [r0 + 15 * 16], m3
+
+ ; mode 3 [row 0]
+ lea r3, [ang_table]
+ movu m3, [pw_1024]
+ movu m0, [r2 + 1 + 32]
+ punpcklbw m0, m1
+
+ ; mode 17 [row 8 - second half]
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 248 * 16 + 8], m1
+ ; mode 17 [row 8 - second half] end
+
+ pmaddubsw m1, m0, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ punpcklbw m7, m2
+ pmaddubsw m2, m7, [r3 + 26 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 16 * 16], m1
+
+ ;mode 6 [row 1]
+ movu [r0 + 65 * 16], m1
+
+ ; mode 4 [row 0]
+ pmaddubsw m1, m0, [r3 + 21 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 21 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 32 * 16], m1
+
+ ; mode 5 [row 0]
+ pmaddubsw m1, m0, [r3 + 17 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 17 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 48 * 16], m1
+
+ ; mode 6 [row 0]
+ pmaddubsw m1, m0, [r3 + 13 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 13 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 64 * 16], m1
+
+ ; mode 7 [row 0]
+ pmaddubsw m1, m0, [r3 + 9 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 9 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 80 * 16], m1
+
+ ; mode 7 [row 1]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 81 * 16], m1
+
+ ; mode 7 [row 2]
+ pmaddubsw m1, m0, [r3 + 27 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 27 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 82 * 16], m1
+
+ ; mode 8 [row 0]
+ pmaddubsw m1, m0, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 5 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 96 * 16], m1
+
+ ; mode 8 [row 1]
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 97 * 16], m1
+
+ ; mode 8 [row 2]
+ pmaddubsw m1, m0, [r3 + 15 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 15 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 98 * 16], m1
+
+ ; mode 8 [row 3]
+ pmaddubsw m1, m0, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 20 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 99 * 16], m1
+
+ ; mode 8 [row 4]
+ pmaddubsw m1, m0, [r3 + 25 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 25 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 100 * 16], m1
+
+ ; mode 8 [row 5]
+ pmaddubsw m1, m0, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 30 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 101 * 16], m1
+
+ ; mode 15 [row 13 - second half]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 221 * 16 + 8], m1
+ ; mode 15 [row 13 - second half] end
+
+ ; mode 15 [row 14 - second half]
+ pmaddubsw m1, m0, [r3 + 1 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 222 * 16 + 8], m1
+ ; mode 15 [row 14 - second half] end
+
+ ; mode 16 [row 10 - second half]
+ pmaddubsw m1, m0, [r3 + 25 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 234 * 16 + 8], m1
+ ; mode 16 [row 10 - second half] end
+
+ ; mode 16 [row 11 - second half]
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 235 * 16 + 8], m1
+ ; mode 16 [row 11 - second half] end
+
+ ; mode 3 [row 1]
+ movu m6, [r3 + 20 * 16]
+ movu m0, [r2 + 2 + 32]
+ punpcklbw m0, m4
+
+ ; mode 17 [row 7 - second half]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 247 * 16 + 8], m1
+
+ ; mode 17 [row 7 - second half] end
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m3
+ movu m2, [r2 + 10 + 32]
+ punpcklbw m2, m5
+ pmaddubsw m4, m2, m6
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 17 * 16], m1
+
+ ;mode 6 [row 3]
+ movu [r0 + 67 * 16], m1
+
+ ; mode 4 row [row 1]
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 33 * 16], m1
+
+ ; mode 4 row [row 2]
+ pmaddubsw m1, m0, [r3 + 31 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 31 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 34 * 16], m1
+
+ ; mode 7 [row 6]
+ movu [r0 + 86 * 16], m1
+
+ ; mode 5 row [row 1]
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 49 * 16], m1
+
+ ; mode 5 row [row 2]
+ pmaddubsw m1, m0, [r3 + 19 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 50 * 16], m1
+
+ ; mode 6 [row 2]
+ pmaddubsw m1, m0, [r3 + 7 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 7 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 66 * 16], m1
+
+ ; mode 7 [row 3]
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 83 * 16], m1
+
+ ; mode 7 [row 4]
+ pmaddubsw m1, m0, [r3 + 13 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 13 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 84 * 16], m1
+
+ ; mode 8 [row 8]
+ movu [r0 + 104 * 16], m1
+
+ ; mode 7 [row 5]
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 85 * 16], m1
+
+ ; mode 8 [row 6]
+ pmaddubsw m1, m0, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 102 * 16], m1
+
+ ; mode 8 [row 7]
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 103 * 16], m1
+
+ ; mode 8 [row 9]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 105 * 16], m1
+
+ ; mode 8 [row 10]
+ pmaddubsw m1, m0, [r3 + 23 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 106 * 16], m1
+
+ ; mode 8 [row 11]
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 107 * 16], m1
+
+ ; mode 3 [row 2]
+ movu m0, [r2 + 3 + 32]
+ movd m1, [r2 + 19 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+
+ ; mode 17 [row 6 - second half]
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 246 * 16 + 8], m1
+ ; mode 17 [row 6 - second half] end
+
+ pmaddubsw m1, m0, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 11 + 32]
+ movd m4, [r2 + 27 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 18 * 16], m1
+
+ ; mode 6 [row 5]
+ movu [r0 + 69 * 16], m1
+
+ ; mode 4 row [row 3]
+ pmaddubsw m1, m0, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 35 * 16], m1
+
+ ; mode 5 row [row 3]
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 51 * 16], m1
+
+ ; mode 5 row [row 4]
+ pmaddubsw m1, m0, [r3 + 21 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 21 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 52 * 16], m1
+
+ ; mode 6 [row 4]
+ pmaddubsw m1, m0, [r3 + 1 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 68 * 16], m1
+
+ ; mode 6 [row 6]
+ pmaddubsw m1, m0, [r3 + 27 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 27 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 70 * 16], m1
+
+ ; mode 7 [row 7]
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 87 * 16], m1
+
+ ; mode 7 [row 8]
+ pmaddubsw m1, m0, [r3 + 17 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 17 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 88 * 16], m1
+
+ ; mode 7 [row 9]
+ pmaddubsw m1, m0, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 89 * 16], m1
+
+ ; mode 8 [row 12]
+ pmaddubsw m1, m0, [r3 + 1 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 108 * 16], m1
+
+ ; mode 8 [row 13]
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 109 * 16], m1
+
+ ; mode 8 [row 14]
+ pmaddubsw m1, m0, [r3 + 11 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 110 * 16], m1
+
+ ; mode 8 [row 15]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 111 * 16], m1
+
+ ; mode 3 [row 3]
+ movu m0, [r2 + 4 + 32]
+ movd m1, [r2 + 20 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+
+ ; mode 17 [row 4 - second half]
+ pmaddubsw m1, m0, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 244 * 16 + 8], m1
+ ; mode 17 [row 4 - second half] end
+
+ ; mode 17 [row 5 - second half]
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 245 * 16 + 8], m1
+ ; mode 17 [row 5 - second half] end
+
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 12 + 32]
+ movd m4, [r2 + 28 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 19 * 16], m1
+
+ ; mode 6 [row 7]
+ movu [r0 + 71 * 16], m1
+
+ ; mode 4 row [row 4]
+ pmaddubsw m1, m0, [r3 + 9 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 36 * 16], m1
+
+ ; mode 4 row [row 5]
+ pmaddubsw m1, m0, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 37 * 16], m1
+
+ ; mode 7 row [row 13]
+ movu [r0 + 93 * 16], m1
+
+ ; mode 5 row [row 5]
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 53 * 16], m1
+
+ ; mode 5 row [row 6]
+ pmaddubsw m1, m0, [r3 + 23 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 54 * 16], m1
+
+ ; mode 6 [row 8]
+ pmaddubsw m1, m0, [r3 + 21 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 21 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 72 * 16], m1
+
+ ; mode 7 [row 12]
+ movu [r0 + 92 * 16], m1
+
+ ; mode 7 [row 10]
+ pmaddubsw m1, m0, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 90 * 16], m1
+
+ ; mode 7 [row 11]
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 91 * 16], m1
+
+ ; mode 3 [row 4]
+ movu m0, [r2 + 5 + 32]
+ movd m1, [r2 + 20 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+
+ ; mode 17 [row 3 - second half]
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 243 * 16 + 8], m1
+
+ ; mode 17 [row 3 - second half] end
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 13 + 32]
+ movd m4, [r2 + 29 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 20 * 16], m1
+
+ ;mode 6 [row 9]
+ movu [r0 + 73 * 16], m1
+
+ ; mode 4 row [row 6]
+ movu m6, [r3 + 19 * 16]
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, m6
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 38 * 16], m1
+
+ ; mode 3 [row 5]
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 21 * 16], m1
+
+ ;mode 6 [row 11]
+ movu [r0 + 75 * 16], m1
+
+ ; mode 5 row [row 7]
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 55 * 16], m1
+
+ ; mode 5 row [row 8]
+ pmaddubsw m1, m0, [r3 + 25 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 56 * 16], m1
+
+ ; mode 6 [row 10]
+ pmaddubsw m1, m0, [r3 + 15 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 15 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 74 * 16], m1
+
+ ; mode 7 [row 14]
+ pmaddubsw m1, m0, [r3 + 7 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 7 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 94 * 16], m1
+
+ ; mode 7 [row 15]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 95 * 16], m1
+
+ ; mode 3 [row 6]
+ movu m0, [r2 + 6 + 32]
+ movd m1, [r2 + 22 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+
+ ; mode 17 [row 2 - second half]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 242 * 16 + 8], m1
+ ; mode 17 [row 2 - second half] end
+
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 14 + 32]
+ movd m4, [r2 + 30 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 22 * 16], m1
+
+ ; mode 6 [row 13]
+ movu [r0 + 77 * 16], m1
+
+ ; mode 4 row [row 7]
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 39 * 16], m1
+
+ ; mode 4 row [row 8]
+ pmaddubsw m1, m0, [r3 + 29 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 40 * 16], m1
+
+ ; mode 5 row [row 9]
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 57 * 16], m1
+
+ ; mode 5 row [row 10]
+ pmaddubsw m1, m0, [r3 + 27 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 27 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 58 * 16], m1
+
+ ; mode 6 [row 12]
+ pmaddubsw m1, m0, [r3 + 9 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 76 * 16], m1
+
+ ; mode 3 [row 7]
+ movu m0, [r2 + 7 + 32]
+ movd m1, [r2 + 27 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+
+ ; mode 17 [row 1 - second half]
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 241 * 16 + 8], m1
+ ; mode 17 [row 1 - second half] end
+
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 15 + 32]
+ movd m4, [r2 + 25 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 23 * 16], m1
+
+ ; mode 6 [row 15]
+ movu [r0 + 79 * 16], m1
+
+ ; mode 4 row [row 9]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 41 * 16], m1
+
+ ; mode 5 row [row 11]
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 59 * 16], m1
+
+ ; mode 5 row [row 12]
+ pmaddubsw m1, m0, [r3 + 29 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 60 * 16], m1
+
+ ; mode 6 [row 14]
+ pmaddubsw m1, m0, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 78 * 16], m1
+
+ ; mode 3 [row 8]
+ movu m0, [r2 + 8 + 32]
+ movd m1, [r2 + 24 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 16 + 32]
+ psrldq m4, m2, 1
+ pinsrb m4, [r2 + 32], 15
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 24 * 16], m1
+
+ ; mode 4 row [row 10]
+ pmaddubsw m1, m0, [r3 + 7 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 7 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 42 * 16], m1
+
+ ; mode 4 row [row 11]
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 43 * 16], m1
+
+ ; mode 5 row [row 13]
+ pmaddubsw m1, m0, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 61 * 16], m1
+
+ ; mode 5 row [row 14]
+ pmaddubsw m1, m0, [r3 + 31 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 31 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 62 * 16], m1
+
+ ; mode 3 [row 9]
+ movu m0, [r2 + 9 + 32]
+ movd m1, [r2 + 16 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 17 + 32]
+ movd m4, [r2 + 33 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 25 * 16], m1
+
+ ; mode 4 row [row 12]
+ pmaddubsw m1, m0, [r3 + 17 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 17 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 44 * 16], m1
+
+ ; mode 3 [row 10]
+ pmaddubsw m1, m0, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 26 * 16], m1
+
+ ; mode 5 row [row 15]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 63 * 16], m1
+
+ ; mode 3 [row 11]
+ movu m0, [r2 + 10 + 32]
+ movd m1, [r2 + 26 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 18 + 32]
+ movd m4, [r2 + 34 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 27 * 16], m1
+
+ ; mode 4 row [row 13]
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 45 * 16], m1
+
+ ; mode 4 row [row 14]
+ pmaddubsw m1, m0, [r3 + 27 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 27 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 46 * 16], m1
+
+ ; mode 3 [row 12]
+ movu m0, [r2 + 11 + 32]
+ movd m1, [r2 + 27 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 19 + 32]
+ movd m4, [r2 + 35 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 28 * 16], m1
+
+ ; mode 4 row [row 15]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m2, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 47 * 16], m1
+
+ ; mode 3 [row 13]
+ movu m0, [r2 + 12 + 32]
+ movd m1, [r2 + 28 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 20 + 32]
+ movd m4, [r2 + 36 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 29 * 16], m1
+
+ ; mode 3 [row 14]
+ movu m0, [r2 + 13 + 32]
+ movd m1, [r2 + 29 + 32]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ movu m2, [r2 + 21 + 32]
+ movd m4, [r2 + 37 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m2, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 30 * 16], m1
+
+ ; mode 9
+ movu m0, [r1 + 1 + 32]
+ movd m1, [r1 + 17 + 32]
+ palignr m1, m0, 1
+
+ ; mode 9 [row 15]
+ movu [r0 + 127 * 16], m1
+
+ ; mode 9 [row 0]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ movu m7, [r1 + 9 + 32]
+ movd m4, [r2 + 25 + 32]
+ palignr m2, m7, 1
+ punpcklbw m7, m2
+ pmaddubsw m2, m7, [r3 + 2 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 112 * 16], m1
+
+ ; mode 9 [row 1]
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 4 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 113 * 16], m1
+
+ ; mode 9 [row 2]
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 114 * 16], m1
+
+ ; mode 9 [row 3]
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 115 * 16], m1
+
+ ; mode 9 [row 4]
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 116 * 16], m1
+
+ ; mode 9 [row 5]
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 12 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 117 * 16], m1
+
+ ; mode 9 [row 6]
+ pmaddubsw m1, m0, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 14 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 118 * 16], m1
+
+ ; mode 9 [row 7]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 119 * 16], m1
+
+ ; mode 9 [row 8]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 120 * 16], m1
+
+ ; mode 9 [row 9]
+ pmaddubsw m1, m0, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 20 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 121 * 16], m1
+
+ ; mode 9 [row 10]
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 22 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 122 * 16], m1
+
+ ; mode 9 [row 11]
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 24 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 123 * 16], m1
+
+ ; mode 9 [row 12]
+ pmaddubsw m1, m0, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 26 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 124 * 16], m1
+
+ ; mode 9 [row 13]
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 28 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 125 * 16], m1
+
+ ; mode 9 [row 14]
+ pmaddubsw m1, m0, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 30 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 126 * 16], m1
+
+ ; mode 10
+ movu m1, [r1 + 1 + 32]
+ movu [r0 + 128 * 16], m1
+ movu [r0 + 129 * 16], m1
+ movu [r0 + 130 * 16], m1
+ movu [r0 + 131 * 16], m1
+ movu [r0 + 132 * 16], m1
+ movu [r0 + 133 * 16], m1
+ movu [r0 + 134 * 16], m1
+ movu [r0 + 135 * 16], m1
+ movu [r0 + 136 * 16], m1
+ movu [r0 + 137 * 16], m1
+ movu [r0 + 138 * 16], m1
+ movu [r0 + 139 * 16], m1
+ movu [r0 + 140 * 16], m1
+ movu [r0 + 141 * 16], m1
+ movu [r0 + 142 * 16], m1
+ movu [r0 + 143 * 16], m1
+
+ pxor m0, m0
+ pshufb m1, m1, m0
+ punpcklbw m1, m0
+ pinsrb m2, [r1], 0
+ pshufb m2, m2, m0
+ punpcklbw m2, m0
+ movu m4, [r1 + 1]
+ punpcklbw m5, m4, m0
+ punpckhbw m4, m0
+ psubw m5, m2
+ psubw m4, m2
+ psraw m5, 1
+ psraw m4, 1
+ paddw m5, m1
+ paddw m4, m1
+ packuswb m5, m4
+
+ pextrb [r0 + 128 * 16], m5, 0
+ pextrb [r0 + 129 * 16], m5, 1
+ pextrb [r0 + 130 * 16], m5, 2
+ pextrb [r0 + 131 * 16], m5, 3
+ pextrb [r0 + 132 * 16], m5, 4
+ pextrb [r0 + 133 * 16], m5, 5
+ pextrb [r0 + 134 * 16], m5, 6
+ pextrb [r0 + 135 * 16], m5, 7
+ pextrb [r0 + 136 * 16], m5, 8
+ pextrb [r0 + 137 * 16], m5, 9
+ pextrb [r0 + 138 * 16], m5, 10
+ pextrb [r0 + 139 * 16], m5, 11
+ pextrb [r0 + 140 * 16], m5, 12
+ pextrb [r0 + 141 * 16], m5, 13
+ pextrb [r0 + 142 * 16], m5, 14
+ pextrb [r0 + 143 * 16], m5, 15
+
+ ; mode 11
+ movu m0, [r1 + 32]
+ pinsrb m0, [r1], 0
+
+ ; mode 11 [row 15]
+ movu [r0 + 159 * 16], m0
+
+ ; mode 11 [row 0]
+ movu m1, [r1 + 1 + 32]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ movu m7, [r1 + 8 + 32]
+ movu m2, [r1 + 9 + 32]
+ punpcklbw m7, m2
+ pmaddubsw m2, m7, [r3 + 30 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 144 * 16], m1
+
+ ; mode 11 [row 1]
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 28 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 145 * 16], m1
+
+ ; mode 11 [row 2]
+ pmaddubsw m1, m0, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 26 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 146 * 16], m1
+
+ ; mode 11 [row 3]
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 24 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 147 * 16], m1
+
+ ; mode 11 [row 4]
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 22 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 148 * 16], m1
+
+ ; mode 11 [row 5]
+ pmaddubsw m1, m0, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 20 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 149 * 16], m1
+
+ ; mode 11 [row 6]
+ pmaddubsw m1, m0, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 150 * 16], m1
+
+ ; mode 11 [row 7]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 151 * 16], m1
+
+ ; mode 11 [row 8]
+ pmaddubsw m1, m0, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 14 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 152 * 16], m1
+
+ ; mode 11 [row 9]
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 12 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 153 * 16], m1
+
+ ; mode 11 [row 10]
+ pmaddubsw m1, m0, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 154 * 16], m1
+
+ ; mode 11 [row 11]
+ pmaddubsw m1, m0, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 155 * 16], m1
+
+ ; mode 11 [row 12]
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 156 * 16], m1
+
+ ; mode 11 [row 13]
+ pmaddubsw m1, m0, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 4 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 157 * 16], m1
+
+ ; mode 11 [row 14]
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 2 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 158 * 16], m1
+
+ ; mode 12 [row 0]
+ movu m0, [r2 + 32]
+ pinsrb m0, [r2], 0
+ movu m1, [r2 + 1 + 32]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r3 + 27 * 16]
+ pmulhrsw m1, m3
+ movu m7, [r2 + 8 + 32]
+ movd m2, [r2 + 24 + 32]
+ palignr m2, m7, 1
+ punpcklbw m7, m2
+ pmaddubsw m2, m7, [r3 + 27 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 160 * 16], m1
+
+ ; mode 12 [row 1]
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 22 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 161 * 16], m1
+
+ ; mode 12 [row 2]
+ pmaddubsw m1, m0, [r3 + 17 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 17 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 162 * 16], m1
+
+ ; mode 12 [row 3]
+ pmaddubsw m1, m0, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 12 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 163 * 16], m1
+
+ ; mode 12 [row 4]
+ pmaddubsw m1, m0, [r3 + 7 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 7 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 164 * 16], m1
+
+ ; mode 12 [row 5]
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 2 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 165 * 16], m1
+
+ ; mode 13 [row 0]
+ pmaddubsw m1, m0, [r3 + 23 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 23 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 176 * 16], m1
+
+ ; mode 13 [row 1]
+ pmaddubsw m1, m0, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 14 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 177 * 16], m1
+
+ ; mode 13 [row 2]
+ pmaddubsw m1, m0, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 5 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 178 * 16], m1
+
+ ; mode 14 [row 0]
+ pmaddubsw m1, m0, [r3 + 19 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 19 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 192 * 16], m1
+
+ ; mode 14 [row 1]
+ pmaddubsw m1, m0, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 193 * 16], m1
+
+ ; mode 17 [row 0]
+ movu [r0 + 240 * 16], m1
+
+ ; mode 15 [row 0]
+ pmaddubsw m1, m0, [r3 + 15 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 15 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 208 * 16], m1
+
+ ; mode 15 [row 15 - second half]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 223 * 16 + 8], m1
+ ; mode 15 [row 15 - second half] end
+
+ ; mode 16 [row 0]
+ pmaddubsw m1, m0, [r3 + 11 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m2, m7, [r3 + 11 * 16]
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ movu [r0 + 224 * 16], m1
+
+ ; mode 17 [row 9 - second half]
+ pmaddubsw m1, m0, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 249 * 16 + 8], m1
+ ; mode 17 [row 9 - second half] end
+
+ ; mode 17 [row 10 - second half]
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 250 * 16 + 8], m1
+ ; mode 17 [row 10 - second half] end
+
+ ; mode 17 [row 1 - first half]
+ pslldq m6, m0, 2
+ pinsrb m6, [r2], 1
+ pinsrb m6, [r2 + 1], 0
+ pmaddubsw m1, m6, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 241 * 16], m1
+
+ ; mode 17 [row 11 - second half]
+ pmaddubsw m1, m6, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 251 * 16 + 8], m1
+ ; mode 17 [row 11 - second half] end
+
+ ; mode 17 [row 2 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 1], 1
+ pinsrb m6, [r2 + 2], 0
+ pmaddubsw m1, m6, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 242 * 16], m1
+
+ ; mode 17 [row 12 - second half]
+ pmaddubsw m1, m6, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 252 * 16 + 8], m1
+ ; mode 17 [row 12 - second half] end
+
+ ; mode 17 [row 3 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 2], 1
+ pinsrb m6, [r2 + 4], 0
+ pmaddubsw m1, m6, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 243 * 16], m1
+
+ ; mode 17 [row 13 - second half]
+ pmaddubsw m1, m6, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 253 * 16 + 8], m1
+
+ ; mode 17 [row 4 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 4], 1
+ pinsrb m6, [r2 + 5], 0
+ pmaddubsw m1, m6, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 244 * 16], m1
+
+ ; mode 17 [row 5 - first half]
+ pmaddubsw m1, m6, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 245 * 16], m1
+
+ ; mode 17 [row 14 - second half]
+ pmaddubsw m1, m6, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 254 * 16 + 8], m1
+ ; mode 17 [row 14 - second half] end
+
+ ; mode 17 [row 6 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 5], 1
+ pinsrb m6, [r2 + 6], 0
+ pmaddubsw m1, m6, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 246 * 16], m1
+
+ ; mode 17 [row 7 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 6], 1
+ pinsrb m6, [r2 + 7], 0
+ pmaddubsw m1, m6, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 247 * 16], m1
+
+ ; mode 17 [row 8 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 7], 1
+ pinsrb m6, [r2 + 9], 0
+ pmaddubsw m1, m6, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 248 * 16], m1
+
+ ; mode 17 [row 9 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 9], 1
+ pinsrb m6, [r2 + 10], 0
+ pmaddubsw m1, m6, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 249 * 16], m1
+
+ ; mode 17 [row 10 - first half]
+ pmaddubsw m1, m6, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 250 * 16], m1
+
+ ; mode 17 [row 11 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 10], 1
+ pinsrb m6, [r2 + 11], 0
+ pmaddubsw m1, m6, [r3 + 8 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 251 * 16], m1
+
+ ; mode 17 [row 12 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 11], 1
+ pinsrb m6, [r2 + 12], 0
+ pmaddubsw m1, m6, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 252 * 16], m1
+
+ ; mode 17 [row 13 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 12], 1
+ pinsrb m6, [r2 + 14], 0
+ pmaddubsw m1, m6, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 253 * 16], m1
+
+ ; mode 17 [row 14 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 14], 1
+ pinsrb m6, [r2 + 15], 0
+ pmaddubsw m1, m6, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 254 * 16], m1
+
+ ; mode 16 [row 12 - second half]
+ pmaddubsw m1, m0, [r3 + 15 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 236 * 16 + 8], m1
+ ; mode 16 [row 12 - second half] end
+
+ ; mode 12 [row 6]
+ pslldq m2, m0, 2
+ pinsrb m2, [r2], 1
+ pinsrb m2, [r2 + 6], 0
+ pmaddubsw m1, m2, [r3 + 29 * 16]
+ pmulhrsw m1, m3
+ movu m0, [r2 + 7 + 32]
+ psrldq m4, m0, 1
+ punpcklbw m0, m4
+ pmaddubsw m4, m0, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 166 * 16], m1
+
+ ; mode 12 [row 7]
+ pmaddubsw m1, m2, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 167 * 16], m1
+
+ ; mode 12 [row 8]
+ pmaddubsw m1, m2, [r3 + 19 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 168 * 16], m1
+
+ ; mode 12 [row 9]
+ pmaddubsw m1, m2, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 169 * 16], m1
+
+ ; mode 12 [row 10]
+ pmaddubsw m1, m2, [r3 + 9 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 170 * 16], m1
+
+ ; mode 12 [row 11]
+ pmaddubsw m1, m2, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 171 * 16], m1
+
+ ; mode 13 [row 3]
+ pinsrb m7, m2, [r2 + 4], 0
+ pmaddubsw m1, m7, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 179 * 16], m1
+
+ ; mode 13 [row 4]
+ pmaddubsw m1, m7, [r3 + 19 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 180 * 16], m1
+
+ ; mode 13 [row 5]
+ pmaddubsw m1, m7, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 181 * 16], m1
+
+ ; mode 13 [row 6]
+ pmaddubsw m1, m7, [r3 + 1 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 182 * 16], m1
+
+ ; mode 14 [row 2]
+ pinsrb m5, m7, [r2 + 2], 0
+ pmaddubsw m1, m5, [r3 + 25 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 194 * 16], m1
+
+ ; mode 14 [row 3]
+ pmaddubsw m1, m5, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 195 * 16], m1
+
+ ; mode 15 [row 1]
+ pmaddubsw m1, m5, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 209 * 16], m1
+
+ ; mode 15 [row 2]
+ pmaddubsw m1, m5, [r3 + 13 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 13 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 210 * 16], m1
+
+ ; mode 16 [row 1]
+ pmaddubsw m1, m5, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 225 * 16], m1
+
+ ; mode 16 [row 2]
+ pmaddubsw m1, m5, [r3 + 1 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m4, m0, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m1, m4
+ movu [r0 + 226 * 16], m1
+
+ ; mode 16 [row 13 - second half]
+ pmaddubsw m1, m5, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 237 * 16 + 8], m1
+ ; mode 16 [row 13 - second half]
+
+ ; mode 16 [row 14 - second half]
+ pmaddubsw m1, m5, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 238 * 16 + 8], m1
+ ; mode 16 [row 14 - second half]
+
+ ; mode 16 [row 3]
+ pslldq m6, m5, 2
+ pinsrb m6, [r2 + 2], 1
+ pinsrb m6, [r2 + 3], 0
+ pmaddubsw m1, m6, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 227 * 16], m1
+
+ ; mode 16 [row 15 - second half]
+ pmaddubsw m1, m6, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 239 * 16 + 8], m1
+ ; mode 16 [row 15 - second half] end
+
+ ; mode 16 [row 4- first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 3], 1
+ pinsrb m6, [r2 + 5], 0
+ pmaddubsw m1, m6, [r3 + 23 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 228 * 16], m1
+
+ ; mode 16 [row 5- first half]
+ pmaddubsw m1, m6, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 229 * 16], m1
+
+ ; mode 16 [row 6- first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 5], 1
+ pinsrb m6, [r2 + 6], 0
+ pmaddubsw m1, m6, [r3 + 13 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 230 * 16], m1
+
+ ; mode 16 [row 7- first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 6], 1
+ pinsrb m6, [r2 + 8], 0
+ pmaddubsw m1, m6, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 231 * 16], m1
+
+ ; mode 16 [row 8- first half]
+ pmaddubsw m1, m6, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 232 * 16], m1
+ ; mode 16 [row 8 - first half] end
+
+ ; mode 16 [row 9- first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 8], 1
+ pinsrb m6, [r2 + 9], 0
+ pmaddubsw m1, m6, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 233 * 16], m1
+
+ ; mode 16 [row 10 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 9], 1
+ pinsrb m6, [r2 + 11], 0
+ pmaddubsw m1, m6, [r3 + 25 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 234 * 16], m1
+
+ ; mode 16 [row 11 - first half]
+ pmaddubsw m1, m6, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 235 * 16], m1
+
+ ; mode 16 [row 12 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 11], 1
+ pinsrb m6, [r2 + 12], 0
+ pmaddubsw m1, m6, [r3 + 15 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 236 * 16], m1
+
+ ; mode 16 [row 13 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 12], 1
+ pinsrb m6, [r2 + 14], 0
+ pmaddubsw m1, m6, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 237 * 16], m1
+
+ ; mode 16 [row 14 - first half]
+ pmaddubsw m1, m6, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 238 * 16], m1
+
+ ; mode 16 [row 15 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 14], 1
+ pinsrb m6, [r2 + 15], 0
+ pmaddubsw m1, m6, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 239 * 16], m1
+
+ ; mode 14 [row 4]
+ pslldq m5, 2
+ pinsrb m5, [r2 + 2], 1
+ pinsrb m5, [r2 + 5], 0
+ movu m4, [r2 + 6 + 32]
+ psrldq m0, m4, 1
+ punpcklbw m4, m0
+
+ ; mode 16 [row 3 - second half]
+ pmaddubsw m1, m4, [r3 + 12 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 227 * 16 + 8], m1
+
+ ; mode 16 [row 3 - second half] end
+ pmaddubsw m1, m5, [r3 + 31 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m0, m4, [r3 + 31 * 16]
+ pmulhrsw m0, m3
+ packuswb m1, m0
+ movu [r0 + 196 * 16], m1
+
+ ; mode 14 [row 5]
+ pmaddubsw m1, m5, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m0, m4, [r3 + 18 * 16]
+ pmulhrsw m0, m3
+ packuswb m1, m0
+ movu [r0 + 197 * 16], m1
+
+ ; mode 14 [row 6]
+ pmaddubsw m1, m5, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m0, m4, [r3 + 5 * 16]
+ pmulhrsw m0, m3
+ packuswb m1, m0
+ movu [r0 + 198 * 16], m1
+
+ ; mode 15 [row 3]
+ movu m6, m5
+ pinsrb m6, [r2 + 4], 0
+ pmaddubsw m1, m6, [r3 + 28 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m0, m4, [r3 + 28 * 16]
+ pmulhrsw m0, m3
+ packuswb m1, m0
+ movu [r0 + 211 * 16], m1
+
+ ; mode 15 [row 4]
+ pmaddubsw m1, m6, [r3 + 11 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m0, m4, [r3 + 11 * 16]
+ pmulhrsw m0, m3
+ packuswb m1, m0
+ movu [r0 + 212 * 16], m1
+
+ ; mode 15 [row 5 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 4], 1
+ pinsrb m6, [r2 + 6], 0
+ pmaddubsw m1, m6, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 213 * 16], m1
+
+ ; mode 15 [row 6 - first half]
+ pmaddubsw m1, m6, [r3 + 9 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 214 * 16], m1
+
+ ; mode 15 [row 7 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 6], 1
+ pinsrb m6, [r2 + 8], 0
+ pmaddubsw m1, m6, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 215 * 16], m1
+
+ ; mode 15 [row 8 - first half]
+ pmaddubsw m1, m6, [r3 + 7 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 216 * 16], m1
+
+ ; mode 15 [row 9 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 8], 1
+ pinsrb m6, [r2 + 9], 0
+ pmaddubsw m1, m6, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 217 * 16], m1
+
+ ; mode 15 [row 10 - first half]
+ pmaddubsw m1, m6, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 218 * 16], m1
+
+ ; mode 15 [row 11 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 9], 1
+ pinsrb m6, [r2 + 11], 0
+ pmaddubsw m1, m6, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 219 * 16], m1
+
+ ; mode 15 [row 12 - first half]
+ pmaddubsw m1, m6, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 220 * 16], m1
+
+ ; mode 15 [row 13 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 11], 1
+ pinsrb m6, [r2 + 13], 0
+ pmaddubsw m1, m6, [r3 + 18 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 221 * 16], m1
+
+ ; mode 15 [row 14 - first half]
+ pmaddubsw m1, m6, [r3 + 1 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 222 * 16], m1
+
+ ; mode 15 [row 15 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 13], 1
+ pinsrb m6, [r2 + 15], 0
+ pmaddubsw m1, m6, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 223 * 16], m1
+
+ ; mode 14 [row 7]
+ pslldq m5, 2
+ pinsrb m5, [r2 + 5], 1
+ pinsrb m5, [r2 + 7], 0
+ movu m0, [r2 + 5 + 32]
+ psrldq m6, m0, 1
+ punpcklbw m0, m6
+
+ ; mode 15 [row 5 - second half]
+ pmaddubsw m1, m0, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 213 * 16 + 8], m1
+ ; mode 15 [row 5 - second half] end
+
+ ; mode 15 [row 6 - second half]
+ pmaddubsw m1, m0, [r3 + 9 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 214 * 16 + 8], m1
+ ; mode 15 [row 6 - second half] end
+
+ ; mode 16 [row 4 - second half]
+ pmaddubsw m1, m0, [r3 + 23 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 228 * 16 + 8], m1
+ ; mode 16 [row 4 - second half] end
+
+ ; mode 16 [row 5 - second half]
+ pmaddubsw m1, m0, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 229 * 16 + 8], m1
+
+ ; mode 16 [row 5 - second half] end
+ pmaddubsw m1, m5, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 24 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 199 * 16], m1
+
+ ; mode 14 [row 8]
+ pmaddubsw m1, m5, [r3 + 11 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 11 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 200 * 16], m1
+
+ ; mode 14 [row 9]
+ pslldq m5, 2
+ pinsrb m5, [r2 + 7], 1
+ pinsrb m5, [r2 + 10], 0
+ movu m0, [r2 + 4 + 32]
+ psrldq m6, m0, 1
+ punpcklbw m0, m6
+
+ ; mode 15 [row 7 - second half]
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 215 * 16 + 8], m1
+ ; mode 15 [row 7 - second half] end
+
+ ; mode 15 [row 8 - second half]
+ pmaddubsw m1, m0, [r3 + 7 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 216 * 16 + 8], m1
+ ; mode 15 [row 8 - second half] end
+
+ ; mode 16 [row 6 - second half]
+ pmaddubsw m1, m0, [r3 + 13 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 230 * 16 + 8], m1
+ ; mode 16 [row 6 - second half] end
+
+ ; mode 14 [row 9] - continued
+ pmaddubsw m1, m5, [r3 + 30 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 30 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 201 * 16], m1
+
+ ; mode 14 [row 10]
+ pmaddubsw m1, m5, [r3 + 17 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 17 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 202 * 16], m1
+
+ ; mode 14 [row 11]
+ pmaddubsw m1, m5, [r3 + 4 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 4 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 203 * 16], m1
+
+ ; mode 14 [row 12]
+ pslldq m5, 2
+ pinsrb m5, [r2 + 10], 1
+ pinsrb m5, [r2 + 12], 0
+ movu m0, [r2 + 3 + 32]
+ psrldq m6, m0, 1
+ punpcklbw m0, m6
+
+ ; mode 15 [row 9 - second half]
+ pmaddubsw m1, m0, [r3 + 22 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 217 * 16 + 8], m1
+ ; mode 15 [row 9 - second half] end
+
+ ; mode 15 [row 10 - second half]
+ pmaddubsw m1, m0, [r3 + 5 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 218 * 16 + 8], m1
+ ; mode 15 [row 10 - second half] end
+
+ ; mode 16 [row 7 - second half]
+ pmaddubsw m1, m0, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 231 * 16 + 8], m1
+ ; mode 16 [row 7 - second half] end
+
+ ; mode 16 [row 8 - second half]
+ pmaddubsw m1, m0, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 232 * 16 + 8], m1
+ ; mode 16 [row 8 - second half] end
+
+ pmaddubsw m1, m5, [r3 + 23 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 23 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 204 * 16], m1
+
+ ; mode 14 [row 13]
+ pmaddubsw m1, m5, [r3 + 10 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 10 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 205 * 16], m1
+
+ ; mode 14 [row 14]
+ pslldq m5, 2
+ pinsrb m5, [r2 + 12], 1
+ pinsrb m5, [r2 + 15], 0
+ movu m0, [r2 + 2 + 32]
+ psrldq m6, m0, 1
+ punpcklbw m0, m6
+
+ ; mode 15 [row 11 - second half]
+ pmaddubsw m1, m0, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 219 * 16 + 8], m1
+ ; mode 15 [row 11 - second half] end
+
+ ; mode 15 [row 12 - second half]
+ pmaddubsw m1, m0, [r3 + 3 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 220 * 16 + 8], m1
+ ; mode 15 [row 12 - second half] end
+
+ ; mode 16 [row 9 - second half]
+ pmaddubsw m1, m0, [r3 + 14 * 16]
+ pmulhrsw m1, m3
+ packuswb m1, m1
+ movh [r0 + 233 * 16 + 8], m1
+
+ ; mode 16 [row 9 - second half] end
+ pmaddubsw m1, m5, [r3 + 29 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 29 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 206 * 16], m1
+
+ ; mode 14 [row 15]
+ pmaddubsw m1, m5, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m6, m0, [r3 + 16 * 16]
+ pmulhrsw m6, m3
+ packuswb m1, m6
+ movu [r0 + 207 * 16], m1
+
+ ; mode 12 [row 12]
+ pslldq m0, m2, 2
+ pinsrb m0, [r2 + 6], 1
+ pinsrb m0, [r2 + 13], 0
+ pmaddubsw m1, m0, [r3 + 31 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 31 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 172 * 16], m1
+
+ ; mode 12 [row 13]
+ pmaddubsw m1, m0, [r3 + 26 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 26 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 173 * 16], m1
+
+ ; mode 12 [row 14]
+ pmaddubsw m1, m0, [r3 + 21 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 21 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 174 * 16], m1
+
+ ; mode 12 [row 15]
+ pmaddubsw m1, m0, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 175 * 16], m1
+
+ ; mode 13 [row 7]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 4], 1
+ pinsrb m7, [r2 + 7], 0
+ pmaddubsw m1, m7, [r3 + 24 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 24 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 183 * 16], m1
+
+ ; mode 13 [row 8]
+ pmaddubsw m1, m7, [r3 + 15 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 15 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 184 * 16], m1
+
+ ; mode 13 [row 9]
+ pmaddubsw m1, m7, [r3 + 6 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 185 * 16], m1
+
+ ; mode 13 [row 10]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 7], 1
+ pinsrb m7, [r2 + 11], 0
+ pmaddubsw m1, m7, [r3 + 29 * 16]
+ pmulhrsw m1, m3
+ movu m4, [r2 + 5 + 32]
+ psrldq m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, [r3 + 29 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 186 * 16], m1
+
+ ; mode 13 [row 11]
+ pmaddubsw m1, m7, [r3 + 20 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 20 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 187 * 16], m1
+
+ ; mode 13 [row 12]
+ pmaddubsw m1, m7, [r3 + 11 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 11 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 188 * 16], m1
+
+ ; mode 13 [row 13]
+ pmaddubsw m1, m7, [r3 + 2 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 2 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 189 * 16], m1
+
+ ; mode 13 [row 14]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 11], 1
+ pinsrb m7, [r2 + 14], 0
+ pmaddubsw m1, m7, [r3 + 25 * 16]
+ pmulhrsw m1, m3
+ movu m4, [r2 + 4 + 32]
+ psrldq m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, [r3 + 25 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 190 * 16], m1
+
+ ; mode 13 [row 15]
+ pmaddubsw m1, m7, [r3 + 16 * 16]
+ pmulhrsw m1, m3
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m1, m5
+ movu [r0 + 191 * 16], m1
+
+ ; mode 17 [row 15]
+ movu m0, [r2]
+ pshufb m1, m0, [tab_S1]
+ movu [r0 + 255 * 16], m1
+ movu m2, [r2 + 32]
+ pinsrb m2, [r2], 0
+ movd [r0 + 255 * 16 + 12], m2
+
+ ; mode 18 [row 0]
+ movu [r0 + 256 * 16], m0
+
+ ; mode 18 [row 1]
+ pslldq m4, m0, 1
+ pinsrb m4, [r2 + 1 + 32], 0
+ movu [r0 + 257 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 2 + 32], 0
+ movu [r0 + 258 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 3 + 32], 0
+ movu [r0 + 259 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 4 + 32], 0
+ movu [r0 + 260 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 5 + 32], 0
+ movu [r0 + 261 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 6 + 32], 0
+ movu [r0 + 262 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 7 + 32], 0
+ movu [r0 + 263 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 8 + 32], 0
+ movu [r0 + 264 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 9 + 32], 0
+ movu [r0 + 265 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 10 + 32], 0
+ movu [r0 + 266 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 11 + 32], 0
+ movu [r0 + 267 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 12 + 32], 0
+ movu [r0 + 268 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 13 + 32], 0
+ movu [r0 + 269 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 14 + 32], 0
+ movu [r0 + 270 * 16], m4
+ pslldq m4, 1
+ pinsrb m4, [r2 + 15 + 32], 0
+ movu [r0 + 271 * 16], m4
+
+ ; mode 19 [row 0]
+ psrldq m2, m0, 1
+ punpcklbw m0, m2
+ movu m5, [r2 + 8]
+ psrldq m6, m5, 1
+ punpcklbw m5, m6
+ pmaddubsw m4, m0, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 6 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 272 * 16], m4
+
+ ; mode 20 [row 0]
+ pmaddubsw m4, m0, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 11 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 288 * 16], m4
+
+ ; mode 21 [row 0]
+ pmaddubsw m4, m0, [r3 + 15 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 15 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 304 * 16], m4
+
+ ; mode 22 [row 0]
+ pmaddubsw m4, m0, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 19 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 320 * 16], m4
+
+ ; mode 22 [row 1]
+ pmaddubsw m4, m0, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 6 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 321 * 16], m4
+
+ ; mode 23 [row 0]
+ pmaddubsw m4, m0, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 23 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 336 * 16], m4
+
+ ; mode 23 [row 1]
+ pmaddubsw m4, m0, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 14 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 337 * 16], m4
+
+ ; mode 23 [row 2]
+ pmaddubsw m4, m0, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 5 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 338 * 16], m4
+
+ ; mode 24 [row 0]
+ pmaddubsw m4, m0, [r3 + 27 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 27 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 352 * 16], m4
+
+ ; mode 24 [row 1]
+ pmaddubsw m4, m0, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 22 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 353 * 16], m4
+
+ ; mode 24 [row 2]
+ pmaddubsw m4, m0, [r3 + 17 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 17 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 354 * 16], m4
+
+ ; mode 24 [row 3]
+ pmaddubsw m4, m0, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 12 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 355 * 16], m4
+
+ ; mode 24 [row 4]
+ pmaddubsw m4, m0, [r3 + 7 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 7 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 356 * 16], m4
+
+ ; mode 24 [row 5]
+ pmaddubsw m4, m0, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 2 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 357 * 16], m4
+
+ ; mode 24 [row 6 - first half]
+ pslldq m7, m0, 2
+ pinsrb m7, [r2 + 0], 1
+ pinsrb m7, [r2 + 6 + 32], 0
+ pmaddubsw m4, m7, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 358 * 16], m4
+
+ ; mode 24 [row 7 - first half]
+ pmaddubsw m4, m7, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 359 * 16], m4
+
+ ; mode 24 [row 8 - first half]
+ pmaddubsw m4, m7, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 360 * 16], m4
+
+ ; mode 24 [row 9 - first half]
+ pmaddubsw m4, m7, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 361 * 16], m4
+
+ ; mode 24 [row 10 - first half]
+ pmaddubsw m4, m7, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 362 * 16], m4
+
+ ; mode 24 [row 11 - first half]
+ pmaddubsw m4, m7, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 363 * 16], m4
+
+ ; mode 24 [row 12 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 6 + 32], 1
+ pinsrb m7, [r2 + 13 + 32], 0
+ pmaddubsw m4, m7, [r3 + 31 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 364 * 16], m4
+
+ ; mode 24 [row 13 - first half]
+ pmaddubsw m4, m7, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 365 * 16], m4
+
+ ; mode 24 [row 14 - first half]
+ pmaddubsw m4, m7, [r3 + 21 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 366 * 16], m4
+
+ ; mode 24 [row 15 - first half]
+ pmaddubsw m4, m7, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 367 * 16], m4
+
+ ; mode 23 [row 3 - first half]
+ pslldq m7, m0, 2
+ pinsrb m7, [r2 + 0], 1
+ pinsrb m7, [r2 + 4 + 32], 0
+ pmaddubsw m4, m7, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 339 * 16], m4
+
+ ; mode 23 [row 4 - first half]
+ pmaddubsw m4, m7, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 340 * 16], m4
+
+ ; mode 23 [row 5 - first half]
+ pmaddubsw m4, m7, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 341 * 16], m4
+
+ ; mode 23 [row 6 - first half]
+ pmaddubsw m4, m7, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 342 * 16], m4
+
+ ; mode 23 [row 7 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 4 + 32], 1
+ pinsrb m7, [r2 + 7 + 32], 0
+ pmaddubsw m4, m7, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 343 * 16], m4
+
+ ; mode 23 [row 8 - first half]
+ pmaddubsw m4, m7, [r3 + 15 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 344 * 16], m4
+
+ ; mode 23 [row 9 - first half]
+ pmaddubsw m4, m7, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 345 * 16], m4
+
+ ; mode 23 [row 10 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 7 + 32], 1
+ pinsrb m7, [r2 + 11 + 32], 0
+ pmaddubsw m4, m7, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 346 * 16], m4
+
+ ; mode 23 [row 11 - first half]
+ pmaddubsw m4, m7, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 347 * 16], m4
+
+ ; mode 23 [row 12 - first half]
+ pmaddubsw m4, m7, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 348 * 16], m4
+
+ ; mode 23 [row 13 - first half]
+ pmaddubsw m4, m7, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 349 * 16], m4
+
+ ; mode 23 [row 14 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 11 + 32], 1
+ pinsrb m7, [r2 + 14 + 32], 0
+ pmaddubsw m4, m7, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 350 * 16], m4
+
+ ; mode 23 [row 15 - first half]
+ pmaddubsw m4, m7, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 351 * 16], m4
+
+ ; mode 21 [row 15 - second half]
+ pmaddubsw m4, m0, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 319 * 16 + 8], m4
+ ; mode 21 [row 15 - second half] end
+
+ ; mode 20 [row 1 - first half]
+ pslldq m7, m0, 2
+ pinsrb m7, [r2 + 0], 1
+ pinsrb m7, [r2 + 2 + 32], 0
+ pmaddubsw m4, m7, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 289 * 16], m4
+
+ ; mode 20 [row 2 - first half]
+ pmaddubsw m4, m7, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 290 * 16], m4
+
+ ; mode 21 [row 1 - first half]
+ pmaddubsw m4, m7, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 305 * 16], m4
+
+ ; mode 21 [row 2 - first half]
+ pmaddubsw m4, m7, [r3 + 13 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 306 * 16], m4
+
+ ; mode 22 [row 2 - first half]
+ pmaddubsw m4, m7, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 322 * 16], m4
+
+ ; mode 22 [row 3 - first half]
+ pmaddubsw m4, m7, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 323 * 16], m4
+
+ ; mode 22 [row 4 - first half]
+ pslldq m1, m7, 2
+ pinsrb m1, [r2 + 2 + 32], 1
+ pinsrb m1, [r2 + 5 + 32], 0
+ pmaddubsw m4, m1, [r3 + 31 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 324 * 16], m4
+
+ ; mode 22 [row 5 - first half]
+ pmaddubsw m4, m1, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 325 * 16], m4
+
+ ; mode 22 [row 6 - first half]
+ pmaddubsw m4, m1, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 326 * 16], m4
+
+ ; mode 22 [row 7 - first half]
+ pslldq m1, 2
+ pinsrb m1, [r2 + 5 + 32], 1
+ pinsrb m1, [r2 + 7 + 32], 0
+ pmaddubsw m4, m1, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 327 * 16], m4
+
+ ; mode 22 [row 8 - first half]
+ pmaddubsw m4, m1, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 328 * 16], m4
+
+ ; mode 22 [row 9 - first half]
+ pslldq m1, 2
+ pinsrb m1, [r2 + 7 + 32], 1
+ pinsrb m1, [r2 + 10 + 32], 0
+ pmaddubsw m4, m1, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 329 * 16], m4
+
+ ; mode 22 [row 10 - first half]
+ pmaddubsw m4, m1, [r3 + 17 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 330 * 16], m4
+
+ ; mode 22 [row 11 - first half]
+ pmaddubsw m4, m1, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 331 * 16], m4
+
+ ; mode 22 [row 12 - first half]
+ pslldq m1, 2
+ pinsrb m1, [r2 + 10 + 32], 1
+ pinsrb m1, [r2 + 12 + 32], 0
+ pmaddubsw m4, m1, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 332 * 16], m4
+
+ ; mode 22 [row 13 - first half]
+ pmaddubsw m4, m1, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 333 * 16], m4
+
+ ; mode 22 [row 14 - first half]
+ pslldq m1, 2
+ pinsrb m1, [r2 + 12 + 32], 1
+ pinsrb m1, [r2 + 15 + 32], 0
+ pmaddubsw m4, m1, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 334 * 16], m4
+
+ ; mode 22 [row 15 - first half]
+ pmaddubsw m4, m1, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 335 * 16], m4
+
+ ; mode 21 [row 3 - first half]
+ pslldq m6, m7, 2
+ pinsrb m6, [r2 + 2 + 32], 1
+ pinsrb m6, [r2 + 4 + 32], 0
+ pmaddubsw m4, m6, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 307 * 16], m4
+
+ ; mode 21 [row 4 - first half]
+ pmaddubsw m4, m6, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 308 * 16], m4
+
+ ; mode 21 [row 5 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 4 + 32], 1
+ pinsrb m6, [r2 + 6 + 32], 0
+ pmaddubsw m4, m6, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 309 * 16], m4
+
+ ; mode 21 [row 6 - first half]
+ pmaddubsw m4, m6, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 310 * 16], m4
+
+ ; mode 21 [row 7 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 6 + 32], 1
+ pinsrb m6, [r2 + 8 + 32], 0
+ pmaddubsw m4, m6, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 311 * 16], m4
+
+ ; mode 21 [row 8 - first half]
+ pmaddubsw m4, m6, [r3 + 7 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 312 * 16], m4
+
+ ; mode 21 [row 9 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 8 + 32], 1
+ pinsrb m6, [r2 + 9 + 32], 0
+ pmaddubsw m4, m6, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 313 * 16], m4
+
+ ; mode 21 [row 10 - first half]
+ pmaddubsw m4, m6, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 314 * 16], m4
+
+ ; mode 21 [row 11 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 9 + 32], 1
+ pinsrb m6, [r2 + 11 + 32], 0
+ pmaddubsw m4, m6, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 315 * 16], m4
+
+ ; mode 21 [row 12 - first half]
+ pmaddubsw m4, m6, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 316 * 16], m4
+
+ ; mode 21 [row 13 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 11 + 32], 1
+ pinsrb m6, [r2 + 13 + 32], 0
+ pmaddubsw m4, m6, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 317 * 16], m4
+
+ ; mode 21 [row 14 - first half]
+ pmaddubsw m4, m6, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 318 * 16], m4
+
+ ; mode 21 [row 15 - first half]
+ pslldq m6, 2
+ pinsrb m6, [r2 + 32 + 13], 1
+ pinsrb m6, [r2 + 32 + 15], 0
+ pmaddubsw m4, m6, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 319 * 16], m4
+
+ ; mode 20 [row 13 - second half]
+ pmaddubsw m4, m7, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 301 * 16 + 8], m4
+ ; mode 20 [row 13 - second half]
+
+ ; mode 20 [row 14 - second half]
+ pmaddubsw m4, m7, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 302 * 16 + 8], m4
+ ; mode 20 [row 14 - second half] end
+
+ ; mode 20 [row 3 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 2], 1
+ pinsrb m7, [r2 + 32 + 3], 0
+ pmaddubsw m4, m7, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 291 * 16], m4
+
+ ; mode 20 [row 15 - second half]
+ pmaddubsw m4, m7, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 303 * 16 + 8], m4
+ ; mode 20 [row 15 - second half] end
+
+ ; mode 20 [row 4 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 3], 1
+ pinsrb m7, [r2 + 32 + 5], 0
+ pmaddubsw m4, m7, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 292 * 16], m4
+
+ ; mode 20 [row 5 - first half]
+ pmaddubsw m4, m7, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 293 * 16], m4
+
+ ; mode 20 [row 6 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 5], 1
+ pinsrb m7, [r2 + 32 + 6], 0
+ pmaddubsw m4, m7, [r3 + 13 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 294 * 16], m4
+
+ ; mode 20 [row 7 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 6], 1
+ pinsrb m7, [r2 + 32 + 8], 0
+ pmaddubsw m4, m7, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 295 * 16], m4
+
+ ; mode 20 [row 8 - first half]
+ pmaddubsw m4, m7, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 296 * 16], m4
+
+ ; mode 20 [row 9 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 8], 1
+ pinsrb m7, [r2 + 32 + 9], 0
+ pmaddubsw m4, m7, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 297 * 16], m4
+
+ ; mode 20 [row 10 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 9], 1
+ pinsrb m7, [r2 + 32 + 11], 0
+ pmaddubsw m4, m7, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 298 * 16], m4
+
+ ; mode 20 [row 11 - first half]
+ pmaddubsw m4, m7, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 299 * 16], m4
+
+ ; mode 20 [row 12 - first half]
+ movu m1, [r3 + 15 * 16]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 11], 1
+ pinsrb m7, [r2 + 32 + 12], 0
+ pmaddubsw m4, m7, [r3 + 15 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 300 * 16], m4
+
+ ; mode 20 [row 13 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 12], 1
+ pinsrb m7, [r2 + 32 + 14], 0
+ pmaddubsw m4, m7, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 301 * 16], m4
+
+ ; mode 20 [row 14 - first half]
+ pmaddubsw m4, m7, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 302 * 16], m4
+
+ ; mode 20 [row 15 - first half]
+ pslldq m7, 2
+ pinsrb m7, [r2 + 32 + 14], 1
+ pinsrb m7, [r2 + 32 + 15], 0
+ pmaddubsw m4, m7, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 303 * 16], m4
+
+ ; mode 19 [row 1]
+ pslldq m0, 2
+ pinsrb m0, [r2], 1
+ pinsrb m0, [r2 + 32 + 1], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 8], 1
+ pinsrb m5, [r2 + 7], 0
+
+ ; mode 20 [row 1 - second half]
+ pmaddubsw m4, m5, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 289 * 16 + 8], m4
+ ; mode 20 [row 1 - second half] end
+
+ ; mode 20 [row 2 - second half]
+ pmaddubsw m4, m5, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 290 * 16 + 8], m4
+ ; mode 20 [row 2 - second half] end
+
+ ; mode 21 [row 1 - second half]
+ pmaddubsw m4, m5, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 305 * 16 + 8], m4
+ ; mode 21 [row 1 - second half] end
+
+ ; mode 21 [row 2 - second half]
+ pmaddubsw m4, m5, [r3 + 13 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 306 * 16 + 8], m4
+ ; mode 21 [row 2 - second half] end
+
+ ; mode 21 [row 4 - second half]
+ pmaddubsw m4, m5, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 307 * 16 + 8], m4
+ ; mode 21 [row 4 - second half] end
+
+ ; mode 22 [row 2 - second half]
+ pmaddubsw m4, m5, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 322 * 16 + 8], m4
+ ; mode 22 [row 2 - second half] end
+
+ ; mode 22 [row 3 - second half]
+ pmaddubsw m4, m5, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 323 * 16 + 8], m4
+ ; mode 22 [row 3 - second half] end
+
+ ; mode 23 [row 3 - second half]
+ pmaddubsw m4, m5, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 339 * 16 + 8], m4
+ ; mode 23 [row 3 - second half] end
+
+ ; mode 23 [row 4 - second half]
+ pmaddubsw m4, m5, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 340 * 16 + 8], m4
+ ; mode 23 [row 4 - second half] end
+
+ ; mode 23 [row 5 - second half]
+ pmaddubsw m4, m5, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 341 * 16 + 8], m4
+ ; mode 23 [row 5 - second half] end
+
+ ; mode 23 [row 6 - second half]
+ pmaddubsw m4, m5, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 342 * 16 + 8], m4
+ ; mode 23 [row 6 - second half] end
+
+ ; mode 24 [row 6 - second half]
+ pmaddubsw m4, m5, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 358 * 16 + 8], m4
+ ; mode 24 [row 6 - second half] end
+
+ ; mode 24 [row 7 - second half]
+ pmaddubsw m4, m5, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 359 * 16 + 8], m4
+ ; mode 24 [row 7 - second half] end
+
+ ; mode 24 [row 8 - second half]
+ pmaddubsw m4, m5, [r3 + 19 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 360 * 16 + 8], m4
+ ; mode 24 [row 8 - second half] end
+
+ ; mode 24 [row 9 - second half]
+ pmaddubsw m4, m5, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 361 * 16 + 8], m4
+ ; mode 24 [row 9 - second half] end
+
+ ; mode 24 [row 10 - second half]
+ pmaddubsw m4, m5, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 362 * 16 + 8], m4
+ ; mode 24 [row 10 - second half] end
+
+ ; mode 24 [row 11 - second half]
+ pmaddubsw m4, m5, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 363 * 16 + 8], m4
+ ; mode 24 [row 11 - second half] end
+
+ pmaddubsw m4, m0, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 12 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 273 * 16], m4
+
+ ; mode 19 [row 2]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 1], 1
+ pinsrb m0, [r2 + 32 + 2], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 7], 1
+ pinsrb m5, [r2 + 6], 0
+
+ ; mode 20 [row 3 - second half]
+ pmaddubsw m4, m5, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 291 * 16 + 8], m4
+ ; mode 20 [row 3 - second half] end
+
+ ; mode 21 [row 3 - second half]
+ pmaddubsw m4, m5, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 307 * 16 + 8], m4
+ ; mode 21 [row 3 - second half] end
+
+ ; mode 21 [row 4 - second half]
+ pmaddubsw m4, m5, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 308 * 16 + 8], m4
+ ; mode 21 [row 4 - second half] end
+
+ ; mode 22 [row 4 - second half]
+ pmaddubsw m4, m5, [r3 + 31 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 324 * 16 + 8], m4
+ ; mode 22 [row 4 - second half] end
+
+ ; mode 22 [row 5 - second half]
+ pmaddubsw m4, m5, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 325 * 16 + 8], m4
+ ; mode 22 [row 5 - second half] end
+
+ ; mode 22 [row 6 - second half]
+ pmaddubsw m4, m5, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 326 * 16 + 8], m4
+ ; mode 22 [row 6 - second half] end
+
+ ; mode 23 [row 7 - second half]
+ pmaddubsw m4, m5, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 343 * 16 + 8], m4
+ ; mode 23 [row 7 - second half] end
+
+ ; mode 23 [row 8 - second half]
+ pmaddubsw m4, m5, [r3 + 15 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 344 * 16 + 8], m4
+ ; mode 23 [row 8 - second half] end
+
+ ; mode 23 [row 9 - second half]
+ pmaddubsw m4, m5, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 345 * 16 + 8], m4
+ ; mode 23 [row 9 - second half] end
+
+ ; mode 24 [row 12 - second half]
+ pmaddubsw m4, m5, [r3 + 31 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 364 * 16 + 8], m4
+ ; mode 24 [row 12 - second half] end
+
+ ; mode 24 [row 13 - second half]
+ pmaddubsw m4, m5, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 365 * 16 + 8], m4
+ ; mode 24 [row 13 - second half] end
+
+ ; mode 24 [row 14 - second half]
+ pmaddubsw m4, m5, [r3 + 21 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 366 * 16 + 8], m4
+ ; mode 24 [row 14 - second half] end
+
+ ; mode 24 [row 15 - second half]
+ pmaddubsw m4, m5, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 367 * 16 + 8], m4
+ ; mode 24 [row 15 - second half] end
+
+ pmaddubsw m4, m0, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 18 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 274 * 16], m4
+
+ ; mode 19 [row 3]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 2], 1
+ pinsrb m0, [r2 + 32 + 4], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 6], 1
+ pinsrb m5, [r2 + 5], 0
+
+ ; mode 20 [row 4 - second half]
+ pmaddubsw m4, m5, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 292 * 16 + 8], m4
+ ; mode 20 [row 4 - second half] end
+
+ ; mode 20 [row 5 - second half]
+ pmaddubsw m4, m5, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 293 * 16 + 8], m4
+ ; mode 20 [row 5 - second half] end
+
+ ; mode 21 [row 5 - second half]
+ pmaddubsw m4, m5, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 309 * 16 + 8], m4
+ ; mode 21 [row 5 - second half] end
+
+ ; mode 21 [row 6 - second half]
+ pmaddubsw m4, m5, [r3 + 9 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 310 * 16 + 8], m4
+ ; mode 21 [row 6 - second half] end
+
+ ; mode 22 [row 7 - second half]
+ pmaddubsw m4, m5, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 327 * 16 + 8], m4
+ ; mode 22 [row 7 - second half] end
+
+ ; mode 22 [row 8 - second half]
+ pmaddubsw m4, m5, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 328 * 16 + 8], m4
+ ; mode 22 [row 8 - second half] end
+
+ ; mode 23 [row 10 - second half]
+ pmaddubsw m4, m5, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 346 * 16 + 8], m4
+ ; mode 23 [row 10 - second half] end
+
+ ; mode 23 [row 11 - second half]
+ pmaddubsw m4, m5, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 347 * 16 + 8], m4
+ ; mode 23 [row 11 - second half] end
+
+ ; mode 23 [row 12 - second half]
+ pmaddubsw m4, m5, [r3 + 11 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 348 * 16 + 8], m4
+ ; mode 23 [row 12 - second half] end
+
+ ; mode 23 [row 13 - second half]
+ pmaddubsw m4, m5, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 349 * 16 + 8], m4
+ ; mode 23 [row 13 - second half] end
+
+ pmaddubsw m4, m0, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 24 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 275 * 16], m4
+
+ ; mode 19 [row 4]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 4], 1
+ pinsrb m0, [r2 + 32 + 5], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 5], 1
+ pinsrb m5, [r2 + 4], 0
+
+ ; mode 20 [row 6 - second half]
+ pmaddubsw m4, m5, [r3 + 13 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 294 * 16 + 8], m4
+ ; mode 20 [row 6 - second half] end
+
+ ; mode 21 [row 7 - second half]
+ pmaddubsw m4, m5, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 311 * 16 + 8], m4
+ ; mode 21 [row 7 - second half] end
+
+ ; mode 21 [row 8 - second half]
+ pmaddubsw m4, m5, [r3 + 7 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 312 * 16 + 8], m4
+ ; mode 21 [row 8 - second half] end
+
+ ; mode 22 [row 9 - second half]
+ pmaddubsw m4, m5, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 329 * 16 + 8], m4
+ ; mode 22 [row 9 - second half] end
+
+ ; mode 22 [row 10 - second half]
+ pmaddubsw m4, m5, [r3 + 17 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 330 * 16 + 8], m4
+ ; mode 22 [row 10 - second half] end
+
+ ; mode 22 [row 11 - second half]
+ pmaddubsw m4, m5, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 331 * 16 + 8], m4
+ ; mode 22 [row 11 - second half] end
+
+ ; mode 23 [row 14 - second half]
+ pmaddubsw m4, m5, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 350 * 16 + 8], m4
+ ; mode 23 [row 14 - second half] end
+
+ ; mode 23 [row 15 - second half]
+ pmaddubsw m4, m5, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 351 * 16 + 8], m4
+
+ ; mode 23 [row 15 - second half] end
+ pmaddubsw m4, m0, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 30 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 276 * 16], m4
+
+ ; mode 19 [row 5]
+ pmaddubsw m4, m0, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 4 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 277 * 16], m4
+
+ ; mode 19 [row 6]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 5], 1
+ pinsrb m0, [r2 + 32 + 6], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 4], 1
+ pinsrb m5, [r2 + 3], 0
+
+ ; mode 20 [row 7 - second half]
+ pmaddubsw m4, m5, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 295 * 16 + 8], m4
+ ; mode 20 [row 7 - second half] end
+
+ ; mode 20 [row 8 - second half]
+ pmaddubsw m4, m5, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 296 * 16 + 8], m4
+ ; mode 20 [row 8 - second half] end
+
+ ; mode 21 [row 9 - second half]
+ pmaddubsw m4, m5, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 313 * 16 + 8], m4
+ ; mode 21 [row 9 - second half] end
+
+ ; mode 21 [row 10 - second half]
+ pmaddubsw m4, m5, [r3 + 5 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 314 * 16 + 8], m4
+ ; mode 21 [row 10 - second half] end
+
+ ; mode 22 [row 12 - second half]
+ pmaddubsw m4, m5, [r3 + 23 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 332 * 16 + 8], m4
+ ; mode 22 [row 12 - second half] end
+
+ ; mode 22 [row 13 - second half]
+ pmaddubsw m4, m5, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 333 * 16 + 8], m4
+ ; mode 22 [row 13 - second half] end
+
+ pmaddubsw m4, m0, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 10 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 278 * 16], m4
+
+ ; mode 19 [row 7]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 6], 1
+ pinsrb m0, [r2 + 32 + 7], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 3], 1
+ pinsrb m5, [r2 + 2], 0
+
+ ; mode 20 [row 9 - second half]
+ pmaddubsw m4, m5, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 297 * 16 + 8], m4
+ ; mode 20 [row 9 - second half] end
+
+ ; mode 21 [row 11 - second half]
+ pmaddubsw m4, m5, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 315 * 16 + 8], m4
+ ; mode 21 [row 11 - second half] end
+
+ ; mode 21 [row 12 - second half]
+ pmaddubsw m4, m5, [r3 + 3 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 316 * 16 + 8], m4
+ ; mode 21 [row 12 - second half] end
+
+ ; mode 22 [row 14 - second half]
+ pmaddubsw m4, m5, [r3 + 29 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 334 * 16 + 8], m4
+ ; mode 22 [row 14 - second half] end
+
+ ; mode 22 [row 15 - second half]
+ pmaddubsw m4, m5, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 335 * 16 + 8], m4
+ ; mode 22 [row 15 - second half] end
+
+ pmaddubsw m4, m0, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 16 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 279 * 16], m4
+
+ ; mode 19 [row 8]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 7], 1
+ pinsrb m0, [r2 + 32 + 9], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 2], 1
+ pinsrb m5, [r2 + 1], 0
+
+ ; mode 20 [row 10 - second half]
+ pmaddubsw m4, m5, [r3 + 25 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 298 * 16 + 8], m4
+ ; mode 20 [row 10 - second half] end
+
+ ; mode 20 [row 11 - second half]
+ pmaddubsw m4, m5, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 299 * 16 + 8], m4
+ ; mode 20 [row 11 - second half] end
+
+ ; mode 21 [row 13 - second half]
+ pmaddubsw m4, m5, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 317 * 16 + 8], m4
+ ; mode 21 [row 13 - second half] end
+
+ ; mode 21 [row 14 - second half]
+ pmaddubsw m4, m5, [r3 + 1 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 318 * 16 + 8], m4
+ ; mode 21 [row 14 - second half] end
+
+ pmaddubsw m4, m0, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 22 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 280 * 16], m4
+
+ ; mode 19 [row 9]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 9], 1
+ pinsrb m0, [r2 + 32 + 10], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 1], 1
+ pinsrb m5, [r2 + 0], 0
+
+ ; mode 20 [row 12 - second half]
+ pmaddubsw m4, m5, [r3 + 15 * 16]
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movh [r0 + 300 * 16 + 8], m4
+
+ ; mode 20 [row 12 - second half] end
+ pmaddubsw m4, m0, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 28 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 281 * 16], m4
+
+ ; mode 19 [row 10]
+ pmaddubsw m4, m0, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 2 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 282 * 16], m4
+
+ ; mode 19 [row 11]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 10], 1
+ pinsrb m0, [r2 + 32 + 11], 0
+ pmaddubsw m4, m0, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ pslldq m5, 2
+ pinsrb m5, [r2], 1
+ pinsrb m5, [r2 + 32 + 1], 0
+ pmaddubsw m6, m5, [r3 + 8 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 283 * 16], m4
+
+ ; mode 19 [row 12]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 11], 1
+ pinsrb m0, [r2 + 32 + 12], 0
+ pslldq m5, 2
+ pinsrb m5, [r2 + 32 + 1], 1
+ pinsrb m5, [r2 + 32 + 2], 0
+ pmaddubsw m4, m0, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m6, m5, [r3 + 14 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 284 * 16], m4
+
+ ; mode 19 [row 13]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 12], 1
+ pinsrb m0, [r2 + 32 + 14], 0
+ pmaddubsw m4, m0, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ pslldq m5, 2
+ pinsrb m5, [r2 + 32 + 2], 1
+ pinsrb m5, [r2 + 32 + 4], 0
+ pmaddubsw m6, m5, [r3 + 20 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 285 * 16], m4
+
+ ; mode 19 [row 14]
+ pslldq m0, 2
+ pinsrb m0, [r2 + 32 + 14], 1
+ pinsrb m0, [r2 + 32 + 15], 0
+ pmaddubsw m4, m0, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ pslldq m5, 2
+ pinsrb m5, [r2 + 32 + 4], 1
+ pinsrb m5, [r2 + 32 + 5], 0
+ pmaddubsw m6, m5, [r3 + 26 * 16]
+ pmulhrsw m6, m3
+ packuswb m4, m6
+ movu [r0 + 286 * 16], m4
+
+ ; mode 19 [row 15]
+ movu m0, [r2 + 32]
+ pshufb m0, [tab_S1]
+ movu [r0 + 287 * 16], m0
+ movd m1, [r2]
+ movd [r0 + 287 * 16 + 12], m1
+
+ ; mode 25
+ movu m1, [r1]
+
+ ; mode 26 [all rows]
+ psrldq m6, m1, 1
+ pinsrb m6, [r1 + 16], 15
+ movu m7, m6
+ movu [r0 + 384 * 16], m6
+ movu [r0 + 385 * 16], m6
+ movu [r0 + 386 * 16], m6
+ movu [r0 + 387 * 16], m6
+ movu [r0 + 388 * 16], m6
+ movu [r0 + 389 * 16], m6
+ movu [r0 + 390 * 16], m6
+ movu [r0 + 391 * 16], m6
+ movu [r0 + 392 * 16], m6
+ movu [r0 + 393 * 16], m6
+ movu [r0 + 394 * 16], m6
+ movu [r0 + 395 * 16], m6
+ movu [r0 + 396 * 16], m6
+ movu [r0 + 397 * 16], m6
+ movu [r0 + 398 * 16], m6
+ movu [r0 + 399 * 16], m6
+
+ pxor m0, m0
+ pshufb m6, m6, m0
+ punpcklbw m6, m0
+ pinsrb m2, [r1], 0
+ pshufb m2, m2, m0
+ punpcklbw m2, m0
+ movu m4, [r1 + 1 + 32]
+ punpcklbw m5, m4, m0
+ punpckhbw m4, m0
+ psubw m5, m2
+ psubw m4, m2
+ psraw m5, 1
+ psraw m4, 1
+ paddw m5, m6
+ paddw m4, m6
+ packuswb m5, m4
+
+ pextrb [r0 + 384 * 16], m5, 0
+ pextrb [r0 + 385 * 16], m5, 1
+ pextrb [r0 + 386 * 16], m5, 2
+ pextrb [r0 + 387 * 16], m5, 3
+ pextrb [r0 + 388 * 16], m5, 4
+ pextrb [r0 + 389 * 16], m5, 5
+ pextrb [r0 + 390 * 16], m5, 6
+ pextrb [r0 + 391 * 16], m5, 7
+ pextrb [r0 + 392 * 16], m5, 8
+ pextrb [r0 + 393 * 16], m5, 9
+ pextrb [r0 + 394 * 16], m5, 10
+ pextrb [r0 + 395 * 16], m5, 11
+ pextrb [r0 + 396 * 16], m5, 12
+ pextrb [r0 + 397 * 16], m5, 13
+ pextrb [r0 + 398 * 16], m5, 14
+ pextrb [r0 + 399 * 16], m5, 15
+
+ ; mode 25 [row 15]
+ movu [r0 + 383 * 16], m1
+
+ ; mode 25 [row 0]
+ psrldq m2, m1, 1
+ punpcklbw m1, m2
+ movu m2, [r1 + 8]
+ psrldq m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m4, m1, [r3 + 30 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 30 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 368 * 16], m4
+
+ ; mode 25 [row 1]
+ pmaddubsw m4, m1, [r3 + 28 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 28 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 369 * 16], m4
+
+ ; mode 25 [row 2]
+ pmaddubsw m4, m1, [r3 + 26 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 26 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 370 * 16], m4
+
+ ; mode 25 [row 3]
+ pmaddubsw m4, m1, [r3 + 24 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 24 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 371 * 16], m4
+
+ ; mode 25 [row 4]
+ pmaddubsw m4, m1, [r3 + 22 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 22 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 372 * 16], m4
+
+ ; mode 25 [row 5]
+ pmaddubsw m4, m1, [r3 + 20 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 20 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 373 * 16], m4
+
+ ; mode 25 [row 6]
+ pmaddubsw m4, m1, [r3 + 18 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 18 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 374 * 16], m4
+
+ ; mode 25 [row 7]
+ pmaddubsw m4, m1, [r3 + 16 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 375 * 16], m4
+
+ ; mode 25 [row 8]
+ pmaddubsw m4, m1, [r3 + 14 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 14 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 376 * 16], m4
+
+ ; mode 25 [row 9]
+ pmaddubsw m4, m1, [r3 + 12 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 12 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 377 * 16], m4
+
+ ; mode 25 [row 10]
+ pmaddubsw m4, m1, [r3 + 10 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 10 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 378 * 16], m4
+
+ ; mode 25 [row 11]
+ pmaddubsw m4, m1, [r3 + 8 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 379 * 16], m4
+
+ ; mode 25 [row 12]
+ pmaddubsw m4, m1, [r3 + 6 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 380 * 16], m4
+
+ ; mode 25 [row 13]
+ pmaddubsw m4, m1, [r3 + 4 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 4 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 381 * 16], m4
+
+ ; mode 25 [row 14]
+ pmaddubsw m4, m1, [r3 + 2 * 16]
+ pmulhrsw m4, m3
+ pmaddubsw m5, m2, [r3 + 2 * 16]
+ pmulhrsw m5, m3
+ packuswb m4, m5
+ movu [r0 + 382 * 16], m4
+
+ ; mode 27 [row 15]
+ psrldq m6, m7, 1
+ punpcklbw m7, m6
+ pinsrb m6, [r1 + 17], 15
+ movu [r0 + 415 * 16], m6
+
+ ; mode 27 [row 0]
+ movu m4, [r1 + 9]
+ psrldq m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m6, m7, [r3 + 2 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 2 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 400 * 16], m6
+
+ ; mode 27 [row 1]
+ pmaddubsw m6, m7, [r3 + 4 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 4 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 401 * 16], m6
+
+ ; mode 27 [row 2]
+ pmaddubsw m6, m7, [r3 + 6 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 402 * 16], m6
+
+ ; mode 27 [row 3]
+ pmaddubsw m6, m7, [r3 + 8 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 403 * 16], m6
+
+ ; mode 27 [row 4]
+ pmaddubsw m6, m7, [r3 + 10 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 10 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 404 * 16], m6
+
+ ; mode 27 [row 5]
+ pmaddubsw m6, m7, [r3 + 12 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 12 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 405 * 16], m6
+
+ ; mode 27 [row 6]
+ pmaddubsw m6, m7, [r3 + 14 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 14 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 406 * 16], m6
+
+ ; mode 27 [row 7]
+ pmaddubsw m6, m7, [r3 + 16 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 407 * 16], m6
+
+ ; mode 27 [row 8]
+ pmaddubsw m6, m7, [r3 + 18 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 18 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 408 * 16], m6
+
+ ; mode 27 [row 9]
+ pmaddubsw m6, m7, [r3 + 20 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 20 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 409 * 16], m6
+
+ ; mode 27 [row 10]
+ pmaddubsw m6, m7, [r3 + 22 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 22 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 410 * 16], m6
+
+ ; mode 27 [row 11]
+ pmaddubsw m6, m7, [r3 + 24 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 24 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 411 * 16], m6
+
+ ; mode 27 [row 12]
+ pmaddubsw m6, m7, [r3 + 26 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 26 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 412 * 16], m6
+
+ ; mode 27 [row 13]
+ pmaddubsw m6, m7, [r3 + 28 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 28 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 413 * 16], m6
+
+ ; mode 27 [row 14]
+ pmaddubsw m6, m7, [r3 + 30 * 16]
+ pmulhrsw m6, m3
+ pmaddubsw m5, m4, [r3 + 30 * 16]
+ pmulhrsw m5, m3
+ packuswb m6, m5
+ movu [r0 + 414 * 16], m6
+
+ ; mode 28 [row 0]
+ movu m1, [r2 + 1]
+ psrldq m2, m1, 1
+ punpcklbw m1, m2
+ movu m4, [r2 + 9]
+ psrldq m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m2, m1, [r3 + 5 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 5 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 416 * 16], m2
+
+ ; mode 28 [row 0]
+ pmaddubsw m2, m1, [r3 + 5 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 5 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 416 * 16], m2
+
+ ; mode 28 [row 1]
+ pmaddubsw m2, m1, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 10 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 417 * 16], m2
+
+ ; mode 28 [row 2]
+ pmaddubsw m2, m1, [r3 + 15 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 15 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 418 * 16], m2
+
+ ; mode 28 [row 3]
+ pmaddubsw m2, m1, [r3 + 20 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 20 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 419 * 16], m2
+
+ ; mode 28 [row 4]
+ pmaddubsw m2, m1, [r3 + 25 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 25 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 420 * 16], m2
+
+ ; mode 28 [row 5]
+ pmaddubsw m2, m1, [r3 + 30 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 30 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 421 * 16], m2
+
+ ; mode 29 [row 0]
+ pmaddubsw m2, m1, [r3 + 9 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 9 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 432 * 16], m2
+
+ ; mode 29 [row 1]
+ pmaddubsw m2, m1, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 18 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 433 * 16], m2
+
+ ; mode 29 [row 2]
+ pmaddubsw m2, m1, [r3 + 27 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 27 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 434 * 16], m2
+
+ ; mode 30 [row 0]
+ pmaddubsw m2, m1, [r3 + 13 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 13 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 448 * 16], m2
+
+ ; mode 30 [row 1]
+ pmaddubsw m2, m1, [r3 + 26 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 26 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 449 * 16], m2
+
+ ; mode 33 [row 0]
+ movu [r0 + 496 * 16], m2
+
+ ; mode 31 [row 0]
+ pmaddubsw m2, m1, [r3 + 17 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 17 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 464 * 16], m2
+
+ ; mode 32 [row 0]
+ pmaddubsw m2, m1, [r3 + 21 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 21 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 480 * 16], m2
+
+ ; mode 28 [row 6]
+ movd m7, [r2 + 9]
+ palignr m7, m1, 2
+ pmaddubsw m2, m7, [r3 + 3 * 16]
+ pmulhrsw m2, m3
+ movd m6, [r2 + 17]
+ palignr m6, m4, 2
+ pmaddubsw m5, m6, [r3 + 3 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 422 * 16], m2
+
+ ; mode 28 [row 7]
+ pmaddubsw m2, m7, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 423 * 16], m2
+
+ ; mode 28 [row 8]
+ pmaddubsw m2, m7, [r3 + 13 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 13 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 424 * 16], m2
+
+ ; mode 28 [row 9]
+ pmaddubsw m2, m7, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 18 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 425 * 16], m2
+
+ ; mode 28 [row 10]
+ pmaddubsw m2, m7, [r3 + 23 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 23 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 426 * 16], m2
+
+ ; mode 29 [row 3]
+ pmaddubsw m2, m7, [r3 + 4 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 4 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 435 * 16], m2
+
+ ; mode 29 [row 4]
+ pmaddubsw m2, m7, [r3 + 13 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 13 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 436 * 16], m2
+
+ ; mode 29 [row 5]
+ pmaddubsw m2, m7, [r3 + 22 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 22 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 437 * 16], m2
+
+ ; mode 29 [row 6]
+ pmaddubsw m2, m7, [r3 + 31 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 31 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 438 * 16], m2
+
+ ; mode 32 [row 2]
+ movu [r0 + 482 * 16], m2
+
+ ; mode 30 [row 2]
+ pmaddubsw m2, m7, [r3 + 7 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 7 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 450 * 16], m2
+
+ ; mode 30 [row 3]
+ pmaddubsw m2, m7, [r3 + 20 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 20 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 451 * 16], m2
+
+ ; mode 33 [row 1]
+ movu [r0 + 497 * 16], m2
+
+ ; mode 31 [row 1]
+ pmaddubsw m2, m7, [r3 + 2 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 2 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 465 * 16], m2
+
+ ; mode 31 [row 2]
+ pmaddubsw m2, m7, [r3 + 19 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 19 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 466 * 16], m2
+
+ ; mode 32 [row 1]
+ pmaddubsw m2, m7, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 10 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 481 * 16], m2
+
+ ; mode 28 [row 11]
+ pmaddubsw m2, m7, [r3 + 28 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 28 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 427 * 16], m2
+
+ ; mode 28 [row 12]
+ movd m1, [r2 + 10]
+ palignr m1, m7, 2
+ pmaddubsw m2, m1, [r3 + 1 * 16]
+ pmulhrsw m2, m3
+ movd m4, [r2 + 18]
+ palignr m4, m6, 2
+ pmaddubsw m5, m4, [r3 + 1 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 428 * 16], m2
+
+ ; mode 30 [row 4]
+ movu [r0 + 452 * 16], m2
+
+ ; mode 28 [row 13]
+ pmaddubsw m2, m1, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 429 * 16], m2
+
+ ; mode 28 [row 14]
+ pmaddubsw m2, m1, [r3 + 11 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 11 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 430 * 16], m2
+
+ ; mode 28 [row 15]
+ pmaddubsw m2, m1, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 431 * 16], m2
+
+ ; mode 29 [row 7]
+ pmaddubsw m2, m1, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 439 * 16], m2
+
+ ; mode 29 [row 8]
+ pmaddubsw m2, m1, [r3 + 17 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 17 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 440 * 16], m2
+
+ ; mode 29 [row 9]
+ pmaddubsw m2, m1, [r3 + 26 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 26 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 441 * 16], m2
+
+ ; mode 30 [row 5]
+ pmaddubsw m2, m1, [r3 + 14 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 14 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 453 * 16], m2
+
+ ; mode 33 [row 2]
+ movu [r0 + 498 * 16], m2
+
+ ; mode 30 [row 6]
+ pmaddubsw m2, m1, [r3 + 27 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 27 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 454 * 16], m2
+
+ ; mode 31 [row 3]
+ pmaddubsw m2, m1, [r3 + 4 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 4 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 467 * 16], m2
+
+ ; mode 31 [row 4]
+ pmaddubsw m2, m1, [r3 + 21 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 21 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 468 * 16], m2
+
+ ; mode 32 [row 3]
+ pmaddubsw m2, m1, [r3 + 20 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 20 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 483 * 16], m2
+
+ ; mode 29 [row 10]
+ movd m7, [r2 + 11]
+ palignr m7, m1, 2
+ pmaddubsw m2, m7, [r3 + 3 * 16]
+ pmulhrsw m2, m3
+ movd m6, [r2 + 19]
+ palignr m6, m4, 2
+ pmaddubsw m5, m6, [r3 + 3 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 442 * 16], m2
+
+ ; mode 29 [row 11]
+ pmaddubsw m2, m7, [r3 + 12 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 12 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 443 * 16], m2
+
+ ; mode 29 [row 12]
+ pmaddubsw m2, m7, [r3 + 21 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 21 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 444 * 16], m2
+
+ ; mode 30 [row 8]
+ movu [r0 + 456 * 16], m2
+
+ ; mode 29 [row 13]
+ pmaddubsw m2, m7, [r3 + 30 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 30 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 445 * 16], m2
+
+ ; mode 32 [row 5]
+ movu [r0 + 485 * 16], m2
+
+ ; mode 30 [row 7]
+ pmaddubsw m2, m7, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 455 * 16], m2
+
+ ; mode 33 [row 3]
+ movu [r0 + 499 * 16], m2
+
+ ; mode 31 [row 5]
+ pmaddubsw m2, m7, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 469 * 16], m2
+
+ ; mode 31 [row 6]
+ pmaddubsw m2, m7, [r3 + 23 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 23 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 470 * 16], m2
+
+ ; mode 32 [row 4]
+ pmaddubsw m2, m7, [r3 + 9 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 9 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 484 * 16], m2
+
+ movu m1, m7
+ movu m4, m6
+
+ ; mode 29 [row 14]  (NOTE(review): the "movu m1, m7" / "movu m4, m6" copies just above are dead — m1 and m4 are reloaded immediately below)
+ movu m1, [r2 + 12]
+ palignr m1, m7, 2
+ pmaddubsw m2, m1, [r3 + 7 * 16]
+ pmulhrsw m2, m3
+ movd m4, [r2 + 20]
+ palignr m4, m6, 2
+ pmaddubsw m5, m4, [r3 + 7 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 446 * 16], m2
+
+ ; mode 29 [row 15]
+ pmaddubsw m2, m1, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 447 * 16], m2
+
+ ; mode 30 [row 9]
+ pmaddubsw m2, m1, [r3 + 2 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 2 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 457 * 16], m2
+
+ ; mode 33 [row 4]
+ movu [r0 + 500 * 16], m2
+
+ ; mode 30 [row 10]
+ pmaddubsw m2, m1, [r3 + 15 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 15 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 458 * 16], m2
+
+ ; mode 30 [row 11]
+ pmaddubsw m2, m1, [r3 + 28 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 28 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 459 * 16], m2
+
+ ; mode 33 [row 5]
+ movu [r0 + 501 * 16], m2
+
+ ; mode 31 [row 7]
+ pmaddubsw m2, m1, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 471 * 16], m2
+
+ ; mode 31 [row 8]
+ pmaddubsw m2, m1, [r3 + 25 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 25 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 472 * 16], m2
+
+ ; mode 32 [row 6]
+ pmaddubsw m2, m1, [r3 + 19 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 19 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 486 * 16], m2
+
+ ; mode 30 [row 12]
+ movd m7, [r2 + 13]
+ palignr m7, m1, 2
+ pmaddubsw m2, m7, [r3 + 9 * 16]
+ pmulhrsw m2, m3
+ movd m6, [r2 + 21]
+ palignr m6, m4, 2
+ pmaddubsw m5, m6, [r3 + 9 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 460 * 16], m2
+
+ ; mode 30 [row 13]
+ pmaddubsw m2, m7, [r3 + 22 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 22 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 461 * 16], m2
+
+ ; mode 33 [row 6]
+ movu [r0 + 502 * 16], m2
+
+ ; mode 31 [row 9]
+ pmaddubsw m2, m7, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 10 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 473 * 16], m2
+
+ ; mode 31 [row 10]
+ pmaddubsw m2, m7, [r3 + 27 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 27 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 474 * 16], m2
+
+ ; mode 32 [row 7]
+ pmaddubsw m2, m7, [r3 + 8 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 8 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 487 * 16], m2
+
+ ; mode 32 [row 8]
+ pmaddubsw m2, m7, [r3 + 29 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 29 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 488 * 16], m2
+
+
+ movu m1, m7
+ movu m4, m6
+
+ ; mode 30 [row 14]  (NOTE(review): the "movu m1, m7" / "movu m4, m6" copies just above are dead — both registers are overwritten by the movd/palignr pairs below)
+ movd m1, [r2 + 14]
+ palignr m1, m7, 2
+ pmaddubsw m2, m1, [r3 + 3 * 16]
+ pmulhrsw m2, m3
+ movd m4, [r2 + 22]
+ palignr m4, m6, 2
+ pmaddubsw m5, m4, [r3 + 3 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 462 * 16], m2
+
+ ; mode 30 [row 15]
+ pmaddubsw m2, m1, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 463 * 16], m2
+
+ ; mode 33 [row 7]
+ movu [r0 + 503 * 16], m2
+
+ ; mode 31 [row 11]
+ pmaddubsw m2, m1, [r3 + 12 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 12 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 475 * 16], m2
+
+ ; mode 31 [row 12]
+ pmaddubsw m2, m1, [r3 + 29 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 29 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 476 * 16], m2
+
+ ; mode 32 [row 9]
+ pmaddubsw m2, m1, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 18 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 489 * 16], m2
+
+ ; mode 31 [row 13]
+ movd m7, [r2 + 15]
+ palignr m7, m1, 2
+ pmaddubsw m2, m7, [r3 + 14 * 16]
+ pmulhrsw m2, m3
+ movd m6, [r2 + 23]
+ palignr m6, m4, 2
+ pmaddubsw m5, m6, [r3 + 14 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 477 * 16], m2
+
+ ; mode 31 [row 14]
+ pmaddubsw m2, m7, [r3 + 31 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 31 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 478 * 16], m2
+
+ ; mode 32 [row 10]
+ pmaddubsw m2, m7, [r3 + 7 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 7 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 490 * 16], m2
+
+ ; mode 32 [row 11]
+ pmaddubsw m2, m7, [r3 + 28 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 28 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 491 * 16], m2
+
+ ; mode 33 [row 8]
+ pmaddubsw m2, m7, [r3 + 10 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 10 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 504 * 16], m2
+
+ ; mode 31 [row 15]
+ movd m1, [r2 + 16]
+ palignr m1, m7, 2
+ pmaddubsw m2, m1, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ movd m4, [r2 + 24]
+ palignr m4, m6, 2
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 479 * 16], m2
+
+ ; mode 32 [row 12]
+ pmaddubsw m2, m1, [r3 + 17 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 17 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 492 * 16], m2
+
+ ; mode 33 [row 9]
+ pmaddubsw m2, m1, [r3 + 4 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 4 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 505 * 16], m2
+
+ ; mode 33 [row 10]
+ pmaddubsw m2, m1, [r3 + 30 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 30 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 506 * 16], m2
+
+ ; NOTE(review): mislabeled "mode 33 [row 10]" — this block is a byte-for-byte repeat of mode 33 [row 9] (coeff [r3 + 4 * 16] stored to [r0 + 505 * 16], already written above); row 10 ([r3 + 30 * 16] -> 506 * 16) was also already emitted. Redundant duplicate block; candidate for removal.
+ pmaddubsw m2, m1, [r3 + 4 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 4 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 505 * 16], m2
+
+ ; mode 32 [row 13]
+ movd m7, [r2 + 17]
+ palignr m7, m1, 2
+ pmaddubsw m2, m7, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+
+ movd m6, [r2 + 25]
+ palignr m6, m4, 2
+ pmaddubsw m5, m6, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 493 * 16], m2
+
+ ; mode 32 [row 14]
+ pmaddubsw m2, m7, [r3 + 27 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 27 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 494 * 16], m2
+
+ ; mode 33 [row 11]
+ pmaddubsw m2, m7, [r3 + 24 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m6, [r3 + 24 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 507 * 16], m2
+
+ ; mode 32 [row 15]  (NOTE(review): the psrldq m4 / pinsrb sequence below is dead code — m4 is unconditionally overwritten by the following movd/palignr pair)
+ movd m1, [r2 + 18]
+ palignr m1, m7, 2
+ pmaddubsw m2, m1, [r3 + 16 * 16]
+ pmulhrsw m2, m3
+ psrldq m4, 2
+ pinsrb m4, [r2 + 26], 14
+ pinsrb m4, [r2 + 27], 15
+ movd m4, [r2 + 26]
+ palignr m4, m6, 2
+ pmaddubsw m5, m4, [r3 + 16 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 495 * 16], m2
+
+ ; mode 33 [row 12]
+ pmaddubsw m2, m1, [r3 + 18 * 16]
+ pmulhrsw m2, m3
+ pmaddubsw m5, m4, [r3 + 18 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 508 * 16], m2
+
+ ; mode 33 [row 13]
+ movd m7, [r2 + 19]
+ palignr m7, m1, 2
+ pmaddubsw m2, m7, [r3 + 12 * 16]
+ pmulhrsw m2, m3
+ movd m6, [r2 + 27]
+ palignr m6, m4, 2
+ pmaddubsw m5, m6, [r3 + 12 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 509 * 16], m2
+
+ ; mode 33 [row 14]
+ movd m1, [r2 + 20]
+ palignr m1, m7, 2
+ pmaddubsw m2, m1, [r3 + 6 * 16]
+ pmulhrsw m2, m3
+ movd m4, [r2 + 28]
+ palignr m4, m6, 2
+ pmaddubsw m5, m4, [r3 + 6 * 16]
+ pmulhrsw m5, m3
+ packuswb m2, m5
+ movu [r0 + 510 * 16], m2
+
+ ; mode 34 [row 0]
+ movu m1, [r2 + 2]
+ movu [r0 + 512 * 16], m1
+ movu m2, [r2 + 18]
+ palignr m3, m2, m1, 1
+ movu [r0 + 513 * 16], m3
+ palignr m3, m2, m1, 2
+ movu [r0 + 514 * 16], m3
+ palignr m3, m2, m1, 3
+ movu [r0 + 515 * 16], m3
+ palignr m3, m2, m1, 4
+ movu [r0 + 516 * 16], m3
+ palignr m3, m2, m1, 5
+ movu [r0 + 517 * 16], m3
+ palignr m3, m2, m1, 6
+ movu [r0 + 518 * 16], m3
+ palignr m3, m2, m1, 7
+ movu [r0 + 519 * 16], m3
+ palignr m3, m2, m1, 8
+ movu [r0 + 520 * 16], m3
+ palignr m3, m2, m1, 9
+ movu [r0 + 521 * 16], m3
+ palignr m3, m2, m1, 10
+ movu [r0 + 522 * 16], m3
+ palignr m3, m2, m1, 11
+ movu [r0 + 523 * 16], m3
+ palignr m3, m2, m1, 12
+ movu [r0 + 524 * 16], m3
+
+ ; mode 33 [row 15]
+ movu [r0 + 511 * 16], m3
+
+ ; mode 34
+ palignr m3, m2, m1, 13
+ movu [r0 + 525 * 16], m3
+ palignr m3, m2, m1, 14
+ movu [r0 + 526 * 16], m3
+ palignr m3, m2, m1, 15
+ movu [r0 + 527 * 16], m3
+ RET
-;-----------------------------------------------------------------------------
-; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
-;-----------------------------------------------------------------------------
+;--------------------------------------------------------------------------------
+; void all_angs_pred_32x32(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;--------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
-
-;mode 2[row 0]
-movu m0, [r4 + 2]
-movu [r0 + 0 * 16], m0
-movu m1, [r4 + 18]
-movu [r0 + 1 * 16], m1
-
-;mode 9 [row 15]
-movu [r0 + 478 * 16], m0
-movu [r0 + 479 * 16], m1
-
-;mode 2[row 1]
-movu m2, [r4 + 34]
-palignr m3, m1, m0, 1
-movu [r0 + 2 * 16], m3
-palignr m4, m2, m1, 1
-movu [r0 + 3 * 16], m4
-
-; mode 9 [row 31]
-movu [r0 + 510 * 16], m3
-movu [r0 + 511 * 16], m4
-
-;mode 2[row 17]
-movu [r0 + 34 * 16], m4
-movu m5, [r4 + 35]
-movu [r0 + 35 * 16], m5
-
-;mode 2[row 2]
-palignr m3, m1, m0, 2
-movu [r0 + 4 * 16], m3
-palignr m4, m2, m1, 2
-movu [r0 + 5 * 16], m4
-
-;mode 2[row 18]
-movu [r0 + 36 * 16], m4
-movu m6, [r4 + 51]
-palignr m7, m6, m5, 1
-movu [r0 + 37 * 16], m7
-
-;mode 2[row 3]
-palignr m3, m1, m0, 3
-movu [r0 + 6 * 16], m3
-palignr m4, m2, m1, 3
-movu [r0 + 7 * 16], m4
-
-;mode 2[row 19]
-movu [r0 + 38 * 16], m4
-palignr m7, m6, m5, 2
-movu [r0 + 39 * 16], m7
-
-;mode 2[row 4]
-palignr m3, m1, m0, 4
-movu [r0 + 8 * 16], m3
-palignr m4, m2, m1, 4
-movu [r0 + 9 * 16], m4
-
-; mode 8 [row 31]
-movu [r0 + 446 * 16], m3
-movu [r0 + 447 * 16], m4
-
-;mode 2[row 20]
-movu [r0 + 40 * 16], m4
-palignr m7, m6, m5, 3
-movu [r0 + 41 * 16], m7
-
-; mode 4 [row 31]
-movu [r0 + 190 * 16], m4
-movu [r0 + 191 * 16], m7
-
-;mode 2[row 5]
-palignr m3, m1, m0, 5
-movu [r0 + 10 * 16], m3
-palignr m4, m2, m1, 5
-movu [r0 + 11 * 16], m4
-
-;mode 2[row 21]
-movu [r0 + 42 * 16], m4
-palignr m7, m6, m5, 4
-movu [r0 + 43 * 16], m7
-
-;mode 2[row 6]
-palignr m3, m1, m0, 6
-movu [r0 + 12 * 16], m3
-palignr m4, m2, m1, 6
-movu [r0 + 13 * 16], m4
-
-;mode 2[row 22]
-movu [r0 + 44 * 16], m4
-palignr m7, m6, m5, 5
-movu [r0 + 45 * 16], m7
-
-;mode 2[row 7]
-palignr m3, m1, m0, 7
-movu [r0 + 14 * 16], m3
-palignr m4, m2, m1, 7
-movu [r0 + 15 * 16], m4
-
-;mode 2[row 23]
-movu [r0 + 46 * 16], m4
-palignr m7, m6, m5, 6
-movu [r0 + 47 * 16], m7
-
-;mode 2[row 8]
-palignr m3, m1, m0, 8
-movu [r0 + 16 * 16], m3
-palignr m4, m2, m1, 8
-movu [r0 + 17 * 16], m4
-
-;mode 7[row 31]
-movu [r0 + 382 * 16], m3
-movu [r0 + 383 * 16], m4
-
-;mode 2[row 24]
-movu [r0 + 48 * 16], m4
-palignr m7, m6, m5, 7
-movu [r0 + 49 * 16], m7
-
-;mode 2[row 9]
-palignr m3, m1, m0, 9
-movu [r0 + 18 * 16], m3
-palignr m4, m2, m1, 9
-movu [r0 + 19 * 16], m4
-
-;mode 2[row 25]
-movu [r0 + 50 * 16], m4
-palignr m7, m6, m5, 8
-movu [r0 + 51 * 16], m7
-
-; mode 3 [row 31]
-movu [r0 + 126 * 16], m4
-movu [r0 + 127 * 16], m7
-
-;mode 2[row 10]
-palignr m3, m1, m0, 10
-movu [r0 + 20 * 16], m3
-palignr m4, m2, m1, 10
-movu [r0 + 21 * 16], m4
-
-;mode 2[row 26]
-movu [r0 + 52 * 16], m4
-palignr m7, m6, m5, 9
-movu [r0 + 53 * 16], m7
-
-;mode 2[row 11]
-palignr m3, m1, m0, 11
-movu [r0 + 22 * 16], m3
-palignr m4, m2, m1, 11
-movu [r0 + 23 * 16], m4
-
-;mode 2[row 27]
-movu [r0 + 54 * 16], m4
-palignr m7, m6, m5, 10
-movu [r0 + 55 * 16], m7
-
-;mode 2[row 12]
-palignr m3, m1, m0, 12
-movu [r0 + 24 * 16], m3
-palignr m4, m2, m1, 12
-movu [r0 + 25 * 16], m4
-
-; mode 6 [row 31]
-movu [r0 + 318 * 16], m3
-movu [r0 + 319 * 16], m4
-
-; mode 3 [row 15]
-movu [r0 + 94 * 16], m3
-movu [r0 + 95 * 16], m4
-
-;mode 2[row 28]
-movu [r0 + 56 * 16], m4
-palignr m7, m6, m5, 11
-movu [r0 + 57 * 16], m7
-
-;mode 2[row 13]
-palignr m3, m1, m0, 13
-movu [r0 + 26 * 16], m3
-palignr m4, m2, m1, 13
-movu [r0 + 27 * 16], m4
-
-;mode 2[row 29]
-movu [r0 + 58 * 16], m4
-palignr m7, m6, m5, 12
-movu [r0 + 59 * 16], m7
-
-;mode 2[row 14]
-palignr m3, m1, m0, 14
-movu [r0 + 28 * 16], m3
-palignr m4, m2, m1, 14
-movu [r0 + 29 * 16], m4
-
-;mode 2[row 30]
-movu [r0 + 60 * 16], m4
-palignr m7, m6, m5, 13
-movu [r0 + 61 * 16], m7
-
-;mode 2[row 15]
-palignr m3, m1, m0, 15
-movu [r0 + 30 * 16], m3
-palignr m4, m2, m1, 15
-movu [r0 + 31 * 16], m4
-
-;mode 2[row 31]
-movu [r0 + 62 * 16], m4
-palignr m7, m6, m5, 14
-movu [r0 + 63 * 16], m7
-
-;mode 2[row 16]
-movu [r0 + 32 * 16], m1
-movu [r0 + 33 * 16], m2
-
-; mode 5[row 31]
-movu [r0 + 254 * 16], m1
-movu [r0 + 255 * 16], m2
-
-; mode 3 [row 0]
-lea r5, [ang_table]
-movu m6, [r5 + 26 * 16]
-movu m7, [pw_1024 ]
-movu m1, [r4 + 1 ]
-punpcklbw m1, m0
-pmaddubsw m0, m1, m6
-pmulhrsw m0, m7
-movu m2, [r4 + 9]
-movd m3, [r4 + 10]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m0, m3
-movu [r0 + 64 * 16], m0
-
-; mode 6 [row 1 - first half]
-movu [r0 + 258 * 16], m0
-
-; mode 9 [row 12 - first half]
-movu [r0 + 472 * 16], m0
-
-movu m0, [r4 + 17]
-movd m3, [r4 + 18]
-palignr m3, m0, 1
-punpcklbw m0, m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 25]
-movd m5, [r4 + 26]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 65 * 16], m3
-
-; mode 6 [row 1 - second half]
-movu [r0 + 259 * 16], m3
-
-; mode 9 [row 12 - second half]
-movu [r0 + 473 * 16], m3
-
-; mode 4 [row 0]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 128 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 129 * 16], m3
-
-; mode 5 [row 0]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 192 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 193 * 16], m3
-
-; mode 6 [row 0]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 256 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 257 * 16], m3
-
-; mode 7 [row 0]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 320 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 321 * 16], m3
-
-; mode 7 [row 1]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 322 * 16], m3
-
-; mode 9 [row 8 - first half]
-movu [r0 + 464 * 16], m3
-
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 323 * 16], m3
-
-; mode 9 [row 8 - second half]
-movu [r0 + 465 * 16], m3
-
-; mode 7 [row 2]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 324 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 325 * 16], m3
-
-; mode 8 [row 0]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 384 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 385 * 16], m3
-
-; mode 8 [row 1]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 386 * 16], m3
-
-; mode 9 [row 4 - first half]
-movu [r0 + 456 * 16], m3
-
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 387 * 16], m3
-
-; mode 9 [row 4 - second half]
-movu [r0 + 457 * 16], m3
-
-; mode 8 [row 2]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 388 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 389 * 16], m3
-
-; mode 8 [row 3]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 390 * 16], m3
-
-; mode 9 [row 9 - first half]
-movu [r0 + 466 * 16], m3
-
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 391 * 16], m3
-
-; mode 9 [row 9 - second half]
-movu [r0 + 467 * 16], m3
-
-; mode 8 [row 4]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 392 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 393 * 16], m3
-
-; mode 8 [row 5]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 394 * 16], m3
-
-; mode 9 [row 14 - first half]
-movu [r0 + 476 * 16], m3
-
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 395 * 16], m3
-
-; mode 9 [row 14 - second half]
-movu [r0 + 477 * 16], m3
-
-; mode 9 [row 0]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 448 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 449 * 16], m3
-
-; mode 9 [row 1]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 450 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 451 * 16], m3
-
-; mode 9 [row 2]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 452 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 453 * 16], m3
-
-; mode 9 [row 3]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 454 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 455 * 16], m3
-
-; mode 9 [row 5]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 458 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 459 * 16], m3
-
-; mode 9 [row 6]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 460 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 461 * 16], m3
-
-; mode 9 [row 7]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 462 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 463 * 16], m3
-
-; mode 9 [row 10]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 468 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 469 * 16], m3
-
-; mode 9 [row 11]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 470 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 471 * 16], m3
-
-; mode 9 [row 13]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 474 * 16], m3
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 475 * 16], m3
-
-; mode 3 [row 1]
-movu m6, [r5 + 20 * 16]
-movu m0, [r4 + 2]
-movd m1, [r4 + 3]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 10]
-movd m3, [r4 + 11]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 66 * 16], m1
-
-; mode 6 [row 3 - first half]
-movu [r0 + 262 * 16], m1
-
-; mode 9 [row 25 - first half]
-movu [r0 + 498 * 16], m1
-
-movu m1, [r4 + 18]
-movd m3, [r4 + 19]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 26]
-movd m5, [r4 + 27]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 67 * 16], m3
-
-; mode 6 [row 3 - second half]
-movu [r0 + 263 * 16], m3
-
-; mode 9 [row 25 - second half]
-movu [r0 + 499 * 16], m3
-
-; mode 4 [row 1]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 130 * 16], m3
-
-; mode 9 [row 20 - first half]
-movu [r0 + 488 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 131 * 16], m3
-
-; mode 9 [row 20 - second half]
-movu [r0 + 489 * 16], m3
-
-; mode 4 [row 2]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 132 * 16], m3
-
-; mode 7 [row 6 - first half]
-movu [r0 + 332 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 133 * 16], m3
-
-; mode 7 [row 6 - second half]
-movu [r0 + 333 * 16], m3
-
-; mode 5 [row 1]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 194 * 16], m3
-
-; mode 5 [row 1 - first half]
-movu [r0 + 480 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 195 * 16], m3
-
-; mode 5 [row 1 - second half]
-movu [r0 + 481 * 16], m3
-
-; mode 5 [row 2]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 196 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 197 * 16], m3
-
-; mode 6 [row 2]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 260 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 261 * 16], m3
-
-; mode 7 [row 3]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 326 * 16], m3
-
-; mode 9 [row 17 - first half]
-movu [r0 + 482 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 327 * 16], m3
-
-; mode 9 [row 17 - second half]
-movu [r0 + 483 * 16], m3
-
-; mode 7 [row 4]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 328 * 16], m3
-
-; mode 8 [row 8 - first half]
-movu [r0 + 400 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 329 * 16], m3
-
-; mode 8 [row 8 - second half]
-movu [r0 + 401 * 16], m3
-
-; mode 7 [row 5]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 330 * 16], m3
-
-; mode 9 [row 26 - first half]
-movu [r0 + 500 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 331 * 16], m3
-
-; mode 9 [row 26 - second half]
-movu [r0 + 501 * 16], m3
-
-; mode 8 [row 6]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 396 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 397 * 16], m3
-
-; mode 9 [row 18]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 484 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 485 * 16], m3
-
-; mode 9 [row 21]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 490 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 491 * 16], m3
-
-; mode 9 [row 22]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 492 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 493 * 16], m3
-
-; mode 9 [row 23]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 494 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 495 * 16], m3
-
-; mode 9 [row 27]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 502 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 503 * 16], m3
-
-; mode 9 [row 28]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 504 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 505 * 16], m3
-
-; mode 9 [row 30]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 508 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 509 * 16], m3
-
-; mode 8 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 398 * 16], m3
-
-; mode 9 [row 19 - first half]
-movu [r0 + 486 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 399 * 16], m3
-
-; mode 9 [row 19 - second half]
-movu [r0 + 487 * 16], m3
-
-; mode 8 [row 9]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 402 * 16], m3
-
-; mode 9 [row 24 - first half]
-movu [r0 + 496 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 403 * 16], m3
-
-; mode 9 [row 24 - second half]
-movu [r0 + 497 * 16], m3
-
-; mode 8 [row 10]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 404 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 405 * 16], m3
-
-; mode 8 [row 11]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 406 * 16], m3
-
-; mode 9 [row 29 - first half]
-movu [r0 + 506 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 407 * 16], m3
-
-; mode 9 [row 29 - second half]
-movu [r0 + 507 * 16], m3
-
-; mode 3 [row 2]
-movu m6, [r5 + 14 * 16]
-movu m0, [r4 + 3]
-movd m1, [r4 + 4]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 11]
-movd m3, [r4 + 12]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 68 * 16], m1
-
-; mode 3 [row 2 - first half]
-movu [r0 + 266 * 16], m1
-
-movu m1, [r4 + 19]
-movd m3, [r4 + 20]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 27]
-movd m5, [r4 + 28]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 69 * 16], m3
-
-; mode 3 [row 2 - second half]
-movu [r0 + 267 * 16], m3
-
-; mode 4 [row 3]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 134 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 135 * 16], m3
-
-; mode 5 [row 3]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 198 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 199 * 16], m3
-
-; mode 5 [row 4]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 200 * 16], m3
-
-; mode 8 [row 16 - first half]
-movu [r0 + 416 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 201 * 16], m3
-
-; mode 8 [row 16 - second half]
-movu [r0 + 417 * 16], m3
-
-; mode 6 [row 4]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 264 * 16], m3
-
-; mode 6 [row 4 - first half]
-movu [r0 + 408 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 265 * 16], m3
-
-; mode 6 [row 4 - second half]
-movu [r0 + 409 * 16], m3
-
-; mode 6 [row 6]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 268 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 269 * 16], m3
-
-; mode 7 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 334 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 335 * 16], m3
-
-; mode 7 [row 8]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 336 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 337 * 16], m3
-
-; mode 7 [row 9]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 338 * 16], m3
-
-; mode 8 [row 17 - first half]
-movu [r0 + 418 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 339 * 16], m3
-
-; mode 8 [row 17 - second half]
-movu [r0 + 419 * 16], m3
-
-; mode 8 [row 13]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 410 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 411 * 16], m3
-
-; mode 8 [row 14]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 412 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 413 * 16], m3
-
-; mode 8 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 414 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 415 * 16], m3
-
-; mode 8 [row 18]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 420 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 421 * 16], m3
-
-; mode 3 [row 3]
-movu m6, [r5 + 8 * 16]
-movu m0, [r4 + 4]
-movd m1, [r4 + 5]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 12]
-movd m3, [r4 + 13]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 70 * 16], m1
-
-; mode 6 [row 7 - first half]
-movu [r0 + 270 * 16], m1
-
-movu m1, [r4 + 20]
-movd m3, [r4 + 21]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 28]
-movd m5, [r4 + 29]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 71 * 16], m3
-
-; mode 6 [row 7 - second half]
-movu [r0 + 271 * 16], m3
-
-; mode 4 [row 4]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 136 * 16], m3
-
-; mode 4 [row 4 - first half]
-movu [r0 + 424 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 137 * 16], m3
-
-; mode 4 [row 4 - second half]
-movu [r0 + 425 * 16], m3
-
-; mode 4 [row 5]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 138 * 16], m3
-
-; mode 7 [row 13 - first half]
-movu [r0 + 346 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 139 * 16], m3
-
-; mode 7 [row 13 - second half]
-movu [r0 + 347 * 16], m3
-
-; mode 5 [row 5]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 202 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 203 * 16], m3
-
-; mode 5 [row 6]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 204 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 205 * 16], m3
-
-; mode 6 [row 8]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 272 * 16], m3
-
-; mode 7 [row 12 - first half]
-movu [r0 + 344 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 273 * 16], m3
-
-; mode 7 [row 12 - second half]
-movu [r0 + 345 * 16], m3
-
-; mode 7 [row 10]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 340 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 341 * 16], m3
-
-; mode 7 [row 11]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 342 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 343 * 16], m3
-
-; mode 8 [row 19]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 422 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 423 * 16], m3
-
-; mode 8 [row 21]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 426 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 427 * 16], m3
-
-; mode 8 [row 22]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 428 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 429 * 16], m3
-
-; mode 8 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 430 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 431 * 16], m3
-
-; mode 8 [row 24]
-movu m6, [r5 + 29 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 432 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 433 * 16], m3
-
-; mode 3 [row 4]
-movu m6, [r5 + 2 * 16]
-movu m0, [r4 + 5]
-movd m1, [r4 + 6]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 13]
-movd m3, [r4 + 14]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 72 * 16], m1
-
-; mode 3 [row 4 - first half]
-movu [r0 + 274 * 16], m1
-
-; mode 8 [row 25 - first half]
-movu [r0 + 434 * 16], m1
-
-movu m1, [r4 + 21]
-movd m3, [r4 + 22]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 29]
-movd m5, [r4 + 30]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 73 * 16], m3
-
-; mode 3 [row 4 - second half]
-movu [r0 + 275 * 16], m3
-
-; mode 8 [row 25 - second half]
-movu [r0 + 435 * 16], m3
-
-; mode 3 [row 5]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 74 * 16], m3
-
-; mode 3 [row 5 - first half]
-movu [r0 + 278 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 75 * 16], m3
-
-; mode 3 [row 5 - second half]
-movu [r0 + 279 * 16], m3
-
-; mode 4 [row 6]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 140 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 141 * 16], m3
-
-; mode 5 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 206 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 207 * 16], m3
-
-; mode 5 [row 8]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 208 * 16], m3
-
-; mode 7 [row 16 - first half]
-movu [r0 + 352 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 209 * 16], m3
-
-; mode 7 [row 16 - second half]
-movu [r0 + 353 * 16], m3
-
-; mode 6 [row 10]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 276 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 277 * 16], m3
-
-; mode 7 [row 14]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 348 * 16], m3
-
-; mode 8 [row 26 - first half]
-movu [r0 + 436 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 349 * 16], m3
-
-; mode 8 [row 26 - second half]
-movu [r0 + 437 * 16], m3
-
-; mode 7 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 350 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 351 * 16], m3
-
-; mode 8 [row 27]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 438 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 439 * 16], m3
-
-; mode 8 [row 28]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 440 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 441 * 16], m3
-
-; mode 8 [row 29]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 442 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 443 * 16], m3
-
-; mode 8 [row 30]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 444 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 445 * 16], m3
-
-; mode 3 [row 6]
-movu m6, [r5 + 22 * 16]
-movu m0, [r4 + 6]
-movd m1, [r4 + 7]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 14]
-movd m3, [r4 + 15]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 76 * 16], m1
-
-; mode 6 [row 13 - first half]
-movu [r0 + 282 * 16], m1
-
-movu m1, [r4 + 22]
-movd m3, [r4 + 23]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 30]
-movd m5, [r4 + 31]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 77 * 16], m3
-
-; mode 6 [row 13 - second half]
-movu [r0 + 283 * 16], m3
-
-; mode 4 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 142 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 143 * 16], m3
-
-; mode 4 [row 8]
-movu m6, [r5 + 29 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 144 * 16], m3
-
-; mode 4 [row 8 - first half]
-movu [r0 + 360 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 145 * 16], m3
-
-; mode 4 [row 8 - second half]
-movu [r0 + 361 * 16], m3
-
-; mode 5 [row 9]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 210 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 211 * 16], m3
-
-; mode 5 [row 10]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 212 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 213 * 16], m3
-
-; mode 7 [row 17]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 354 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 355 * 16], m3
-
-; mode 7 [row 18]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 356 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 357 * 16], m3
-
-; mode 7 [row 19]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 358 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 359 * 16], m3
-
-; mode 6 [row 12]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 280 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 281 * 16], m3
-
-; mode 3 [row 7]
-movu m6, [r5 + 16 * 16]
-movu m0, [r4 + 7]
-movd m1, [r4 + 8]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 15]
-movd m3, [r4 + 16]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 78 * 16], m1
-
-; mode 6 [row 15 - first half]
-movu [r0 + 286 * 16], m1
-
-movu m1, [r4 + 23]
-movd m3, [r4 + 24]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 31]
-movd m5, [r4 + 32]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 79 * 16], m3
-
-; mode 6 [row 15 - second half]
-movu [r0 + 287 * 16], m3
-
-; mode 4 [row 9]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 146 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 147 * 16], m3
-
-; mode 5 [row 11]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 214 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 215 * 16], m3
-
-; mode 5 [row 12]
-movu m6, [r5 + 29 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 216 * 16], m3
-
-; mode 6 [row 16 - first half]
-movu [r0 + 288 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 217 * 16], m3
-
-; mode 6 [row 16 - second half]
-movu [r0 + 289 * 16], m3
-
-; mode 6 [row 14]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 284 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 285 * 16], m3
-
-; mode 7 [row 21]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 362 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 363 * 16], m3
-
-; mode 7 [row 22]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 364 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 365 * 16], m3
-
-; mode 7 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 366 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 367 * 16], m3
-
-; mode 3 [row 8]
-movu m6, [r5 + 10 * 16]
-movu m0, [r4 + 8]
-movd m1, [r4 + 9]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 16]
-movd m3, [r4 + 17]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 80 * 16], m1
-
-; mode 7 [row 25 - first half]
-movu [r0 + 290 * 16], m1
-
-; mode 6 [row 17 - first half]
-movu [r0 + 370 * 16], m1
-
-movu m1, [r4 + 24]
-movd m3, [r4 + 25]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 32]
-movd m5, [r4 + 33]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 81 * 16], m3
-
-; mode 7 [row 25 - second half]
-movu [r0 + 291 * 16], m3
-
-; mode 6 [row 17 - second half]
-movu [r0 + 371 * 16], m3
-
-; mode 4 [row 10]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 148 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 149 * 16], m3
-
-; mode 4 [row 11]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 150 * 16], m3
-
-; mode 7 [row 27 - first half]
-movu [r0 + 374 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 151 * 16], m3
-
-; mode 7 [row 27 - second half]
-movu [r0 + 375 * 16], m3
-
-; mode 5 [row 13]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 218 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 219 * 16], m3
-
-; mode 5 [row 14]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 220 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 221 * 16], m3
-
-; mode 6 [row 18]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 292 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 293 * 16], m3
-
-; mode 7 [row 24]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 368 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 369 * 16], m3
-
-; mode 7 [row 26]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 372 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 373 * 16], m3
-
-; mode 3 [row 9]
-movu m6, [r5 + 4 * 16]
-movu m0, [r4 + 9]
-movd m1, [r4 + 10]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 17]
-movd m3, [r4 + 18]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 82 * 16], m1
-
-; mode 6 [row 19 - first half]
-movu [r0 + 294 * 16], m1
-
-movu m1, [r4 + 25]
-movd m3, [r4 + 26]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 33]
-movd m5, [r4 + 34]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 83 * 16], m3
-
-; mode 6 [row 19 - second half]
-movu [r0 + 295 * 16], m3
-
-; mode 4 [row 12]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 152 * 16], m3
-
-; mode 4 [row 12 - first half]
-movu [r0 + 296 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 153 * 16], m3
-
-; mode 4 [row 12 - second half]
-movu [r0 + 297 * 16], m3
-
-; mode 3 [row 10]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 84 * 16], m3
-
-; mode 6 [row 21 - first half]
-movu [r0 + 298 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 85 * 16], m3
-
-; mode 6 [row 21 - second half]
-movu [r0 + 299 * 16], m3
-
-; mode 5 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 222 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 223 * 16], m3
-
-; mode 7 [row 28]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 376 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 377 * 16], m3
-
-; mode 7 [row 29]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 378 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 379 * 16], m3
-
-; mode 7 [row 30]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 380 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 381 * 16], m3
-
-; mode 3 [row 11]
-movu m6, [r5 + 24 * 16]
-movu m0, [r4 + 10]
-movd m1, [r4 + 11]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 18]
-movd m3, [r4 + 19]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 86 * 16], m1
-
-; mode 6 [row 23 - first half]
-movu [r0 + 302 * 16], m1
-
-movu m1, [r4 + 26]
-movd m3, [r4 + 27]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 34]
-movd m5, [r4 + 35]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 87 * 16], m3
-
-; mode 6 [row 23 - second half]
-movu [r0 + 303 * 16], m3
-
-; mode 4 [row 13]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 154 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 155 * 16], m3
-
-; mode 4 [row 14]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 156 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 157 * 16], m3
-
-; mode 5 [row 16]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 224 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 225 * 16], m3
-
-; mode 5 [row 17]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 226 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 227 * 16], m3
-
-; mode 6 [row 22]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 300 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 301 * 16], m3
-
-; mode 3 [row 12]
-movu m6, [r5 + 18 * 16]
-movu m0, [r4 + 11]
-movd m1, [r4 + 12]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 19]
-movd m3, [r4 + 20]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 88 * 16], m1
-
-; mode 6 [row 25 - first half]
-movu [r0 + 306 * 16], m1
-
-movu m1, [r4 + 27]
-movd m3, [r4 + 28]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 35]
-movd m5, [r4 + 36]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 89 * 16], m3
-
-; mode 6 [row 25 - second half]
-movu [r0 + 307 * 16], m3
-
-; mode 4 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 158 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 159 * 16], m3
-
-; mode 5 [row 18]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 228 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 229 * 16], m3
-
-; mode 5 [row 19]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 230 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 231 * 16], m3
-
-; mode 6 [row 24]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 304 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 305 * 16], m3
-
-; mode 6 [row 26]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 308 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 309 * 16], m3
-
-; mode 3 [row 13]
-movu m6, [r5 + 12 * 16]
-movu m0, [r4 + 12]
-movd m1, [r4 + 13]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 20]
-movd m3, [r4 + 21]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 90 * 16], m1
-
-movu m1, [r4 + 28]
-movd m3, [r4 + 29]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 36]
-movd m5, [r4 + 37]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 91 * 16], m3
-
-; mode 4 [row 16]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 160 * 16], m3
-
-; mode 5 [row 20 - first half]
-movu [r0 + 232 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 161 * 16], m3
-
-; mode 5 [row 20 - second half]
-movu [r0 + 233 * 16], m3
-
-; mode 4 [row 17]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 162 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 163 * 16], m3
-
-; mode 5 [row 21]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 234 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 235 * 16], m3
-
-; mode 6 [row 27]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 310 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 311 * 16], m3
-
-; mode 6 [row 28]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 312 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 313 * 16], m3
-
-; mode 3 [row 14]
-movu m6, [r5 + 6 * 16]
-movu m0, [r4 + 13]
-movd m1, [r4 + 14]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 21]
-movd m3, [r4 + 22]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 92 * 16], m1
-
-; mode 6 [row 29 - first half]
-movu [r0 + 314 * 16], m1
-
-movu m1, [r4 + 29]
-movd m3, [r4 + 30]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 37]
-movd m5, [r4 + 38]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 93 * 16], m3
-
-; mode 6 [row 29 - second half]
-movu [r0 + 315 * 16], m3
-
-; mode 4 [row 18]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 164 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 165 * 16], m3
-
-; mode 5 [row 22]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 236 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 237 * 16], m3
-
-; mode 5 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 238 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 239 * 16], m3
-
-; mode 6 [row 30]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 316 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 317 * 16], m3
-
-; mode 3 [row 16]
-movu m6, [r5 + 26 * 16]
-movu m0, [r4 + 14]
-movd m1, [r4 + 15]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 22]
-movd m3, [r4 + 23]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 96 * 16], m1
-
-; mode 5 [row 25 - first half]
-movu [r0 + 242 * 16], m1
-
-movu m1, [r4 + 30]
-movd m3, [r4 + 31]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 38]
-movd m5, [r4 + 39]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 97 * 16], m3
-
-; mode 5 [row 25 - second half]
-movu [r0 + 243 * 16], m3
-
-; mode 4 [row 19]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 166 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 167 * 16], m3
-
-; mode 4 [row 20]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 168 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 169 * 16], m3
-
-; mode 5 [row 24]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 240 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 241 * 16], m3
-
-; mode 3 [row 17]
-movu m6, [r5 + 20 * 16]
-movu m0, [r4 + 15]
-movd m1, [r4 + 16]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 23]
-movd m3, [r4 + 24]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 98 * 16], m1
-
-movu m1, [r4 + 31]
-movd m3, [r4 + 32]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 39]
-movd m5, [r4 + 40]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 99 * 16], m3
-
-; mode 4 [row 21]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 170 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 171 * 16], m3
-
-; mode 5 [row 26]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 244 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 245 * 16], m3
-
-; mode 5 [row 27]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 246 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 247 * 16], m3
-
-; mode 3 [row 18]
-movu m6, [r5 + 14 * 16]
-movu m0, [r4 + 16]
-movd m1, [r4 + 17]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 24]
-movd m3, [r4 + 25]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 100 * 16], m1
-
-movu m1, [r4 + 32]
-movd m3, [r4 + 33]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 40]
-movd m5, [r4 + 41]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 101 * 16], m3
-
-; mode 4 [row 22]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 172 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 173 * 16], m3
-
-; mode 4 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 174 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 175 * 16], m3
-
-; mode 5 [row 28]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 248 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 249 * 16], m3
-
-; mode 5 [row 29]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 250 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 251 * 16], m3
-
-; mode 3 [row 19]
-movu m6, [r5 + 8 * 16]
-movu m0, [r4 + 17]
-movd m1, [r4 + 18]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 25]
-movd m3, [r4 + 26]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 102 * 16], m1
-
-movu m1, [r4 + 33]
-movd m3, [r4 + 34]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 41]
-movd m5, [r4 + 42]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 103 * 16], m3
-
-; mode 4 [row 24]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 176 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 177 * 16], m3
-
-; mode 5 [row 30]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 252 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 253 * 16], m3
-
-; mode 3 [row 20]
-movu m6, [r5 + 2 * 16]
-movu m0, [r4 + 18]
-movd m1, [r4 + 19]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 26]
-movd m3, [r4 + 27]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 104 * 16], m1
-
-movu m1, [r4 + 34]
-movd m3, [r4 + 35]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 42]
-movd m5, [r4 + 43]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 105 * 16], m3
-
-; mode 4 [row 25]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 178 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 179 * 16], m3
-
-; mode 4 [row 26]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 180 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 181 * 16], m3
-
-; mode 3 [row 21]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 106 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 107 * 16], m3
-
-; mode 3 [row 22]
-movu m6, [r5 + 22 * 16]
-movu m0, [r4 + 19]
-movd m1, [r4 + 20]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 27]
-movd m3, [r4 + 28]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 108 * 16], m1
-
-movu m1, [r4 + 35]
-movd m3, [r4 + 36]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 43]
-movd m5, [r4 + 44]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 109 * 16], m3
-
-; mode 4 [row 27]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 182 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 183 * 16], m3
-
-; mode 3 [row 23]
-movu m6, [r5 + 16 * 16]
-movu m0, [r4 + 20]
-movd m1, [r4 + 21]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 28]
-movd m3, [r4 + 29]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 110 * 16], m1
-
-movu m1, [r4 + 36]
-movd m3, [r4 + 37]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 44]
-movd m5, [r4 + 45]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 111 * 16], m3
-
-; mode 4 [row 28]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 184 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 185 * 16], m3
-
-; mode 4 [row 29]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 186 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 187 * 16], m3
-
-; mode 3 [row 24]
-movu m6, [r5 + 10 * 16]
-movu m0, [r4 + 21]
-movd m1, [r4 + 22]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 29]
-movd m3, [r4 + 30]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 112 * 16], m1
-
-movu m1, [r4 + 37]
-movd m3, [r4 + 38]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 45]
-movd m5, [r4 + 46]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 113 * 16], m3
-
-; mode 4 [row 30]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 188 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 189 * 16], m3
-
-; mode 3 [row 25]
-movu m6, [r5 + 4 * 16]
-movu m0, [r4 + 22]
-movd m1, [r4 + 23]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 30]
-movd m3, [r4 + 31]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 114 * 16], m1
-
-movu m1, [r4 + 38]
-movd m3, [r4 + 39]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 46]
-movd m5, [r4 + 47]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 115 * 16], m3
-
-; mode 3 [row 26]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 116 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 117 * 16], m3
-
-; mode 3 [row 27]
-movu m6, [r5 + 24 * 16]
-movu m0, [r4 + 23]
-movd m1, [r4 + 24]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 31]
-movd m3, [r4 + 32]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 118 * 16], m1
-
-movu m1, [r4 + 39]
-movd m3, [r4 + 40]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 47]
-movd m5, [r4 + 48]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 119 * 16], m3
-
-; mode 3 [row 28]
-movu m6, [r5 + 18 * 16]
-movu m0, [r4 + 24]
-movd m1, [r4 + 25]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 32]
-movd m3, [r4 + 33]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 120 * 16], m1
-
-movu m1, [r4 + 40]
-movd m3, [r4 + 41]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 48]
-movd m5, [r4 + 49]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 121 * 16], m3
-
-; mode 3 [row 29]
-movu m6, [r5 + 12 * 16]
-movu m0, [r4 + 25]
-movd m1, [r4 + 26]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 33]
-movd m3, [r4 + 34]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 122 * 16], m1
-
-movu m1, [r4 + 41]
-movd m3, [r4 + 42]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 49]
-movd m5, [r4 + 50]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 123 * 16], m3
-
-; mode 3 [row 30]
-movu m6, [r5 + 6 * 16]
-movu m0, [r4 + 26]
-movd m1, [r4 + 27]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r4 + 34]
-movd m3, [r4 + 35]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 124 * 16], m1
-
-movu m1, [r4 + 42]
-movd m3, [r4 + 43]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r4 + 50]
-movd m5, [r4 + 51]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 125 * 16], m3
-
-; mode 10
-movu m1, [r2 + 1]
-movu m2, [r2 + 17]
-movu [r0 + 512 * 16], m1
-movu [r0 + 513 * 16], m2
-movu [r0 + 514 * 16], m1
-movu [r0 + 515 * 16], m2
-movu [r0 + 516 * 16], m1
-movu [r0 + 517 * 16], m2
-movu [r0 + 518 * 16], m1
-movu [r0 + 519 * 16], m2
-movu [r0 + 520 * 16], m1
-movu [r0 + 521 * 16], m2
-movu [r0 + 522 * 16], m1
-movu [r0 + 523 * 16], m2
-movu [r0 + 524 * 16], m1
-movu [r0 + 525 * 16], m2
-movu [r0 + 526 * 16], m1
-movu [r0 + 527 * 16], m2
-
-movu [r0 + 528 * 16], m1
-movu [r0 + 529 * 16], m2
-movu [r0 + 530 * 16], m1
-movu [r0 + 531 * 16], m2
-movu [r0 + 532 * 16], m1
-movu [r0 + 533 * 16], m2
-movu [r0 + 534 * 16], m1
-movu [r0 + 535 * 16], m2
-movu [r0 + 536 * 16], m1
-movu [r0 + 537 * 16], m2
-movu [r0 + 538 * 16], m1
-movu [r0 + 539 * 16], m2
-movu [r0 + 540 * 16], m1
-movu [r0 + 541 * 16], m2
-movu [r0 + 542 * 16], m1
-movu [r0 + 543 * 16], m2
-
-movu [r0 + 544 * 16], m1
-movu [r0 + 545 * 16], m2
-movu [r0 + 546 * 16], m1
-movu [r0 + 547 * 16], m2
-movu [r0 + 548 * 16], m1
-movu [r0 + 549 * 16], m2
-movu [r0 + 550 * 16], m1
-movu [r0 + 551 * 16], m2
-movu [r0 + 552 * 16], m1
-movu [r0 + 553 * 16], m2
-movu [r0 + 554 * 16], m1
-movu [r0 + 555 * 16], m2
-movu [r0 + 556 * 16], m1
-movu [r0 + 557 * 16], m2
-movu [r0 + 558 * 16], m1
-movu [r0 + 559 * 16], m2
-
-movu [r0 + 560 * 16], m1
-movu [r0 + 561 * 16], m2
-movu [r0 + 562 * 16], m1
-movu [r0 + 563 * 16], m2
-movu [r0 + 564 * 16], m1
-movu [r0 + 565 * 16], m2
-movu [r0 + 566 * 16], m1
-movu [r0 + 567 * 16], m2
-movu [r0 + 568 * 16], m1
-movu [r0 + 569 * 16], m2
-movu [r0 + 570 * 16], m1
-movu [r0 + 571 * 16], m2
-movu [r0 + 572 * 16], m1
-movu [r0 + 573 * 16], m2
-movu [r0 + 574 * 16], m1
-movu [r0 + 575 * 16], m2
-
-; mode 11 [row 0]
-movu m0, [r4]
-
-; mode 11 [row 15 - first half]
-movu [r0 + 606 * 16], m0
-
-movu [r0 + 606 * 16], m0
-
-; mode 12 [row 31]
-pslldq m6, m0, 4
-pinsrb m6, [r3 + 26], 0
-pinsrb m6, [r3 + 19], 1
-pinsrb m6, [r3 + 13], 2
-pinsrb m6, [r3 + 6], 3
-movu [r0 + 702 * 16], m6
-movu m6, [r4 + 12]
-movu [r0 + 703 * 16], m6
-
-; mode 11 [row 31]
-pslldq m6, m0, 1
-pinsrb m6, [r3 + 16], 0
-movu [r0 + 638 * 16], m6
-movu m6, [r4 + 15]
-movu [r0 + 639 * 16], m6
-
-movd m1, [r4 + 1]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m1, m0, [r5 + 30 * 16]
-pmulhrsw m1, m7
-movu m2, [r4 + 8]
-movd m3, [r4 + 9]
-palignr m3, m2, 1
-punpcklbw m2, m3
-pmaddubsw m3, m2, [r5 + 30 * 16]
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 576 * 16], m1
-
-movu m1, [r4 + 16]
-
-; mode 11 [row 15 - second half]
-movu [r0 + 607 * 16], m1
-
-movd m3, [r4 + 17]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, m7
-movu m4, [r4 + 24]
-movd m5, [r4 + 25]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 577 * 16], m3
-
-; mode 11 [row 1]
-pmaddubsw m3, m0, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 578 * 16], m3
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 579 * 16], m3
-
-; mode 11 [row 2]
-pmaddubsw m3, m0, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 580 * 16], m3
-pmaddubsw m3, m1, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 581 * 16], m3
-
-; mode 11 [row 3]
-pmaddubsw m3, m0, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 582 * 16], m3
-pmaddubsw m3, m1, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 583 * 16], m3
-
-; mode 11 [row 4]
-pmaddubsw m3, m0, [r5 + 22 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 22 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 584 * 16], m3
-
-; mode 12 [row 1 - first half]
-movu [r0 + 642 * 16], m3
-
-pmaddubsw m3, m1, [r5 + 22 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 22 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 585 * 16], m3
-
-; mode 12 [row 1 - second half]
-movu [r0 + 643 * 16], m3
-
-; mode 11 [row 5]
-pmaddubsw m3, m0, [r5 + 20 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 586 * 16], m3
-pmaddubsw m3, m1, [r5 + 20 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 587 * 16], m3
-
-; mode 11 [row 6]
-pmaddubsw m3, m0, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 588 * 16], m3
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 589 * 16], m3
-
-; mode 11 [row 7]
-pmaddubsw m3, m0, [r5 + 16 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 590 * 16], m3
-pmaddubsw m3, m1, [r5 + 16 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 591 * 16], m3
-
-; mode 11 [row 8]
-pmaddubsw m3, m0, [r5 + 14 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 592 * 16], m3
-
-; mode 13 [row 1 - first half]
-movu [r0 + 706 * 16], m3
-
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 593 * 16], m3
-
-; mode 13 [row 1 - second half]
-movu [r0 + 707 * 16], m3
-
-; mode 11 [row 9]
-pmaddubsw m3, m0, [r5 + 12 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 12 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 594 * 16], m3
-
-; mode 12 [row 3 - first half]
-movu [r0 + 646 * 16], m3
-
-pmaddubsw m3, m1, [r5 + 12 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 595 * 16], m3
-
-; mode 12 [row 3 - second half]
-movu [r0 + 647 * 16], m3
-
-; mode 11 [row 10]
-pmaddubsw m3, m0, [r5 + 10 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 10 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 596 * 16], m3
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 597 * 16], m3
-
-; mode 11 [row 11]
-pmaddubsw m3, m0, [r5 + 8 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 598 * 16], m3
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 599 * 16], m3
-
-; mode 11 [row 12]
-pmaddubsw m3, m0, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 600 * 16], m3
-
-; mode 14 [row 1 - first half]
-movu [r0 + 770 * 16], m3
-
-pmaddubsw m3, m1, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 601 * 16], m3
-
-; mode 14 [row 1 - second half]
-movu [r0 + 771 * 16], m3
-
-; mode 11 [row 13]
-pmaddubsw m3, m0, [r5 + 4 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 602 * 16], m3
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 603 * 16], m3
-
-; mode 11 [row 14]
-pmaddubsw m3, m0, [r5 + 2 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 2 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 604 * 16], m3
-
-; mode 13 [row 5 - first half]
-movu [r0 + 650 * 16], m3
-
-pmaddubsw m3, m1, [r5 + 2 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 605 * 16], m3
-
-; mode 13 [row 5 - second half]
-movu [r0 + 651 * 16], m3
-
-; mode 12 [row 0]
-pmaddubsw m3, m0, [r5 + 27 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 27 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 640 * 16], m3
-pmaddubsw m3, m1, [r5 + 27 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 27 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 641 * 16], m3
-
-; mode 12 [row 2]
-pmaddubsw m3, m0, [r5 + 17 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 17 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 644 * 16], m3
-pmaddubsw m3, m1, [r5 + 17 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 645 * 16], m3
-
-; mode 12 [row 4]
-pmaddubsw m3, m0, [r5 + 7 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 7 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 648 * 16], m3
-pmaddubsw m3, m1, [r5 + 7 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 7 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 649 * 16], m3
-
-; mode 13 [row 0]
-pmaddubsw m3, m0, [r5 + 23 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 23 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 704 * 16], m3
-pmaddubsw m3, m1, [r5 + 23 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 23 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 705 * 16], m3
-
-; mode 13 [row 2]
-pmaddubsw m3, m0, [r5 + 5 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 5 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 708 * 16], m3
-pmaddubsw m3, m1, [r5 + 5 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 5 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 709 * 16], m3
-
-; mode 14 [row 0]
-pmaddubsw m3, m0, [r5 + 19 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 19 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 768 * 16], m3
-pmaddubsw m3, m1, [r5 + 19 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 19 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 769 * 16], m3
-
-; mode 15 [row 0]
-pmaddubsw m3, m0, [r5 + 15 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 15 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 832 * 16], m3
-pmaddubsw m3, m1, [r5 + 15 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 833 * 16], m3
-
-; mode 11 [row 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r3 + 16], 0
-pmaddubsw m3, m0, [r5 + 30 * 16]
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 8], 1
-pinsrb m2, [r4 + 7], 0
-pmaddubsw m5, m2, [r5 + 30 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 608 * 16], m3
-pslldq m1, 2
-pinsrb m1, [r4 + 16], 1
-pinsrb m1, [r4 + 15], 0
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrb m4, [r4 + 24], 1
-pinsrb m4, [r4 + 23], 0
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 609 * 16], m3
-
-; mode 11 [row 17]
-pmaddubsw m3, m0, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 610 * 16], m3
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 611 * 16], m3
-
-; mode 11 [row 18]
-pmaddubsw m3, m0, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 612 * 16], m3
-pmaddubsw m3, m1, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 613 * 16], m3
-
-; mode 11 [row 19]
-pmaddubsw m3, m0, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 614 * 16], m3
-pmaddubsw m3, m1, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 615 * 16], m3
-
-; mode 11 [row 20]
-pmaddubsw m3, m0, [r5 + 22 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 22 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 616 * 16], m3
-pmaddubsw m3, m1, [r5 + 22 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 22 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 617 * 16], m3
-
-; mode 11 [row 21]
-pmaddubsw m3, m0, [r5 + 20 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 618 * 16], m3
-pmaddubsw m3, m1, [r5 + 20 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 619 * 16], m3
-
-; mode 11 [row 22]
-pmaddubsw m3, m0, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 620 * 16], m3
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 621 * 16], m3
-
-; mode 11 [row 23]
-pmaddubsw m3, m0, [r5 + 16 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 622 * 16], m3
-pmaddubsw m3, m1, [r5 + 16 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 623 * 16], m3
-
-; mode 11 [row 24]
-pmaddubsw m3, m0, [r5 + 14 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 624 * 16], m3
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 625 * 16], m3
-
-; mode 11 [row 25]
-pmaddubsw m3, m0, [r5 + 12 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 12 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 626 * 16], m3
-pmaddubsw m3, m1, [r5 + 12 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 627 * 16], m3
-
-; mode 11 [row 26]
-pmaddubsw m3, m0, [r5 + 10 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 10 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 628 * 16], m3
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 629 * 16], m3
-
-; mode 11 [row 27]
-pmaddubsw m3, m0, [r5 + 8 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 630 * 16], m3
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 631 * 16], m3
-
-; mode 11 [row 28]
-pmaddubsw m3, m0, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 632 * 16], m3
-pmaddubsw m3, m1, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 633 * 16], m3
-
-; mode 11 [row 29]
-pmaddubsw m3, m0, [r5 + 4 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 634 * 16], m3
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 635 * 16], m3
-
-; mode 11 [row 30]
-pmaddubsw m3, m0, [r5 + 2 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 2 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 636 * 16], m3
-pmaddubsw m3, m1, [r5 + 2 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 637 * 16], m3
-
-; mode 12 [row 6]
-pinsrb m0, [r3 + 6], 0
-pmaddubsw m3, m0, [r5 + 29 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 29 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 652 * 16], m3
-pmaddubsw m3, m1, [r5 + 29 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 29 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 653 * 16], m3
-
-; mode 12 [row 7]
-pmaddubsw m3, m0, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 654 * 16], m3
-pmaddubsw m3, m1, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 655 * 16], m3
-
-; mode 12 [row 8]
-pmaddubsw m3, m0, [r5 + 19 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 19 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 656 * 16], m3
-pmaddubsw m3, m1, [r5 + 19 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 19 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 657 * 16], m3
-
-; mode 12 [row 9]
-pmaddubsw m3, m0, [r5 + 14 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 658 * 16], m3
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 659 * 16], m3
-
-; mode 12 [row 10]
-pmaddubsw m3, m0, [r5 + 9 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 9 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 660 * 16], m3
-pmaddubsw m3, m1, [r5 + 9 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 9 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 661 * 16], m3
-
-; mode 12 [row 11]
-pmaddubsw m3, m0, [r5 + 4 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 662 * 16], m3
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 663 * 16], m3
-
-; mode 13 [row 3]
-movu m6, m0
-pinsrb m6, [r3 + 4], 0
-pmaddubsw m3, m6, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 710 * 16], m3
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 711 * 16], m3
-
-; mode 13 [row 4]
-pmaddubsw m3, m6, [r5 + 19 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 19 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 712 * 16], m3
-pmaddubsw m3, m1, [r5 + 19 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 19 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 713 * 16], m3
-
-; mode 13 [row 5]
-pmaddubsw m3, m6, [r5 + 10 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 10 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 714 * 16], m3
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 715 * 16], m3
-
-; mode 13 [row 6]
-pmaddubsw m3, m6, [r5 + 1 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 1 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 716 * 16], m3
-pmaddubsw m3, m1, [r5 + 1 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 1 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 717 * 16], m3
-
-; mode 14 [row 2]
-movu m6, m0
-pinsrb m6, [r4 + 0], 1
-pinsrb m6, [r3 + 2], 0
-pmaddubsw m3, m6, [r5 + 25 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 25 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 772 * 16], m3
-pmaddubsw m3, m1, [r5 + 25 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 773 * 16], m3
-
-; mode 14 [row 3]
-pmaddubsw m3, m6, [r5 + 12 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 12 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 774 * 16], m3
-pmaddubsw m3, m1, [r5 + 12 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 775 * 16], m3
-
-; mode 15 [row 1]
-pmaddubsw m3, m6, [r5 + 30 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 30 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 834 * 16], m3
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 835 * 16], m3
-
-; mode 15 [row 2]
-pmaddubsw m3, m6, [r5 + 13 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 13 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 836 * 16], m3
-pmaddubsw m3, m1, [r5 + 13 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 13 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 837 * 16], m3
-
-; mode 15 [row 3]
-pslldq m6, 2
-pinsrb m6, [r3 + 2], 1
-pinsrb m6, [r3 + 4], 0
-pmaddubsw m3, m6, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 7], 1
-pinsrb m2, [r4 + 6], 0
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 838 * 16], m3
-pslldq m1, 2
-pinsrb m1, [r4 + 15], 1
-pinsrb m1, [r4 + 14], 0
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrb m4, [r4 + 23], 1
-pinsrb m4, [r4 + 22], 0
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 839 * 16], m3
-
-; mode 15 [row 4]
-pmaddubsw m3, m6, [r5 + 11 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 11 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 840 * 16], m3
-pmaddubsw m3, m1, [r5 + 11 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 11 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 841 * 16], m3
-
-; mode 15 [row 5, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 4], 1
-pinsrb m6, [r3 + 6], 0
-pmaddubsw m3, m6, [r5 + 26 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 842 * 16], m3
-
-; mode 15 [row 6, 0-7]
-pmaddubsw m3, m6, [r5 + 9 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 844 * 16], m3
-
-; mode 15 [row 7, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 6], 1
-pinsrb m6, [r3 + 8], 0
-pmaddubsw m3, m6, [r5 + 24 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 846 * 16], m3
-
-; mode 15 [row 8, 0-7]
-pmaddubsw m3, m6, [r5 + 7 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 848 * 16], m3
-
-; mode 15 [row 9, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 8], 1
-pinsrb m6, [r3 + 9], 0
-pmaddubsw m3, m6, [r5 + 22 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 850 * 16], m3
-
-; mode 15 [row 10, 0-7]
-pmaddubsw m3, m6, [r5 + 5 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 852 * 16], m3
-
-; mode 15 [row 11, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 9], 1
-pinsrb m6, [r3 + 11], 0
-pmaddubsw m3, m6, [r5 + 20 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 854 * 16], m3
-
-; mode 15 [row 12, 0-7]
-pmaddubsw m3, m6, [r5 + 3 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 856 * 16], m3
-
-; mode 15 [row 13, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 11], 1
-pinsrb m6, [r3 + 13], 0
-pmaddubsw m3, m6, [r5 + 18 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 858 * 16], m3
-
-; mode 15 [row 14, 0-7]
-pmaddubsw m3, m6, [r5 + 1 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 860 * 16], m3
-
-; mode 15 [row 15, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 13], 1
-pinsrb m6, [r3 + 15], 0
-pmaddubsw m3, m6, [r5 + 16 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 862 * 16], m3
-
-; mode 15 [row 16, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 15], 1
-pinsrb m6, [r3 + 17], 0
-pmaddubsw m3, m6, [r5 + 31 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 864 * 16], m3
-
-; mode 15 [row 17, 0-7]
-pmaddubsw m3, m6, [r5 + 14 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 866 * 16], m3
-
-; mode 15 [row 18, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 17], 1
-pinsrb m6, [r3 + 19], 0
-pmaddubsw m3, m6, [r5 + 29 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 868 * 16], m3
-
-; mode 15 [row 19, 0-7]
-pmaddubsw m3, m6, [r5 + 12 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 870 * 16], m3
-
-; mode 15 [row 20, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 19], 1
-pinsrb m6, [r3 + 21], 0
-pmaddubsw m3, m6, [r5 + 27 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 872 * 16], m3
-
-; mode 15 [row 21, 0-7]
-pmaddubsw m3, m6, [r5 + 10 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 874 * 16], m3
-
-; mode 15 [row 22, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 21], 1
-pinsrb m6, [r3 + 23], 0
-pmaddubsw m3, m6, [r5 + 25 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 876 * 16], m3
-
-; mode 15 [row 23, 0-7]
-pmaddubsw m3, m6, [r5 + 8 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 878 * 16], m3
-
-; mode 15 [row 24, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 23], 1
-pinsrb m6, [r3 + 24], 0
-pmaddubsw m3, m6, [r5 + 23 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 880 * 16], m3
-
-; mode 15 [row 25, 0-7]
-pmaddubsw m3, m6, [r5 + 6 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 882 * 16], m3
-
-; mode 15 [row 26, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 24], 1
-pinsrb m6, [r3 + 26], 0
-pmaddubsw m3, m6, [r5 + 21 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 884 * 16], m3
-
-; mode 15 [row 27, 0-7]
-pmaddubsw m3, m6, [r5 + 4 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 886 * 16], m3
-
-; mode 15 [row 28, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 26], 1
-pinsrb m6, [r3 + 28], 0
-pmaddubsw m3, m6, [r5 + 19 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 888 * 16], m3
-
-; mode 15 [row 29, 0-7]
-pmaddubsw m3, m6, [r5 + 2 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 890 * 16], m3
-
-; mode 15 [row 30, 0-7]
-pslldq m6, 2
-pinsrb m6, [r3 + 28], 1
-pinsrb m6, [r3 + 30], 0
-pmaddubsw m3, m6, [r5 + 17 * 16]
-pmulhrsw m3, m7
-packuswb m3, m3
-movh [r0 + 892 * 16], m3
-
-; mode 15 [row 31, 0-7]
-pshufb m3, m6, [tab_S2]
-movh [r0 + 894 * 16], m3
-
-; mode 12 [row 12]
-pslldq m0, 2
-pinsrb m0, [r3 + 6], 1
-pinsrb m0, [r3 + 13], 0
-pmaddubsw m3, m0, [r5 + 31 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 31 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 664 * 16], m3
-pmaddubsw m3, m1, [r5 + 31 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 31 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 665 * 16], m3
-
-; mode 12 [row 13]
-pmaddubsw m3, m0, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 666 * 16], m3
-pmaddubsw m3, m1, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 667 * 16], m3
-
-; mode 12 [row 14]
-pmaddubsw m3, m0, [r5 + 21 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 21 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 668 * 16], m3
-pmaddubsw m3, m1, [r5 + 21 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 669 * 16], m3
-
-; mode 12 [row 15]
-pmaddubsw m3, m0, [r5 + 16 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 670 * 16], m3
-pmaddubsw m3, m1, [r5 + 16 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 671 * 16], m3
-
-; mode 12 [row 16]
-pmaddubsw m3, m0, [r5 + 11 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 11 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 672 * 16], m3
-pmaddubsw m3, m1, [r5 + 11 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 11 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 673 * 16], m3
-
-; mode 12 [row 17]
-pmaddubsw m3, m0, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 674 * 16], m3
-pmaddubsw m3, m1, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 675 * 16], m3
-
-; mode 12 [row 18]
-pmaddubsw m3, m0, [r5 + 1 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 1 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 676 * 16], m3
-pmaddubsw m3, m1, [r5 + 1 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 1 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 677 * 16], m3
-
-; mode 13 [row 7]
-movu m6, m0
-pinsrb m6, [r3 + 4], 2
-pinsrb m6, [r3 + 4], 1
-pinsrb m6, [r3 + 7], 0
-pmaddubsw m3, m6, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 718 * 16], m3
-pmaddubsw m3, m1, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 719 * 16], m3
-
-; mode 13 [row 8]
-pmaddubsw m3, m6, [r5 + 15 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 15 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 720 * 16], m3
-pmaddubsw m3, m1, [r5 + 15 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 721 * 16], m3
-
-; mode 13 [row 9]
-pmaddubsw m3, m6, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 722 * 16], m3
-pmaddubsw m3, m1, [r5 + 6 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 723 * 16], m3
-
-; mode 14 [row 4]
-pinsrb m6, [r3 + 2], 2
-pinsrb m6, [r3 + 2], 1
-pinsrb m6, [r3 + 5], 0
-pmaddubsw m3, m6, [r5 + 31 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 31 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 776 * 16], m3
-pmaddubsw m3, m1, [r5 + 31 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 31 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 777 * 16], m3
-
-; mode 14 [row 5]
-pmaddubsw m3, m6, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 778 * 16], m3
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 779 * 16], m3
-
-; mode 14 [row 6]
-pmaddubsw m3, m6, [r5 + 5 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 5 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 780 * 16], m3
-pmaddubsw m3, m1, [r5 + 5 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 5 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 781 * 16], m3
-
-; mode 14 [row 7]
-pslldq m6, 2
-pinsrb m6, [r3 + 5], 1
-pinsrb m6, [r3 + 7], 0
-pmaddubsw m3, m6, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 5], 0
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 782 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 13], 0
-pmaddubsw m3, m1, [r5 + 24 * 16]
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 21], 0
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 783 * 16], m3
-
-; mode 14 [row 8]
-pmaddubsw m3, m6, [r5 + 11 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 11 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 784 * 16], m3
-pmaddubsw m3, m1, [r5 + 11 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 11 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 785 * 16], m3
-
-; mode 15 [row 5, 8-31]
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m5, m5
-movh [r0 + 842 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 26 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 843 * 16], m3
-
-; mode 15 [row 6, 8-31]
-pmaddubsw m5, m2, [r5 + 9 * 16]
-pmulhrsw m5, m7
-packuswb m5, m5
-movh [r0 + 844 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 9 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 9 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 845 * 16], m3
-
-; mode 12 [row 19]
-pslldq m0, 2
-pinsrb m0, [r3 + 13], 1
-pinsrb m0, [r3 + 19], 0
-pmaddubsw m3, m0, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 678 * 16], m3
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 679 * 16], m3
-
-; mode 12 [row 20]
-pmaddubsw m3, m0, [r5 + 23 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 23 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 680 * 16], m3
-pmaddubsw m3, m1, [r5 + 23 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 23 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 681 * 16], m3
-
-; mode 12 [row 21]
-pmaddubsw m3, m0, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 682 * 16], m3
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 683 * 16], m3
-
-; mode 12 [row 22]
-pmaddubsw m3, m0, [r5 + 13 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 13 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 684 * 16], m3
-pmaddubsw m3, m1, [r5 + 13 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 13 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 685 * 16], m3
-
-; mode 12 [row 23]
-pmaddubsw m3, m0, [r5 + 8 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 686 * 16], m3
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 687 * 16], m3
-
-; mode 12 [row 24]
-pmaddubsw m3, m0, [r5 + 3 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m2, [r5 + 3 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 688 * 16], m3
-pmaddubsw m3, m1, [r5 + 3 * 16]
-pmulhrsw m3, m7
-pmaddubsw m5, m4, [r5 + 3 * 16]
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 689 * 16], m3
-
-; mode 13 [row 10]
-movu m7, m6
-movu m6, m0
-pinsrb m6, [r3 + 4], 4
-pinsrb m6, [r3 + 4], 3
-pinsrb m6, [r3 + 7], 2
-pinsrb m6, [r3 + 7], 1
-pinsrb m6, [r3 + 11], 0
-pmaddubsw m3, m6, [r5 + 29 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 29 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 724 * 16], m3
-pmaddubsw m3, m1, [r5 + 29 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 29 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 725 * 16], m3
-
-; mode 13 [row 11]
-pmaddubsw m3, m6, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 726 * 16], m3
-pmaddubsw m3, m1, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 727 * 16], m3
-
-; mode 13 [row 12]
-pmaddubsw m3, m6, [r5 + 11 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 11 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 728 * 16], m3
-pmaddubsw m3, m1, [r5 + 11 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 11 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 729 * 16], m3
-
-; mode 13 [row 13]
-pmaddubsw m3, m6, [r5 + 2 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 2 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 730 * 16], m3
-pmaddubsw m3, m1, [r5 + 2 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 731 * 16], m3
-
-; mode 14 [row 9]
-pslldq m7, 2
-pinsrb m7, [r3 + 7], 1
-pinsrb m7, [r3 + 10], 0
-pmaddubsw m3, m7, [r5 + 30 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrw m2, [r4 + 4], 0
-pmaddubsw m5, m2, [r5 + 30 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 786 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 12], 0
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrb m4, [r4 + 21], 1
-pinsrb m4, [r4 + 20], 0
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 787 * 16], m3
-
-; mode 14 [row 10]
-pmaddubsw m3, m7, [r5 + 17 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 17 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 788 * 16], m3
-pmaddubsw m3, m1, [r5 + 17 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 789 * 16], m3
-
-; mode 14 [row 11]
-pmaddubsw m3, m7, [r5 + 4 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 790 * 16], m3
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 791 * 16], m3
-
-movu m6, [pw_1024]
-
-; mode 15 [row 7, 8-31]
-pmaddubsw m5, m2, [r5 + 24 * 16]
-pmulhrsw m5, m6
-packuswb m5, m5
-movh [r0 + 846 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 24 * 16]
-pmulhrsw m3, m6
-pmaddubsw m5, m4, [r5 + 24 * 16]
-pmulhrsw m5, m6
-packuswb m3, m5
-movu [r0 + 847 * 16], m3
-
-; mode 15 [row 8, 8-31]
-pmaddubsw m5, m2, [r5 + 7 * 16]
-pmulhrsw m5, m6
-packuswb m5, m5
-movh [r0 + 848 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 7 * 16]
-pmulhrsw m3, m6
-pmaddubsw m5, m4, [r5 + 7 * 16]
-pmulhrsw m5, m6
-packuswb m3, m5
-movu [r0 + 849 * 16], m3
-
-; mode 12 [row 25]
-pslldq m0, 2
-pinsrb m0, [r3 + 19], 1
-pinsrb m0, [r3 + 26], 0
-pmaddubsw m3, m0, [r5 + 30 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 30 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 690 * 16], m3
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 691 * 16], m3
-
-; mode 12 [row 26]
-pmaddubsw m3, m0, [r5 + 25 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 25 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 692 * 16], m3
-pmaddubsw m3, m1, [r5 + 25 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 693 * 16], m3
-
-; mode 12 [row 27]
-pmaddubsw m3, m0, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 694 * 16], m3
-pmaddubsw m3, m1, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 695 * 16], m3
-
-; mode 12 [row 28]
-pmaddubsw m3, m0, [r5 + 15 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 15 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 696 * 16], m3
-pmaddubsw m3, m1, [r5 + 15 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 697 * 16], m3
-
-; mode 12 [row 29]
-pmaddubsw m3, m0, [r5 + 10 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 10 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 698 * 16], m3
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 699 * 16], m3
-
-; mode 12 [row 30]
-pmaddubsw m3, m0, [r5 + 5 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 5 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 700 * 16], m3
-pmaddubsw m3, m1, [r5 + 5 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 5 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 701 * 16], m3
-
-; mode 13 [row 14]
-movu m6, m0
-pinsrb m6, [r3 + 4], 6
-pinsrb m6, [r3 + 4], 5
-pinsrb m6, [r3 + 7], 4
-pinsrb m6, [r3 + 7], 3
-pinsrb m6, [r3 + 11], 2
-pinsrb m6, [r3 + 11], 1
-pinsrb m6, [r3 + 14], 0
-pmaddubsw m3, m6, [r5 + 25 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 25 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 732 * 16], m3
-pmaddubsw m3, m1, [r5 + 25 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 733 * 16], m3
-
-; mode 13 [row 15]
-pmaddubsw m3, m6, [r5 + 16 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 734 * 16], m3
-pmaddubsw m3, m1, [r5 + 16 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 735 * 16], m3
-
-; mode 13 [row 16]
-pmaddubsw m3, m6, [r5 + 7 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 7 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 736 * 16], m3
-pmaddubsw m3, m1, [r5 + 7 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 7 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 737 * 16], m3
-
-; mode 13 [row 17]
-pslldq m6, 2
-pinsrb m6, [r3 + 14], 1
-pinsrb m6, [r3 + 18], 0
-pmaddubsw m3, m6, [r5 + 30 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrw m2, [r4 + 3], 0
-pmaddubsw m5, m2, [r5 + 30 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 738 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 11], 0
-pmaddubsw m3, m1, [r5 + 30 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 19], 0
-pmaddubsw m5, m4, [r5 + 30 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 739 * 16], m3
-
-; mode 13 [row 18]
-pmaddubsw m3, m6, [r5 + 21 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 21 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 740 * 16], m3
-pmaddubsw m3, m1, [r5 + 21 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 741 * 16], m3
-
-; mode 13 [row 19]
-pmaddubsw m3, m6, [r5 + 12 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 12 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 742 * 16], m3
-pmaddubsw m3, m1, [r5 + 12 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 743 * 16], m3
-
-; mode 13 [row 20]
-pmaddubsw m3, m6, [r5 + 3 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 3 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 744 * 16], m3
-pmaddubsw m3, m1, [r5 + 3 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 3 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 745 * 16], m3
-
-; mode 14 [row 12]
-pslldq m7, 2
-pinsrb m7, [r3 + 10], 1
-pinsrb m7, [r3 + 12], 0
-pmaddubsw m3, m7, [r5 + 23 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 23 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 792 * 16], m3
-pmaddubsw m3, m1, [r5 + 23 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 23 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 793 * 16], m3
-
-; mode 14 [row 13]
-pmaddubsw m3, m7, [r5 + 10 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 10 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 794 * 16], m3
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 795 * 16], m3
-
-; mode 15 [row 9]
-pmaddubsw m5, m2, [r5 + 22 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movu [r0 + 850 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 22 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 22 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 851 * 16], m3
-
-; mode 15 [row 10]
-pmaddubsw m5, m2, [r5 + 5 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movu [r0 + 852 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 5 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 5 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 853 * 16], m3
-
-; mode 13 [row 21]
-pslldq m6, 2
-pinsrb m6, [r3 + 18], 1
-pinsrb m6, [r3 + 21], 0
-pmaddubsw m3, m6, [r5 + 26 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrw m2, [r4 + 2], 0
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 746 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 10], 0
-pmaddubsw m3, m1, [r5 + 26 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 18], 0
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 747 * 16], m3
-
-; mode 13 [row 22]
-pmaddubsw m3, m6, [r5 + 17 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 17 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 748 * 16], m3
-pmaddubsw m3, m1, [r5 + 17 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 749 * 16], m3
-
-; mode 13 [row 23]
-pmaddubsw m3, m6, [r5 + 8 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 750 * 16], m3
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 751 * 16], m3
-
-; mode 14 [row 14]
-pslldq m7, 2
-pinsrb m7, [r3 + 12], 1
-pinsrb m7, [r3 + 15], 0
-pmaddubsw m3, m7, [r5 + 29 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 29 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 796 * 16], m3
-pmaddubsw m3, m1, [r5 + 29 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 29 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 797 * 16], m3
-
-; mode 14 [row 15]
-pmaddubsw m3, m7, [r5 + 16 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 798 * 16], m3
-pmaddubsw m3, m1, [r5 + 16 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 799 * 16], m3
-
-; mode 14 [row 16]
-pmaddubsw m3, m7, [r5 + 3 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 3 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 800 * 16], m3
-pmaddubsw m3, m1, [r5 + 3 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 3 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 801 * 16], m3
-
-; mode 15 [row 11]
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 854 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 855 * 16], m3
-
-; mode 15 [row 12]
-pmaddubsw m5, m2, [r5 + 3 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 856 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 3 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 3 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 857 * 16], m3
-
-; mode 13 [row 24]
-pslldq m6, 2
-pinsrb m6, [r3 + 21], 1
-pinsrb m6, [r3 + 25], 0
-pmaddubsw m3, m6, [r5 + 31 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrw m2, [r4 + 1], 0
-pmaddubsw m5, m2, [r5 + 31 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 752 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 9], 0
-pmaddubsw m3, m1, [r5 + 31 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 17], 0
-pmaddubsw m5, m4, [r5 + 31 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 753 * 16], m3
-
-; mode 13 [row 25]
-pmaddubsw m3, m6, [r5 + 22 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 22 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 754 * 16], m3
-pmaddubsw m3, m1, [r5 + 22 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 22 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 755 * 16], m3
-
-; mode 13 [row 26]
-pmaddubsw m3, m6, [r5 + 13 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 13 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 756 * 16], m3
-pmaddubsw m3, m1, [r5 + 13 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 13 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 757 * 16], m3
-
-; mode 13 [row 27]
-pmaddubsw m3, m6, [r5 + 4 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 4 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 758 * 16], m3
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 759 * 16], m3
-
-; mode 14 [row 17]
-pslldq m7, 2
-pinsrb m7, [r3 + 15], 1
-pinsrb m7, [r3 + 17], 0
-pmaddubsw m3, m7, [r5 + 22 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 22 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 802 * 16], m3
-pmaddubsw m3, m1, [r5 + 22 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 22 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 803 * 16], m3
-
-; mode 14 [row 18]
-pmaddubsw m3, m7, [r5 + 9 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 9 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 804 * 16], m3
-pmaddubsw m3, m1, [r5 + 9 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 9 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 805 * 16], m3
-
-; mode 15 [row 13]
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 858 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 859 * 16], m3
-
-; mode 15 [row 14]
-pmaddubsw m5, m2, [r5 + 1 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 860 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 1 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 1 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 861 * 16], m3
-
-; mode 13 [row 28]
-pslldq m6, 2
-pinsrb m6, [r3 + 25], 1
-pinsrb m6, [r3 + 28], 0
-pmaddubsw m3, m6, [r5 + 27 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrw m2, [r4 + 0], 0
-pmaddubsw m5, m2, [r5 + 27 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 760 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 8], 0
-pmaddubsw m3, m1, [r5 + 27 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 16], 0
-pmaddubsw m5, m4, [r5 + 27 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 761 * 16], m3
-
-; mode 13 [row 29]
-pmaddubsw m3, m6, [r5 + 18 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 18 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 762 * 16], m3
-pmaddubsw m3, m1, [r5 + 18 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 18 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 763 * 16], m3
-
-; mode 13 [row 30]
-pmaddubsw m3, m6, [r5 + 9 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 9 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 764 * 16], m3
-pmaddubsw m3, m1, [r5 + 9 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 9 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 765 * 16], m3
-
-; mode 14 [row 19]
-pslldq m7, 2
-pinsrb m7, [r3 + 17], 1
-pinsrb m7, [r3 + 20], 0
-pmaddubsw m3, m7, [r5 + 28 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 28 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 806 * 16], m3
-pmaddubsw m3, m1, [r5 + 28 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 28 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 807 * 16], m3
-
-; mode 14 [row 20]
-pmaddubsw m3, m7, [r5 + 15 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 15 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 808 * 16], m3
-pmaddubsw m3, m1, [r5 + 15 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 15 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 809 * 16], m3
-
-; mode 14 [row 21]
-pmaddubsw m3, m7, [r5 + 2 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 2 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 810 * 16], m3
-pmaddubsw m3, m1, [r5 + 2 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 811 * 16], m3
-
-; mode 15 [row 15]
-pmaddubsw m5, m2, [r5 + 16 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 862 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 16 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 16 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 863 * 16], m3
-
-; mode 14 [row 22]
-pslldq m7, 2
-pinsrb m7, [r3 + 20], 1
-pinsrb m7, [r3 + 22], 0
-pmaddubsw m3, m7, [r5 + 21 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrb m2, [r4 + 0], 1
-pinsrb m2, [r3 + 2], 0
-pmaddubsw m5, m2, [r5 + 21 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 812 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 7], 0
-pmaddubsw m3, m1, [r5 + 21 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 15], 0
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 813 * 16], m3
-
-; mode 14 [row 23]
-pmaddubsw m3, m7, [r5 + 8 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 8 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 814 * 16], m3
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 815 * 16], m3
-
-; mode 15 [row 16]
-pmaddubsw m5, m2, [r5 + 31 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 864 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 31 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 31 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 865 * 16], m3
-
-; mode 15 [row 17]
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 866 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 867 * 16], m3
-
-; mode 14 [row 24]
-pslldq m7, 2
-pinsrb m7, [r3 + 22], 1
-pinsrb m7, [r3 + 25], 0
-pmaddubsw m3, m7, [r5 + 27 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrb m2, [r3 + 2], 1
-pinsrb m2, [r3 + 5], 0
-pmaddubsw m5, m2, [r5 + 27 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 816 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 6], 0
-pmaddubsw m3, m1, [r5 + 27 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 14], 0
-pmaddubsw m5, m4, [r5 + 27 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 817 * 16], m3
-
-; mode 14 [row 25]
-pmaddubsw m3, m7, [r5 + 14 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 14 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 818 * 16], m3
-pmaddubsw m3, m1, [r5 + 14 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 14 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 819 * 16], m3
-
-; mode 14 [row 26]
-pmaddubsw m3, m7, [r5 + 1 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 1 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 820 * 16], m3
-pmaddubsw m3, m1, [r5 + 1 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 1 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 821 * 16], m3
-
-; mode 15 [row 18]
-pinsrb m2, [r3 + 4], 0
-pmaddubsw m5, m2, [r5 + 29 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 868 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 29 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 29 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 869 * 16], m3
-
-; mode 15 [row 19]
-pmaddubsw m5, m2, [r5 + 12 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 870 * 16 + 8], m5
-pmaddubsw m3, m1, [r5 + 12 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 12 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 871 * 16], m3
-
-; mode 15 [row 20 - 8 to 15]
-pslldq m3, m2, 2
-pinsrb m3, [r3 + 4], 1
-pinsrb m3, [r3 + 6], 0
-pmaddubsw m5, m3, [r5 + 27 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 872 * 16 + 8], m5
-
-; mode 15 [row 21 - 8 to 15]
-pmaddubsw m5, m3, [r5 + 10 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 874 * 16 + 8], m5
-
-; mode 15 [row 22 - 8 to 15]
-pslldq m3, 2
-pinsrb m3, [r3 + 6], 1
-pinsrb m3, [r3 + 8], 0
-pmaddubsw m5, m3, [r5 + 25 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 876 * 16 + 8], m5
-
-; mode 15 [row 23 - 8 to 15]
-pmaddubsw m5, m3, [r5 + 8 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 878 * 16 + 8], m5
-
-; mode 15 [row 24 - 8 to 15]
-pslldq m3, 2
-pinsrb m3, [r3 + 8], 1
-pinsrb m3, [r3 + 9], 0
-pmaddubsw m5, m3, [r5 + 23 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 880 * 16 + 8], m5
-
-; mode 15 [row 25 - 8 to 15]
-pmaddubsw m5, m3, [r5 + 6 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 882 * 16 + 8], m5
-
-; mode 15 [row 26 - 8 to 15]
-pslldq m3, 2
-pinsrb m3, [r3 + 9], 1
-pinsrb m3, [r3 + 11], 0
-pmaddubsw m5, m3, [r5 + 21 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 884 * 16 + 8], m5
-
-; mode 15 [row 27 - 8 to 15]
-pmaddubsw m5, m3, [r5 + 4 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 886 * 16 + 8], m5
-
-; mode 15 [row 28 - 8 to 15]
-pslldq m3, 2
-pinsrb m3, [r3 + 11], 1
-pinsrb m3, [r3 + 13], 0
-pmaddubsw m5, m3, [r5 + 19 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 888 * 16 + 8], m5
-
-; mode 15 [row 29 - 8 to 15]
-pmaddubsw m5, m3, [r5 + 2 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 890 * 16 + 8], m5
-
-; mode 15 [row 30 - 8 to 15]
-pslldq m3, 2
-pinsrb m3, [r3 + 13], 1
-pinsrb m3, [r3 + 15], 0
-pmaddubsw m5, m3, [r5 + 17 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m5, m5
-movh [r0 + 892 * 16 + 8], m5
-
-; mode 15 [row 31, 8 to 15]
-pshufb m5, m3, [tab_S2]
-movh [r0 + 894 * 16 + 8], m5
-
-; mode 14 [row 27]
-pinsrb m2, [r3 + 5], 0
-pslldq m7, 2
-pinsrb m7, [r3 + 25], 1
-pinsrb m7, [r3 + 27], 0
-pmaddubsw m3, m7, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrb m2, [r3 + 5], 1
-pinsrb m2, [r3 + 7], 0
-pmaddubsw m5, m2, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 822 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 5], 0
-pmaddubsw m3, m1, [r5 + 20 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 13], 0
-pmaddubsw m5, m4, [r5 + 20 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 823 * 16], m3
-
-; mode 15 [row 20 - 16 to 31]
-pmaddubsw m3, m1, [r5 + 27 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 27 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 873 * 16], m3
-
-; mode 15 [row 21 - 16 to 31]
-pmaddubsw m3, m1, [r5 + 10 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 10 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 875 * 16], m3
-
-; mode 14 [row 28]
-pmaddubsw m3, m7, [r5 + 7 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 7 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 824 * 16], m3
-pmaddubsw m3, m1, [r5 + 7 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 7 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 825 * 16], m3
-
-; mode 14 [row 29]
-pslldq m7, 2
-pinsrb m7, [r3 + 27], 1
-pinsrb m7, [r3 + 30], 0
-pmaddubsw m3, m7, [r5 + 26 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m2, 2
-pinsrb m2, [r3 + 7], 1
-pinsrb m2, [r3 + 10], 0
-pmaddubsw m5, m2, [r5 + 26 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 826 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 4], 0
-pmaddubsw m3, m1, [r5 + 26 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 12], 0
-pmaddubsw m5, m4, [r5 + 26 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 827 * 16], m3
-
-; mode 14 [row 30]
-pmaddubsw m3, m7, [r5 + 13 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m2, [r5 + 13 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 828 * 16], m3
-pmaddubsw m3, m1, [r5 + 13 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 13 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 829 * 16], m3
-
-; mode 15 [row 22]
-pmaddubsw m3, m1, [r5 + 25 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 25 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 877 * 16], m3
-
-; mode 15 [row 23]
-pmaddubsw m3, m1, [r5 + 8 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 8 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 879 * 16], m3
-
-; mode 14 [row 31]
-pshufb m3, m7, [tab_S2]
-movh [r0 + 830 * 16], m3
-pshufb m3, m2, [tab_S2]
-movh [r0 + 830 * 16 + 8], m3
-pshufb m3, m1, [tab_S2]
-movh [r0 + 831 * 16], m3
-pshufb m3, m4, [tab_S2]
-movh [r0 + 831 * 16 + 8], m3
-
-; mode 13 [row 31]
-pshufb m0, m6, [tab_S2]
-movh [r0 + 766 * 16], m0
-movh m0, [r4]
-movh [r0 + 766 * 16 + 8], m0
-movu m0, [r4 + 8]
-movu [r0 + 767 * 16], m0
-
-; mode 15 [row 24]
-pslldq m1, 2
-pinsrw m1, [r4 + 3], 0
-pmaddubsw m3, m1, [r5 + 23 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 11], 0
-pmaddubsw m5, m4, [r5 + 23 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 881 * 16], m3
-
-; mode 15 [row 25]
-pmaddubsw m3, m1, [r5 + 6 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 6 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 883 * 16], m3
-
-; mode 15 [row 26]
-pslldq m1, 2
-pinsrw m1, [r4 + 2], 0
-pmaddubsw m3, m1, [r5 + 21 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 10], 0
-pmaddubsw m5, m4, [r5 + 21 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 885 * 16], m3
-
-; mode 15 [row 27]
-pmaddubsw m3, m1, [r5 + 4 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 4 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 887 * 16], m3
-
-; mode 15 [row 28]
-pslldq m1, 2
-pinsrw m1, [r4 + 1], 0
-pmaddubsw m3, m1, [r5 + 19 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 9], 0
-pmaddubsw m5, m4, [r5 + 19 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 889 * 16], m3
-
-; mode 15 [row 29]
-pmaddubsw m3, m1, [r5 + 2 * 16]
-pmulhrsw m3, [pw_1024]
-pmaddubsw m5, m4, [r5 + 2 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 891 * 16], m3
-
-; mode 15 [row 30]
-pslldq m1, 2
-pinsrw m1, [r4 + 0], 0
-pmaddubsw m3, m1, [r5 + 17 * 16]
-pmulhrsw m3, [pw_1024]
-pslldq m4, 2
-pinsrw m4, [r4 + 8], 0
-pmaddubsw m5, m4, [r5 + 17 * 16]
-pmulhrsw m5, [pw_1024]
-packuswb m3, m5
-movu [r0 + 893 * 16], m3
-
-; mode 15 [row 31]
-pshufb m5, m1, [tab_S2]
-movh [r0 + 895 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 895 * 16 + 8], m5
-
-; mode 16 [row 0]
-movu m6, [r5 + 11 * 16]
-movu m7, [pw_1024]
-movh m0, [r4 ]
-movh m1, [r4 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movh m2, [r4 + 8]
-movh m3, [r4 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 896 * 16], m1
-
-movh m1, [r4 + 16]
-movh m3, [r4 + 17]
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movh m4, [r4 + 24]
-movh m5, [r4 + 25]
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 897 * 16], m3
-
-; mode16 [row 1]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r4], 1
-pinsrb m0, [r3 + 2], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 898 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 15], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 23], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 899 * 16], m3
-
-; mode16 [row 2]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 900 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 901 * 16], m3
-
-; mode16 [row 3]
-movu m6, [r5 + 12 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 2], 1
-pinsrb m0, [r3 + 3], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 902 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 14], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 22], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 903 * 16], m3
-
-; mode16 [row 4]
-movu m6, [r5 + 23 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 3], 1
-pinsrb m0, [r3 + 5], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 904 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 13], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 21], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 905 * 16], m3
-
-; mode16 [row 5]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 906 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 907 * 16], m3
-
-; mode16 [row 6]
-movu m6, [r5 + 13 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 5], 1
-pinsrb m0, [r3 + 6], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 5], 1
-pinsrb m2, [r4 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 908 * 16], m3
-pslldq m1, 2
-pinsrw m1, [r4 + 12], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 20], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 909 * 16], m3
-
-; mode16 [row 7]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 6], 1
-pinsrb m0, [r3 + 8], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 910 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 11], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 19], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 911 * 16], m3
-
-; mode16 [row 8]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 912 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 913 * 16], m3
-
-; mode16 [row 9]
-movu m6, [r5 + 14 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 8], 1
-pinsrb m0, [r3 + 9], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 914 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 10], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 18], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 915 * 16], m3
-
-; mode16 [row 10]
-movu m6, [r5 + 25 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 9], 1
-pinsrb m0, [r3 + 11], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 916 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 9], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrb m4, [r4 + 18], 1
-pinsrb m4, [r4 + 17], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 917 * 16], m3
-
-; mode16 [row 11]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 918 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 919 * 16], m3
-
-; mode16 [row 12]
-movu m6, [r5 + 15 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 11], 1
-pinsrb m0, [r3 + 12], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 920 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 8], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 16], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 921 * 16], m3
-
-; mode16 [row 13]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 12], 1
-pinsrb m0, [r3 + 14], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 0], 1
-pinsrb m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 922 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 7], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 15], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 923 * 16], m3
-
-; mode16 [row 14]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 924 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 925 * 16], m3
-
-; mode16 [row 15]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 14], 1
-pinsrb m0, [r3 + 15], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 2], 1
-pinsrb m2, [r3 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 926 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 6], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 14], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 927 * 16], m3
-
-; mode16 [row 16]
-movu m6, [r5 + 27 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 15], 1
-pinsrb m0, [r3 + 17], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 3], 1
-pinsrb m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 928 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 5], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 13], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 929 * 16], m3
-
-; mode16 [row 17]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 930 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 931 * 16], m3
-
-; mode16 [row 18]
-movu m6, [r5 + 17 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 17], 1
-pinsrb m0, [r3 + 18], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 5], 1
-pinsrb m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 932 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 4], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 12], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 933 * 16], m3
-
-; mode16 [row 19]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 18], 1
-pinsrb m0, [r3 + 20], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 6], 1
-pinsrb m2, [r3 + 8], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 934 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 3], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 11], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 935 * 16], m3
-
-; mode16 [row 20]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 936 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 937 * 16], m3
-
-; mode16 [row 21]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 20], 1
-pinsrb m0, [r3 + 21], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 8], 1
-pinsrb m2, [r3 + 9], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 938 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 2], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 10], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 939 * 16], m3
-
-; mode16 [row 22]
-movu m6, [r5 + 29 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 21], 1
-pinsrb m0, [r3 + 23], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 9], 1
-pinsrb m2, [r3 + 11], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 940 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 1], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 9], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 941 * 16], m3
-
-; mode16 [row 23]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 942 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 943 * 16], m3
-
-; mode16 [row 24]
-movu m6, [r5 + 19 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 23], 1
-pinsrb m0, [r3 + 24], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 11], 1
-pinsrb m2, [r3 + 12], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 944 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 0], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 8], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 945 * 16], m3
-
-; mode16 [row 25]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 24], 1
-pinsrb m0, [r3 + 26], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 12], 1
-pinsrb m2, [r3 + 14], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 946 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r4 + 0], 1
-pinsrb m1, [r3 + 2], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 7], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 947 * 16], m3
-
-; mode16 [row 26]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 948 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 949 * 16], m3
-
-; mode16 [row 27]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 26], 1
-pinsrb m0, [r3 + 27], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 14], 1
-pinsrb m2, [r3 + 15], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 950 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 2], 1
-pinsrb m1, [r3 + 3], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 6], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 951 * 16], m3
-
-; mode16 [row 28]
-movu m6, [r5 + 31 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 27], 1
-pinsrb m0, [r3 + 29], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 15], 1
-pinsrb m2, [r3 + 17], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 952 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 3], 1
-pinsrb m1, [r3 + 5], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 5], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 953 * 16], m3
-
-; mode16 [row 29]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 954 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 955 * 16], m3
-
-; mode16 [row 30]
-movu m6, [r5 + 21 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 29], 1
-pinsrb m0, [r3 + 30], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 17], 1
-pinsrb m2, [r3 + 18], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 956 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 5], 1
-pinsrb m1, [r3 + 6], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 4], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 957 * 16], m3
-
-; mode16 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 958 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 958 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 959 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 959 * 16 + 8], m5
-
-; mode 17 [row 0]
-movu m6, [r5 + 6 * 16]
-movu m7, [pw_1024]
-movh m0, [r4 ]
-movh m1, [r4 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movh m2, [r4 + 8]
-movh m3, [r4 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 960 * 16], m1
-
-movh m1, [r4 + 16]
-movh m3, [r4 + 17]
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movh m4, [r4 + 24]
-movh m5, [r4 + 25]
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 961 * 16], m3
-
-; mode17 [row 1]
-movu m6, [r5 + 12 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 0], 1
-pinsrb m0, [r3 + 1], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 962 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 15], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 23], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 963 * 16], m3
-
-; mode17 [row 2]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 1], 1
-pinsrb m0, [r3 + 2], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 964 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 14], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 22], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 965 * 16], m3
-
-; mode17 [row 3]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 2], 1
-pinsrb m0, [r3 + 4], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 966 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 13], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 21], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 967 * 16], m3
-
-; mode17 [row 4]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 4], 1
-pinsrb m0, [r3 + 5], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 968 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 12], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 20], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 969 * 16], m3
-
-; mode17 [row 5]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 970 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 971 * 16], m3
-
-; mode17 [row 6]
-movu m6, [r5 + 10 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 5], 1
-pinsrb m0, [r3 + 6], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 972 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 11], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 19], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 973 * 16], m3
-
-; mode17 [row 7]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 6], 1
-pinsrb m0, [r3 + 7], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 974 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 10], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 18], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 975 * 16], m3
-
-; mode17 [row 8]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 7], 1
-pinsrb m0, [r3 + 9], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 976 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 9], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 17], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 977 * 16], m3
-
-; mode17 [row 9]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 9], 1
-pinsrb m0, [r3 + 10], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrw m2, [r4 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 978 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 8], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 16], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 979 * 16], m3
-
-; mode17 [row 10]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 980 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 981 * 16], m3
-
-; mode17 [row 11]
-movu m6, [r5 + 8 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 10], 1
-pinsrb m0, [r3 + 11], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 0], 1
-pinsrb m2, [r3 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 982 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 7], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 15], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 983 * 16], m3
-
-; mode17 [row 12]
-movu m6, [r5 + 14 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 11], 1
-pinsrb m0, [r3 + 12], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 1], 1
-pinsrb m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 984 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 6], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 14], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 985 * 16], m3
-
-; mode17 [row 13]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 12], 1
-pinsrb m0, [r3 + 14], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 2], 1
-pinsrb m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 986 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 5], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 13], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 987 * 16], m3
-
-; mode17 [row 14]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 14], 1
-pinsrb m0, [r3 + 15], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 4], 1
-pinsrb m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 988 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 4], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 12], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 989 * 16], m3
-
-; mode17 [row 15]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 990 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 990 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 991 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 991 * 16 + 8], m5
-
-; mode17 [row 16]
-movu m6, [r5 + 6 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 15], 1
-pinsrb m0, [r3 + 16], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 5], 1
-pinsrb m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 992 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 3], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 11], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 993 * 16], m3
-
-; mode17 [row 17]
-movu m6, [r5 + 12 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 16], 1
-pinsrb m0, [r3 + 17], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 6], 1
-pinsrb m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 994 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 2], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 10], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 995 * 16], m3
-
-; mode17 [row 18]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 17], 1
-pinsrb m0, [r3 + 18], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 7], 1
-pinsrb m2, [r3 + 9], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 996 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 1], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 9], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 997 * 16], m3
-
-; mode17 [row 19]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 18], 1
-pinsrb m0, [r3 + 20], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 9], 1
-pinsrb m2, [r3 + 10], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 998 * 16], m3
-
-pslldq m1, 2
-pinsrw m1, [r4 + 0], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 8], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 999 * 16], m3
-
-; mode17 [row 20]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 20], 1
-pinsrb m0, [r3 + 21], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 10], 1
-pinsrb m2, [r3 + 11], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1000 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r4 + 0], 1
-pinsrb m1, [r3 + 1], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-;pinsrb m4, [r4 + 8], 1
-;pinsrb m4, [r4 + 7], 0
-pinsrw m4, [r4 + 7], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1001 * 16], m3
-
-; mode17 [row 21]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1002 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1003 * 16], m3
-
-; mode17 [row 22]
-movu m6, [r5 + 10 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 21], 1
-pinsrb m0, [r3 + 22], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 11], 1
-pinsrb m2, [r3 + 12], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1004 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 1], 1
-pinsrb m1, [r3 + 2], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 6], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1005 * 16], m3
-
-; mode17 [row 23]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 22], 1
-pinsrb m0, [r3 + 23], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 12], 1
-pinsrb m2, [r3 + 14], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1006 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 2], 1
-pinsrb m1, [r3 + 4], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 5], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1007 * 16], m3
-
-; mode17 [row 24]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 23], 1
-pinsrb m0, [r3 + 25], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 14], 1
-pinsrb m2, [r3 + 15], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1008 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 4], 1
-pinsrb m1, [r3 + 5], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 4], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1009 * 16], m3
-
-; mode17 [row 25]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 25], 1
-pinsrb m0, [r3 + 26], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 15], 1
-pinsrb m2, [r3 + 16], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1010 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 5], 1
-pinsrb m1, [r3 + 6], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 3], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1011 * 16], m3
-
-; mode17 [row 26]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1012 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1013 * 16], m3
-
-; mode17 [row 27]
-movu m6, [r5 + 8 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 26], 1
-pinsrb m0, [r3 + 27], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 16], 1
-pinsrb m2, [r3 + 17], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1014 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 6], 1
-pinsrb m1, [r3 + 7], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 2], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1015 * 16], m3
-
-; mode17 [row 28]
-movu m6, [r5 + 14 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 27], 1
-pinsrb m0, [r3 + 28], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 17], 1
-pinsrb m2, [r3 + 18], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1016 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 7], 1
-pinsrb m1, [r3 + 9], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 1], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1017 * 16], m3
-
-; mode17 [row 29]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 28], 1
-pinsrb m0, [r3 + 30], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 18], 1
-pinsrb m2, [r3 + 20], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1018 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 9], 1
-pinsrb m1, [r3 + 10], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrw m4, [r4 + 0], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1019 * 16], m3
-
-; mode17 [row 30]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r3 + 30], 1
-pinsrb m0, [r3 + 31], 0
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 20], 1
-pinsrb m2, [r3 + 21], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1020 * 16], m3
-
-pslldq m1, 2
-pinsrb m1, [r3 + 10], 1
-pinsrb m1, [r3 + 11], 0
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pslldq m4, 2
-pinsrb m4, [r4 + 0], 1
-pinsrb m4, [r3 + 1], 0
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1021 * 16], m3
-
-; mode17 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1022 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1022 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1023 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1023 * 16 + 8], m5
-
-;mode 18[row 0]
-movu m0, [r3]
-movu [r0 + 1024 * 16], m0
-movu m1, [r3 + 16]
-movu [r0 + 1025 * 16], m1
-
-;mode 18[row 1]
-pslldq m0, 1
-pinsrb m0, [r4 + 1], 0
-movu [r0 + 1026 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 15], 0
-movu [r0 + 1027 * 16], m1
-
-;mode 18[row 2]
-pslldq m0, 1
-pinsrb m0, [r4 + 2], 0
-movu [r0 + 1028 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 14], 0
-movu [r0 + 1029 * 16], m1
-
-;mode 18[row 3]
-pslldq m0, 1
-pinsrb m0, [r4 + 3], 0
-movu [r0 + 1030 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 13], 0
-movu [r0 + 1031 * 16], m1
-
-;mode 18[row 4]
-pslldq m0, 1
-pinsrb m0, [r4 + 4], 0
-movu [r0 + 1032 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 12], 0
-movu [r0 + 1033 * 16], m1
-
-;mode 18[row 5]
-pslldq m0, 1
-pinsrb m0, [r4 + 5], 0
-movu [r0 + 1034 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 11], 0
-movu [r0 + 1035 * 16], m1
-
-;mode 18[row 6]
-pslldq m0, 1
-pinsrb m0, [r4 + 6], 0
-movu [r0 + 1036 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 10], 0
-movu [r0 + 1037 * 16], m1
-
-;mode 18[row 7]
-pslldq m0, 1
-pinsrb m0, [r4 + 7], 0
-movu [r0 + 1038 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 9], 0
-movu [r0 + 1039 * 16], m1
-
-;mode 18[row 8]
-pslldq m0, 1
-pinsrb m0, [r4 + 8], 0
-movu [r0 + 1040 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 8], 0
-movu [r0 + 1041 * 16], m1
-
-;mode 18[row 9]
-pslldq m0, 1
-pinsrb m0, [r4 + 9], 0
-movu [r0 + 1042 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 7], 0
-movu [r0 + 1043 * 16], m1
-
-;mode 18[row 10]
-pslldq m0, 1
-pinsrb m0, [r4 + 10], 0
-movu [r0 + 1044 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 6], 0
-movu [r0 + 1045 * 16], m1
-
-;mode 18[row 11]
-pslldq m0, 1
-pinsrb m0, [r4 + 11], 0
-movu [r0 + 1046 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 5], 0
-movu [r0 + 1047 * 16], m1
-
-;mode 18[row 12]
-pslldq m0, 1
-pinsrb m0, [r4 + 12], 0
-movu [r0 + 1048 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 4], 0
-movu [r0 + 1049 * 16], m1
-
-;mode 18[row 13]
-pslldq m0, 1
-pinsrb m0, [r4 + 13], 0
-movu [r0 + 1050 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 3], 0
-movu [r0 + 1051 * 16], m1
-
-;mode 18[row 14]
-pslldq m0, 1
-pinsrb m0, [r4 + 14], 0
-movu [r0 + 1052 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 2], 0
-movu [r0 + 1053 * 16], m1
-
-;mode 18[row 15]
-pslldq m0, 1
-pinsrb m0, [r4 + 15], 0
-movu [r0 + 1054 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 1], 0
-movu [r0 + 1055 * 16], m1
-
-;mode 18[row 16]
-pslldq m0, 1
-pinsrb m0, [r4 + 16], 0
-movu [r0 + 1056 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r3 + 0], 0
-movu [r0 + 1057 * 16], m1
-
-;mode 18[row 17]
-pslldq m0, 1
-pinsrb m0, [r4 + 17], 0
-movu [r0 + 1058 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 1], 0
-movu [r0 + 1059 * 16], m1
-
-;mode 18[row 18]
-pslldq m0, 1
-pinsrb m0, [r4 + 18], 0
-movu [r0 + 1060 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 2], 0
-movu [r0 + 1061 * 16], m1
-
-;mode 18[row 19]
-pslldq m0, 1
-pinsrb m0, [r4 + 19], 0
-movu [r0 + 1062 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 3], 0
-movu [r0 + 1063 * 16], m1
-
-;mode 18[row 20]
-pslldq m0, 1
-pinsrb m0, [r4 + 20], 0
-movu [r0 + 1064 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 4], 0
-movu [r0 + 1065 * 16], m1
-
-;mode 18[row 21]
-pslldq m0, 1
-pinsrb m0, [r4 + 21], 0
-movu [r0 + 1066 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 5], 0
-movu [r0 + 1067 * 16], m1
-
-;mode 18[row 22]
-pslldq m0, 1
-pinsrb m0, [r4 + 22], 0
-movu [r0 + 1068 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 6], 0
-movu [r0 + 1069 * 16], m1
-
-;mode 18[row 23]
-pslldq m0, 1
-pinsrb m0, [r4 + 23], 0
-movu [r0 + 1070 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 7], 0
-movu [r0 + 1071 * 16], m1
-
-;mode 18[row 24]
-pslldq m0, 1
-pinsrb m0, [r4 + 24], 0
-movu [r0 + 1072 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 8], 0
-movu [r0 + 1073 * 16], m1
-
-;mode 18[row 25]
-pslldq m0, 1
-pinsrb m0, [r4 + 25], 0
-movu [r0 + 1074 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 9], 0
-movu [r0 + 1075 * 16], m1
-
-;mode 18[row 26]
-pslldq m0, 1
-pinsrb m0, [r4 + 26], 0
-movu [r0 + 1076 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 10], 0
-movu [r0 + 1077 * 16], m1
-
-;mode 18[row 27]
-pslldq m0, 1
-pinsrb m0, [r4 + 27], 0
-movu [r0 + 1078 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 11], 0
-movu [r0 + 1079 * 16], m1
-
-;mode 18[row 28]
-pslldq m0, 1
-pinsrb m0, [r4 + 28], 0
-movu [r0 + 1080 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 12], 0
-movu [r0 + 1081 * 16], m1
-
-;mode 18[row 29]
-pslldq m0, 1
-pinsrb m0, [r4 + 29], 0
-movu [r0 + 1082 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 13], 0
-movu [r0 + 1083 * 16], m1
-
-;mode 18[row 30]
-pslldq m0, 1
-pinsrb m0, [r4 + 30], 0
-movu [r0 + 1084 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 14], 0
-movu [r0 + 1085 * 16], m1
-
-;mode 18[row 31]
-pslldq m0, 1
-pinsrb m0, [r4 + 31], 0
-movu [r0 + 1086 * 16], m0
-pslldq m1, 1
-pinsrb m1, [r4 + 15], 0
-movu [r0 + 1087 * 16], m1
-
-; mode 19 [row 0]
-movu m6, [r5 + 6 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 1088 * 16], m1
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1089 * 16], m4
-
-; mode 19 [row 1]
-movu m6, [r5 + 12 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 1], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1090 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1091 * 16], m4
-
-; mode 19 [row 2]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 1], 1
-pinsrb m0, [r4 + 2], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1092 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 14], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 22], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1093 * 16], m4
-
-; mode 19 [row 3]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 2], 1
-pinsrb m0, [r4 + 4], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1094 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 13], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 21], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1095 * 16], m4
-
-; mode 19 [row 4]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 4], 1
-pinsrb m0, [r4 + 5], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1096 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 12], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 20], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1097 * 16], m4
-
-; mode 19 [row 5]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1098 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1099 * 16], m4
-
-; mode 19 [row 6]
-movu m6, [r5 + 10 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 5], 1
-pinsrb m0, [r4 + 6], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1100 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 11], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 19], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1101 * 16], m4
-
-; mode 19 [row 7]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 6], 1
-pinsrb m0, [r4 + 7], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1102 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 10], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 18], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1103 * 16], m4
-
-; mode 19 [row 8]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 7], 1
-pinsrb m0, [r4 + 9], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1104 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 9], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 17], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1105 * 16], m4
-
-; mode 19 [row 9]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 9], 1
-pinsrb m0, [r4 + 10], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1106 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 8], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 16], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1107 * 16], m4
-
-; mode 19 [row 10]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1108 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1109 * 16], m4
-
-; mode 19 [row 11]
-movu m6, [r5 + 8 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 10], 1
-pinsrb m0, [r4 + 11], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 0], 1
-pinsrb m2, [r4 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1110 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 7], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 15], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1111 * 16], m4
-
-; mode 19 [row 12]
-movu m6, [r5 + 14 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 11], 1
-pinsrb m0, [r4 + 12], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 1], 1
-pinsrb m2, [r4 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1112 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 6], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 14], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1113 * 16], m4
-
-; mode 19 [row 13]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 12], 1
-pinsrb m0, [r4 + 14], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 2], 1
-pinsrb m2, [r4 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1114 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 5], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 13], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1115 * 16], m4
-
-; mode 19 [row 14]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 14], 1
-pinsrb m0, [r4 + 15], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 4], 1
-pinsrb m2, [r4 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1116 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 4], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 12], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1117 * 16], m4
-
-; mode19 [row 15]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1118 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1118 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1119 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1119 * 16 + 8], m5
-
-; mode 19 [row 16]
-movu m6, [r5 + 6 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 15], 1
-pinsrb m0, [r4 + 16], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 5], 1
-pinsrb m2, [r4 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1120 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 3], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 11], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1121 * 16], m4
-
-; mode 19 [row 17]
-movu m6, [r5 + 12 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 16], 1
-pinsrb m0, [r4 + 17], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 6], 1
-pinsrb m2, [r4 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1122 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 2], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 10], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1123 * 16], m4
-
-; mode 19 [row 18]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 17], 1
-pinsrb m0, [r4 + 18], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 7], 1
-pinsrb m2, [r4 + 9], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1124 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 1], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 9], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1125 * 16], m4
-
-; mode 19 [row 19]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 18], 1
-pinsrb m0, [r4 + 20], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 9], 1
-pinsrb m2, [r4 + 10], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1126 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 0], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 8], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1127 * 16], m4
-
-; mode 19 [row 20]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 20], 1
-pinsrb m0, [r4 + 21], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 10], 1
-pinsrb m2, [r4 + 11], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1128 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 0], 1
-pinsrb m1, [r4 + 1], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrb m3, [r3 + 8], 1
-pinsrb m3, [r3 + 7], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1129 * 16], m4
-
-; mode 19 [row 21]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1130 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1131 * 16], m4
-
-; mode 19 [row 22]
-movu m6, [r5 + 10 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 21], 1
-pinsrb m0, [r4 + 22], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 11], 1
-pinsrb m2, [r4 + 12], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1132 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 1], 1
-pinsrb m1, [r4 + 2], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 6], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1133 * 16], m4
-
-; mode 19 [row 23]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 22], 1
-pinsrb m0, [r4 + 23], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 12], 1
-pinsrb m2, [r4 + 14], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1134 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 2], 1
-pinsrb m1, [r4 + 4], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 5], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1135 * 16], m4
-
-; mode 19 [row 24]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 23], 1
-pinsrb m0, [r4 + 25], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 14], 1
-pinsrb m2, [r4 + 15], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1136 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 4], 1
-pinsrb m1, [r4 + 5], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 4], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1137 * 16], m4
-
-; mode 19 [row 25]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 25], 1
-pinsrb m0, [r4 + 26], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 15], 1
-pinsrb m2, [r4 + 16], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1138 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 5], 1
-pinsrb m1, [r4 + 6], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 3], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1139 * 16], m4
-
-; mode 19 [row 26]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1140 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1141 * 16], m4
-
-; mode 19 [row 27]
-movu m6, [r5 + 8 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 26], 1
-pinsrb m0, [r4 + 27], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 16], 1
-pinsrb m2, [r4 + 17], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1142 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 6], 1
-pinsrb m1, [r4 + 7], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 2], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1143 * 16], m4
-
-; mode 19 [row 28]
-movu m6, [r5 + 14 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 27], 1
-pinsrb m0, [r4 + 28], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 17], 1
-pinsrb m2, [r4 + 18], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1144 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 7], 1
-pinsrb m1, [r4 + 9], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 1], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1145 * 16], m4
-
-; mode 19 [row 29]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 28], 1
-pinsrb m0, [r4 + 30], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 18], 1
-pinsrb m2, [r4 + 20], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1146 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 9], 1
-pinsrb m1, [r4 + 10], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 0], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1147 * 16], m4
-
-; mode 19 [row 30]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 30], 1
-pinsrb m0, [r4 + 31], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 20], 1
-pinsrb m2, [r4 + 21], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1148 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 10], 1
-pinsrb m1, [r4 + 11], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrb m3, [r4 + 0], 1
-pinsrb m3, [r4 + 1], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1149 * 16], m4
-
-; mode19 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1150 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1150 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1151 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1151 * 16 + 8], m5
-
-; mode 20 [row 0]
-movu m6, [r5 + 11 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 1152 * 16], m1
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1153 * 16], m4
-
-; mode 20 [row 1]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 2], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1154 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1155 * 16], m4
-
-; mode 20 [row 2]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1156 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1157 * 16], m4
-
-; mode 20 [row 3]
-movu m6, [r5 + 12 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 2], 1
-pinsrb m0, [r4 + 3], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1158 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 14], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 22], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1159 * 16], m4
-
-; mode 20 [row 4]
-movu m6, [r5 + 23 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 3], 1
-pinsrb m0, [r4 + 5], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1160 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 13], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 21], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1161 * 16], m4
-
-; mode 20 [row 5]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1162 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1163 * 16], m4
-
-; mode 20 [row 6]
-movu m6, [r5 + 13 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 5], 1
-pinsrb m0, [r4 + 6], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1164 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 12], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 20], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1165 * 16], m4
-
-; mode 20 [row 7]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 6], 1
-pinsrb m0, [r4 + 8], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1166 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 11], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 19], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1167 * 16], m4
-
-; mode 20 [row 8]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1168 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1169 * 16], m4
-
-; mode 20 [row 9]
-movu m6, [r5 + 14 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 8], 1
-pinsrb m0, [r4 + 9], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 3], 1
-pinsrb m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1170 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 10], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 18], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1171 * 16], m4
-
-; mode 20 [row 10]
-movu m6, [r5 + 25 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 9], 1
-pinsrb m0, [r4 + 11], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1172 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 9], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 17], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1173 * 16], m4
-
-; mode 20 [row 11]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1174 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1175 * 16], m4
-
-; mode 20 [row 12]
-movu m6, [r5 + 15 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 11], 1
-pinsrb m0, [r4 + 12], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r3 + 1], 1
-pinsrb m2, [r3 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1176 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 8], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 16], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1177 * 16], m4
-
-; mode 20 [row 13]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 12], 1
-pinsrb m0, [r4 + 14], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 0], 1
-pinsrb m2, [r4 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1178 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 7], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 15], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1179 * 16], m4
-
-; mode 20 [row 14]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1180 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1181 * 16], m4
-
-; mode 20 [row 15]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 14], 1
-pinsrb m0, [r4 + 15], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 2], 1
-pinsrb m2, [r4 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1182 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 6], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 14], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1183 * 16], m4
-
-; mode 20 [row 16]
-movu m6, [r5 + 27 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 15], 1
-pinsrb m0, [r4 + 17], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 3], 1
-pinsrb m2, [r4 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1184 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 5], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 13], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1185 * 16], m4
-
-; mode 20 [row 17]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1186 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1187 * 16], m4
-
-; mode 20 [row 18]
-movu m6, [r5 + 17 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 17], 1
-pinsrb m0, [r4 + 18], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 5], 1
-pinsrb m2, [r4 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1188 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 4], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 12], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1189 * 16], m4
-
-; mode 20 [row 19]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 18], 1
-pinsrb m0, [r4 + 20], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 6], 1
-pinsrb m2, [r4 + 8], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1190 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 3], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 11], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1191 * 16], m4
-
-; mode 20 [row 20]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1192 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1193 * 16], m4
-
-; mode 20 [row 21]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 20], 1
-pinsrb m0, [r4 + 21], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 8], 1
-pinsrb m2, [r4 + 9], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1194 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 2], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 10], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1195 * 16], m4
-
-; mode 20 [row 22]
-movu m6, [r5 + 29 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 21], 1
-pinsrb m0, [r4 + 23], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 9], 1
-pinsrb m2, [r4 + 11], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1196 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 1], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 9], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1197 * 16], m4
-
-; mode 20 [row 23]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1198 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1199 * 16], m4
-
-; mode 20 [row 24]
-movu m6, [r5 + 19 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 23], 1
-pinsrb m0, [r4 + 24], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 11], 1
-pinsrb m2, [r4 + 12], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1200 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 0], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 8], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1201 * 16], m4
-
-; mode 20 [row 25]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 24], 1
-pinsrb m0, [r4 + 26], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 12], 1
-pinsrb m2, [r4 + 14], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1202 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 0], 1
-pinsrb m1, [r4 + 2], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 7], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1203 * 16], m4
-
-; mode 20 [row 26]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1204 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1205 * 16], m4
-
-; mode 20 [row 27]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 26], 1
-pinsrb m0, [r4 + 27], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 14], 1
-pinsrb m2, [r4 + 15], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1206 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 2], 1
-pinsrb m1, [r4 + 3], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 6], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1207 * 16], m4
-
-; mode 20 [row 28]
-movu m6, [r5 + 31 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 27], 1
-pinsrb m0, [r4 + 29], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 15], 1
-pinsrb m2, [r4 + 17], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1208 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 3], 1
-pinsrb m1, [r4 + 5], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 5], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1209 * 16], m4
-
-; mode 20 [row 29]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1210 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1211 * 16], m4
-
-; mode 20 [row 30]
-movu m6, [r5 + 21 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 29], 1
-pinsrb m0, [r4 + 30], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 17], 1
-pinsrb m2, [r4 + 18], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1212 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r4 + 5], 1
-pinsrb m1, [r4 + 6], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 4], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1213 * 16], m4
-
-; mode20 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1214 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1214 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1215 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1215 * 16 + 8], m5
-
-; mode 21 [row 0]
-movu m6, [r5 + 15 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 1216 * 16], m1
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1217 * 16], m4
-
-; mode 21 [row 1]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 2], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1218 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1219 * 16], m4
-
-; mode 21 [row 2]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1220 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1221 * 16], m4
-
-; mode 21 [row 3]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 2], 1
-pinsrb m0, [r4 + 4], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1222 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 14], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 22], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1223 * 16], m4
-
-; mode 21 [row 4]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1224 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1225 * 16], m4
-
-; mode 21 [row 5]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 4], 1
-pinsrb m0, [r4 + 6], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1226 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 13], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 21], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1227 * 16], m4
-
-; mode 21 [row 6]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1228 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1229 * 16], m4
-
-; mode 21 [row 7]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 6], 1
-pinsrb m0, [r4 + 8], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1230 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 12], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 20], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1231 * 16], m4
-
-; mode 21 [row 8]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1232 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1233 * 16], m4
-
-; mode 21 [row 9]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 8], 1
-pinsrb m0, [r4 + 9], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1234 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 11], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 19], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1235 * 16], m4
-
-; mode 21 [row 10]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1236 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1237 * 16], m4
-
-; mode 21 [row 11]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 9], 1
-pinsrb m0, [r4 + 11], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1238 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 10], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 18], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1239 * 16], m4
-
-; mode 21 [row 12]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1240 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1241 * 16], m4
-
-; mode 21 [row 13]
-movu m6, [r5 + 18 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 11], 1
-pinsrb m0, [r4 + 13], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1242 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 9], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 17], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1243 * 16], m4
-
-; mode 21 [row 14]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1244 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1245 * 16], m4
-
-; mode 21 [row 15]
-movu m6, [r5 + 16 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 13], 1
-pinsrb m0, [r4 + 15], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1246 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 8], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 16], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1247 * 16], m4
-
-; mode 21 [row 16]
-movu m6, [r5 + 31 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 15], 1
-pinsrb m0, [r4 + 17], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 0], 1
-pinsrb m2, [r4 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1248 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 7], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 15], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1249 * 16], m4
-
-; mode 21 [row 17]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1250 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1251 * 16], m4
-
-; mode 21 [row 18]
-movu m6, [r5 + 29 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 17], 1
-pinsrb m0, [r4 + 19], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 2], 1
-pinsrb m2, [r4 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1252 * 16], m4
-pslldq m1, 2
-pinsrb m1, [r3 + 7], 1
-pinsrb m1, [r3 + 6], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrb m3, [r3 + 15], 1
-pinsrb m3, [r3 + 14], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1253 * 16], m4
-
-; mode 21 [row 19]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1254 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1255 * 16], m4
-
-; mode 21 [row 20]
-movu m6, [r5 + 27 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 19], 1
-pinsrb m0, [r4 + 21], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 4], 1
-pinsrb m2, [r4 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1256 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 5], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 13], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1257 * 16], m4
-
-; mode 21 [row 21]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1258 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1259 * 16], m4
-
-; mode 21 [row 22]
-movu m6, [r5 + 25 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 21], 1
-pinsrb m0, [r4 + 23], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 6], 1
-pinsrb m2, [r4 + 8], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1260 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 4], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 12], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1261 * 16], m4
-
-; mode 21 [row 23]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1262 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1263 * 16], m4
-
-; mode 21 [row 24]
-movu m6, [r5 + 23 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 23], 1
-pinsrb m0, [r4 + 24], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 8], 1
-pinsrb m2, [r4 + 9], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1264 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 3], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 11], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1265 * 16], m4
-
-; mode 21 [row 25]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1266 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1267 * 16], m4
-
-; mode 21 [row 26]
-movu m6, [r5 + 21 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 24], 1
-pinsrb m0, [r4 + 26], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 9], 1
-pinsrb m2, [r4 + 11], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1268 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 2], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 10], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1269 * 16], m4
-
-; mode 21 [row 27]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1270 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1271 * 16], m4
-
-; mode 21 [row 28]
-movu m6, [r5 + 19 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 26], 1
-pinsrb m0, [r4 + 28], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 11], 1
-pinsrb m2, [r4 + 13], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1272 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 1], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 9], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1273 * 16], m4
-
-; mode 21 [row 29]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1274 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1275 * 16], m4
-
-; mode 21 [row 30]
-movu m6, [r5 + 17 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 28], 1
-pinsrb m0, [r4 + 30], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 13], 1
-pinsrb m2, [r4 + 15], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1276 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 0], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 8], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1277 * 16], m4
-
-; mode21 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1278 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1278 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1279 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1279 * 16 + 8], m5
-
-; mode 22 [row 0]
-movu m6, [r5 + 19 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 1280 * 16], m1
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1281 * 16], m4
-
-; mode 22 [row 1]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1282 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1283 * 16], m4
-
-; mode 22 [row 2]
-movu m6, [r5 + 25 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 2], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1284 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1285 * 16], m4
-
-; mode 22 [row 3]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1286 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1287 * 16], m4
-
-; mode 22 [row 4]
-movu m6, [r5 + 31 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 2], 1
-pinsrb m0, [r4 + 5], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1288 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 14], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 22], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1289 * 16], m4
-
-; mode 22 [row 5]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1290 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1291 * 16], m4
-
-; mode 22 [row 6]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1292 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1293 * 16], m4
-
-; mode 22 [row 7]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 5], 1
-pinsrb m0, [r4 + 7], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1294 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 13], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 21], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1295 * 16], m4
-
-; mode 22 [row 8]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1296 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1297 * 16], m4
-
-; mode 22 [row 9]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 7], 1
-pinsrb m0, [r4 + 10], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1298 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 12], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 20], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1299 * 16], m4
-
-; mode 22 [row 10]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1300 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1301 * 16], m4
-
-; mode 22 [row 11]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1302 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1303 * 16], m4
-
-; mode 22 [row 12]
-movu m6, [r5 + 23 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 10], 1
-pinsrb m0, [r4 + 12], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1304 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 11], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 19], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1305 * 16], m4
-
-; mode 22 [row 13]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1306 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1307 * 16], m4
-
-; mode 22 [row 14]
-movu m6, [r5 + 29 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 12], 1
-pinsrb m0, [r4 + 15], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1308 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 10], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 18], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1309 * 16], m4
-
-; mode 22 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1310 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1311 * 16], m4
-
-; mode 22 [row 16]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1312 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1313 * 16], m4
-
-; mode 22 [row 17]
-movu m6, [r5 + 22 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 15], 1
-pinsrb m0, [r4 + 17], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1314 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 9], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 17], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1315 * 16], m4
-
-; mode 22 [row 18]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1316 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1317 * 16], m4
-
-; mode 22 [row 19]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 17], 1
-pinsrb m0, [r4 + 20], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1318 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 8], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 16], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1319 * 16], m4
-
-; mode 22 [row 20]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1320 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1321 * 16], m4
-
-; mode 22 [row 21]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1322 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1323 * 16], m4
-
-; mode 22 [row 22]
-movu m6, [r5 + 21 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 20], 1
-pinsrb m0, [r4 + 22], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 0], 1
-pinsrb m2, [r4 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1324 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 7], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 15], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1325 * 16], m4
-
-; mode 22 [row 23]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1326 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1327 * 16], m4
-
-; mode 22 [row 24]
-movu m6, [r5 + 27 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 22], 1
-pinsrb m0, [r4 + 25], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 2], 1
-pinsrb m2, [r4 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1328 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 6], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 14], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1329 * 16], m4
-
-; mode 22 [row 25]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1330 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1331 * 16], m4
-
-; mode 22 [row 26]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1332 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1333 * 16], m4
-
-; mode 22 [row 27]
-movu m6, [r5 + 20 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 25], 1
-pinsrb m0, [r4 + 27], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 5], 1
-pinsrb m2, [r4 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1334 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 5], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 13], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1335 * 16], m4
-
-; mode 22 [row 28]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1336 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1337 * 16], m4
-
-; mode 22 [row 29]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 27], 1
-pinsrb m0, [r4 + 30], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrb m2, [r4 + 7], 1
-pinsrb m2, [r4 + 10], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1338 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 4], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 12], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1339 * 16], m4
-
-; mode 22 [row 30]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1340 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1341 * 16], m4
-
-; mode22 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1342 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1342 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1343 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1343 * 16 + 8], m5
-
-; mode 23 [row 0]
-movu m6, [r5 + 23 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m1, m0, m6
-pmulhrsw m1, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m3, m2, m6
-pmulhrsw m3, m7
-packuswb m1, m3
-movu [r0 + 1344 * 16], m1
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1345 * 16], m4
-
-; mode 23 [row 1]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1346 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1347 * 16], m4
-
-; mode 23 [row 2]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1348 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1349 * 16], m4
-
-; mode 23 [row 3]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 4], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1350 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1351 * 16], m4
-
-; mode 23 [row 4]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1352 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1353 * 16], m4
-
-; mode 23 [row 5]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1354 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1355 * 16], m4
-
-; mode 23 [row 6]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1356 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1357 * 16], m4
-
-; mode 23 [row 7]
-movu m6, [r5 + 24 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 4], 1
-pinsrb m0, [r4 + 7], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1358 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 14], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 22], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1359 * 16], m4
-
-; mode 23 [row 8]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1360 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1361 * 16], m4
-
-; mode 23 [row 9]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1362 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1363 * 16], m4
-
-; mode 23 [row 10]
-movu m6, [r5 + 29 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 7], 1
-pinsrb m0, [r4 + 11], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1364 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 13], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 21], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1365 * 16], m4
-
-; mode 23 [row 11]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1366 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1367 * 16], m4
-
-; mode 23 [row 12]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1368 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1369 * 16], m4
-
-; mode 23 [row 13]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1370 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1371 * 16], m4
-
-; mode 23 [row 14]
-movu m6, [r5 + 25 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 11], 1
-pinsrb m0, [r4 + 14], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1372 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 12], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 20], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1373 * 16], m4
-
-; mode 23 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1374 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1375 * 16], m4
-
-; mode 23 [row 16]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1376 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1377 * 16], m4
-
-; mode 23 [row 17]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 14], 1
-pinsrb m0, [r4 + 18], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 3], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1378 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 11], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 19], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1379 * 16], m4
-
-; mode 23 [row 18]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1380 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1381 * 16], m4
-
-; mode 23 [row 19]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1382 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1383 * 16], m4
-
-; mode 23 [row 20]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1384 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1385 * 16], m4
-
-; mode 23 [row 21]
-movu m6, [r5 + 26 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 18], 1
-pinsrb m0, [r4 + 21], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 2], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1386 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 10], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 18], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1387 * 16], m4
-
-; mode 23 [row 22]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1388 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1389 * 16], m4
-
-; mode 23 [row 23]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1390 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1391 * 16], m4
-
-; mode 23 [row 24]
-movu m6, [r5 + 31 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 21], 1
-pinsrb m0, [r4 + 25], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 1], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1392 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 9], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 17], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1393 * 16], m4
-
-; mode 23 [row 25]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1394 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1395 * 16], m4
-
-; mode 23 [row 26]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1396 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1397 * 16], m4
-
-; mode 23 [row 27]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1398 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1399 * 16], m4
-
-; mode 23 [row 28]
-movu m6, [r5 + 27 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 25], 1
-pinsrb m0, [r4 + 28], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 0], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1400 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 8], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 16], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1401 * 16], m4
-
-; mode 23 [row 29]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1402 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1403 * 16], m4
-
-; mode 23 [row 30]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1404 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1405 * 16], m4
-
-; mode 23 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1406 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1406 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1407 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1407 * 16 + 8], m5
-
-; mode 24 [row 0]
-movu m6, [r5 + 27 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1408 * 16], m4
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1409 * 16], m4
-
-; mode 24 [row 1]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1410 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1411 * 16], m4
-
-; mode 24 [row 2]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1412 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1413 * 16], m4
-
-; mode 24 [row 3]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1414 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1415 * 16], m4
-
-; mode 24 [row 4]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1416 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1417 * 16], m4
-
-; mode 24 [row 5]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1418 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1419 * 16], m4
-
-; mode 24 [row 6]
-movu m6, [r5 + 29 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 6], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1420 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1421 * 16], m4
-
-; mode 24 [row 7]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1422 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1423 * 16], m4
-
-; mode 24 [row 8]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1424 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1425 * 16], m4
-
-; mode 24 [row 9]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1426 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1427 * 16], m4
-
-; mode 24 [row 10]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1428 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1429 * 16], m4
-
-; mode 24 [row 11]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1430 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1431 * 16], m4
-
-; mode 24 [row 12]
-movu m6, [r5 + 31 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 6], 1
-pinsrb m0, [r4 + 13], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 6], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1432 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 14], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 22], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1433 * 16], m4
-
-; mode 24 [row 13]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1434 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1435 * 16], m4
-
-; mode 24 [row 14]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1436 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1437 * 16], m4
-
-; mode 24 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1438 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1439 * 16], m4
-
-; mode 24 [row 16]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1440 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1441 * 16], m4
-
-; mode 24 [row 17]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1442 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1443 * 16], m4
-
-; mode 24 [row 18]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1444 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1445 * 16], m4
-
-; mode 24 [row 19]
-movu m6, [r5 + 28 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 13], 1
-pinsrb m0, [r4 + 19], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 5], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1446 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 13], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 21], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1447 * 16], m4
-
-; mode 24 [row 20]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1448 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1449 * 16], m4
-
-; mode 24 [row 21]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1450 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1451 * 16], m4
-
-; mode 24 [row 22]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1452 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1453 * 16], m4
-
-; mode 24 [row 23]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1454 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1455 * 16], m4
-
-; mode 24 [row 24]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1456 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1457 * 16], m4
-
-; mode 24 [row 25]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 19], 1
-pinsrb m0, [r4 + 26], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 4], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1458 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 12], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 20], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1459 * 16], m4
-
-; mode 24 [row 26]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1460 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1461 * 16], m4
-
-; mode 24 [row 27]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1462 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1463 * 16], m4
-
-; mode 24 [row 28]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1464 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1465 * 16], m4
-
-; mode 24 [row 29]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1466 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1467 * 16], m4
-
-; mode 24 [row 30]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1468 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1469 * 16], m4
-
-; mode 24 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1470 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1470 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1471 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1471 * 16 + 8], m5
-
-; mode 25 [row 0]
-movu m6, [r5 + 30 * 16]
-movu m0, [r3 ]
-movu m1, [r3 + 1 ]
-punpcklbw m0, m1
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-movu m2, [r3 + 8]
-movu m3, [r3 + 9]
-punpcklbw m2, m3
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1472 * 16], m4
-
-movu m1, [r3 + 16]
-movu m3, [r3 + 17]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 24]
-movu m5, [r3 + 25]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1473 * 16], m4
-
-; mode 25 [row 1]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1474 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1475 * 16], m4
-
-; mode 25 [row 2]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1476 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1477 * 16], m4
-
-; mode 25 [row 3]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1478 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1479 * 16], m4
-
-; mode 25 [row 4]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1480 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1481 * 16], m4
-
-; mode 25 [row 5]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1482 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1483 * 16], m4
-
-; mode 25 [row 6]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1484 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1485 * 16], m4
-
-; mode 25 [row 7]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1486 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1487 * 16], m4
-
-; mode 25 [row 8]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1488 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1489 * 16], m4
-
-; mode 25 [row 9]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1490 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1491 * 16], m4
-
-; mode 25 [row 10]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1492 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1493 * 16], m4
-
-; mode 25 [row 11]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1494 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1495 * 16], m4
-
-; mode 25 [row 12]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1496 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1497 * 16], m4
-
-; mode 25 [row 13]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1498 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1499 * 16], m4
-
-; mode 25 [row 14]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1500 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1501 * 16], m4
-
-; mode 25 [row 15]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1502 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1502 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1503 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1503 * 16 + 8], m5
-
-; mode 25 [row 16]
-movu m6, [r5 + 30 * 16]
-pslldq m0, 2
-pinsrb m0, [r4 + 0], 1
-pinsrb m0, [r4 + 16], 0
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pslldq m2, 2
-pinsrw m2, [r3 + 7], 0
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1504 * 16], m4
-pslldq m1, 2
-pinsrw m1, [r3 + 15], 0
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pslldq m3, 2
-pinsrw m3, [r3 + 23], 0
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1505 * 16], m4
-
-; mode 25 [row 17]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1506 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1507 * 16], m4
-
-; mode 25 [row 18]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1508 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1509 * 16], m4
-
-; mode 25 [row 19]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1510 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1511 * 16], m4
-
-; mode 25 [row 20]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1512 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1513 * 16], m4
-
-; mode 25 [row 21]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1514 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1515 * 16], m4
-
-; mode 25 [row 22]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1516 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1517 * 16], m4
-
-; mode 25 [row 23]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1518 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1519 * 16], m4
-
-; mode 25 [row 24]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1520 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1521 * 16], m4
-
-; mode 25 [row 25]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1522 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1523 * 16], m4
-
-; mode 25 [row 26]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1524 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1525 * 16], m4
-
-; mode 25 [row 27]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1526 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1527 * 16], m4
-
-; mode 25 [row 28]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1528 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1529 * 16], m4
-
-; mode 25 [row 29]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1530 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1531 * 16], m4
-
-; mode 25 [row 30]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1532 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1533 * 16], m4
-
-; mode 25 [row 31]
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1534 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1534 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1535 * 16], m5
-pshufb m5, m3, [tab_S2]
-movh [r0 + 1535 * 16 + 8], m5
-
-; mode 26
-movu m1, [r1 + 1]
-movu m2, [r1 + 17]
-movu [r0 + 1536 * 16], m1
-movu [r0 + 1537 * 16], m2
-movu [r0 + 1538 * 16], m1
-movu [r0 + 1539 * 16], m2
-movu [r0 + 1540 * 16], m1
-movu [r0 + 1541 * 16], m2
-movu [r0 + 1542 * 16], m1
-movu [r0 + 1543 * 16], m2
-movu [r0 + 1544 * 16], m1
-movu [r0 + 1545 * 16], m2
-movu [r0 + 1546 * 16], m1
-movu [r0 + 1547 * 16], m2
-movu [r0 + 1548 * 16], m1
-movu [r0 + 1549 * 16], m2
-movu [r0 + 1550 * 16], m1
-movu [r0 + 1551 * 16], m2
-
-movu [r0 + 1552 * 16], m1
-movu [r0 + 1553 * 16], m2
-movu [r0 + 1554 * 16], m1
-movu [r0 + 1555 * 16], m2
-movu [r0 + 1556 * 16], m1
-movu [r0 + 1557 * 16], m2
-movu [r0 + 1558 * 16], m1
-movu [r0 + 1559 * 16], m2
-movu [r0 + 1560 * 16], m1
-movu [r0 + 1561 * 16], m2
-movu [r0 + 1562 * 16], m1
-movu [r0 + 1563 * 16], m2
-movu [r0 + 1564 * 16], m1
-movu [r0 + 1565 * 16], m2
-movu [r0 + 1566 * 16], m1
-movu [r0 + 1567 * 16], m2
-
-movu [r0 + 1568 * 16], m1
-movu [r0 + 1569 * 16], m2
-movu [r0 + 1570 * 16], m1
-movu [r0 + 1571 * 16], m2
-movu [r0 + 1572 * 16], m1
-movu [r0 + 1573 * 16], m2
-movu [r0 + 1574 * 16], m1
-movu [r0 + 1575 * 16], m2
-movu [r0 + 1576 * 16], m1
-movu [r0 + 1577 * 16], m2
-movu [r0 + 1578 * 16], m1
-movu [r0 + 1579 * 16], m2
-movu [r0 + 1580 * 16], m1
-movu [r0 + 1581 * 16], m2
-movu [r0 + 1582 * 16], m1
-movu [r0 + 1583 * 16], m2
-
-movu [r0 + 1584 * 16], m1
-movu [r0 + 1585 * 16], m2
-movu [r0 + 1586 * 16], m1
-movu [r0 + 1587 * 16], m2
-movu [r0 + 1588 * 16], m1
-movu [r0 + 1589 * 16], m2
-movu [r0 + 1590 * 16], m1
-movu [r0 + 1591 * 16], m2
-movu [r0 + 1592 * 16], m1
-movu [r0 + 1593 * 16], m2
-movu [r0 + 1594 * 16], m1
-movu [r0 + 1595 * 16], m2
-movu [r0 + 1596 * 16], m1
-movu [r0 + 1597 * 16], m2
-movu [r0 + 1598 * 16], m1
-movu [r0 + 1599 * 16], m2
-
-; mode 27 [row 0]
-movu m6, [r5 + 2 * 16]
-movu m0, [r3 + 1 ]
-movu m1, [r3 + 2 ]
-punpcklbw m0, m1
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-movu m2, [r3 + 9]
-movu m3, [r3 + 10]
-punpcklbw m2, m3
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1600 * 16], m4
-
-movu m1, [r3 + 17]
-movu m3, [r3 + 18]
-punpcklbw m1, m3
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-movu m3, [r3 + 25]
-movu m5, [r3 + 26]
-punpcklbw m3, m5
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1601 * 16], m4
-
-; mode 27 [row 1]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1602 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1603 * 16], m4
-
-; mode 27 [row 2]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1604 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1605 * 16], m4
-
-; mode 27 [row 3]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1606 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1607 * 16], m4
-
-; mode 27 [row 4]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1608 * 16], m4
-
-; mode 28 [row 1 -first half]
-movu [r0 + 1666 * 16], m4
-
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1609 * 16], m4
-
-; mode 28 [row 1 - second half]
-movu [r0 + 1667 * 16], m4
-
-; mode 27 [row 5]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1610 * 16], m4
-
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1611 * 16], m4
-
-; mode 27 [row 6]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1612 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1613 * 16], m4
-
-; mode 27 [row 7]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1614 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1615 * 16], m4
-
-; mode 27 [row 8]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1616 * 16], m4
-
-; mode 29 [row 1 - first half]
-movu [r0 + 1730 * 16], m4
-
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1617 * 16], m4
-
-; mode 29 [row 1 - second half]
-movu [r0 + 1731 * 16], m4
-
-; mode 27 [row 9]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1618 * 16], m4
-
-; mode 28 [row 3 -first half]
-movu [r0 + 1670 * 16], m4
-
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1619 * 16], m4
-
-; mode 28 [row 3 -second half]
-movu [r0 + 1671 * 16], m4
-
-; mode 27 [row 10]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1620 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1621 * 16], m4
-
-; mode 27 [row 11]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1622 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1623 * 16], m4
-
-; mode 27 [row 12]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1624 * 16], m4
-
-; mode 30 [row 1 - first half]
-movu [r0 + 1794 * 16], m4
-
-; mode 33 [row 0 - first half]
-movu [r0 + 1984 * 16], m4
-
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1625 * 16], m4
-
-; mode 30 [row 1 - second half]
-movu [r0 + 1795 * 16], m4
-
-; mode 33 [row 0 - second half]
-movu [r0 + 1985 * 16], m4
-
-; mode 27 [row 13]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1626 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1627 * 16], m4
-
-; mode 27 [row 14]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1628 * 16], m4
-
-; mode 28 [row 5 first half]
-movu [r0 + 1674 * 16], m4
-
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1629 * 16], m4
-
-; mode 28 [row 5 second half]
-movu [r0 + 1675 * 16], m4
-
-; mode 28 [row 0]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1664 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1665 * 16], m4
-
-; mode 28 [row 2]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1668 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1669 * 16], m4
-
-; mode 28 [row 4]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1672 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1673 * 16], m4
-
-; mode 30 [row 0]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1792 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1793 * 16], m4
-
-; mode 29 [row 0]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1728 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1729 * 16], m4
-
-; mode 29 [row 2]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1732 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1733 * 16], m4
-
-; mode 31 [row 0]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1856 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1857 * 16], m4
-
-; mode 32 [row 0]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m4, m0, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1920 * 16], m4
-pmaddubsw m4, m1, m6
-pmulhrsw m4, m7
-pmaddubsw m5, m3, m6
-pmulhrsw m5, m7
-packuswb m4, m5
-movu [r0 + 1921 * 16], m4
-
-; mode 27 [row 15]
-movu m0, [r3 + 2]
-movd m1, [r3 + 3]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 10]
-movd m3, [r3 + 11]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 18]
-movd m3, [r3 + 19]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 26]
-movd m5, [r3 + 27]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1630 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1630 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1631 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1631 * 16 + 8], m5
-
-; mode 27 [row 16]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1632 * 16], m3
-
-; mode 31 [row 1 - first half]
-movu [r0 + 1858 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1633 * 16], m3
-
-; mode 31 [row 1 - second half]
-movu [r0 + 1859 * 16], m3
-
-; mode 27 [row 17]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1634 * 16], m3
-
-; mode 29 [row 3 - first half]
-movu [r0 + 1734 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1635 * 16], m3
-
-; mode 29 [row 3 - second half]
-movu [r0 + 1735 * 16], m3
-
-; mode 27 [row 18]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1636 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1637 * 16], m3
-
-; mode 27 [row 19]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1638 * 16], m3
-
-; mode 28 [row 7 - first half]
-movu [r0 + 1678 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1639 * 16], m3
-
-; mode 28 [row 7 - second half]
-movu [r0 + 1679 * 16], m3
-
-; mode 27 [row 20]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1640 * 16], m3
-
-; mode 32 [row 1 - first half]
-movu [r0 + 1922 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1641 * 16], m3
-
-; mode 32 [row 1 - second half]
-movu [r0 + 1923 * 16], m3
-
-; mode 27 [row 21]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1642 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1643 * 16], m3
-
-; mode 27 [row 22]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1644 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1645 * 16], m3
-
-; mode 27 [row 23]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1646 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1647 * 16], m3
-
-; mode 27 [row 24]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1648 * 16], m3
-
-; mode 28 [row 9 - first half]
-movu [r0 + 1682 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1649 * 16], m3
-
-; mode 28 [row 9 - second half]
-movu [r0 + 1683 * 16], m3
-
-; mode 27 [row 25]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1650 * 16], m3
-
-; mode 30 [row 3 - first half]
-movu [r0 + 1798 * 16], m3
-
-; mode 33 [row 1 - first half]
-movu [r0 + 1986 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1651 * 16], m3
-
-; mode 30 [row 3 - second half]
-movu [r0 + 1799 * 16], m3
-
-; mode 33 [row 1 - second half]
-movu [r0 + 1987 * 16], m3
-
-; mode 27 [row 26]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1652 * 16], m3
-
-; mode 29 [row 5 - first half]
-movu [r0 + 1738 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1653 * 16], m3
-
-; mode 29 [row 5 - second half]
-movu [r0 + 1739 * 16], m3
-
-; mode 27 [row 27]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1654 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1655 * 16], m3
-
-; mode 27 [row 28]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1656 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1657 * 16], m3
-
-; mode 27 [row 29]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1658 * 16], m3
-
-; mode 28 [row 11 - first half]
-movu [r0 + 1686 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1659 * 16], m3
-
-; mode 28 [row 11 - second half]
-movu [r0 + 1687 * 16], m3
-
-; mode 27 [row 30]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1660 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1661 * 16], m3
-
-; mode 28 [row 6]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1676 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1677 * 16], m3
-
-; mode 28 [row 8]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1680 * 16], m3
-
-; mode 29 [row 4 - first half]
-movu [r0 + 1736 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1681 * 16], m3
-
-; mode 29 [row 4 - second half]
-movu [r0 + 1737 * 16], m3
-
-; mode 28 [row 10]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1684 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1685 * 16], m3
-
-; mode 29 [row 6]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1740 * 16], m3
-
-; mode 32 [row 2 - first half]
-movu [r0 + 1924 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1741 * 16], m3
-
-; mode 32 [row 2 - second half]
-movu [r0 + 1925 * 16], m3
-
-; mode 30 [row 2]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1796 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1797 * 16], m3
-
-; mode 31 [row 2]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1860 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1861 * 16], m3
-
-; mode 27 [row 15]
-movu m0, [r3 + 3]
-movd m1, [r3 + 4]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 11]
-movd m3, [r3 + 12]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 19]
-movd m3, [r3 + 20]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 27]
-movd m5, [r3 + 28]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1662 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1662 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1663 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1663 * 16 + 8], m5
-
-; mode 28 [row 12]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1688 * 16], m3
-
-; mode 30 [row 4 - first half]
-movu [r0 + 1800 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1689 * 16], m3
-
-; mode 30 [row 4 - second half]
-movu [r0 + 1801 * 16], m3
-
-; mode 28 [row 13]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1690 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1691 * 16], m3
-
-; mode 28 [row 14]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1692 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1693 * 16], m3
-
-; mode 28 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1694 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1695 * 16], m3
-
-; mode 28 [row 16]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1696 * 16], m3
-
-; mode 31 [row 4 - first half]
-movu [r0 + 1864 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1697 * 16], m3
-
-; mode 31 [row 4 - second half]
-movu [r0 + 1865 * 16], m3
-
-; mode 28 [row 17]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1698 * 16], m3
-
-; mode 29 [row 9 - first half]
-movu [r0 + 1746 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1699 * 16], m3
-
-; mode 29 [row 9 - second half]
-movu [r0 + 1747 * 16], m3
-
-; mode 28 [row 18]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1700 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1701 * 16], m3
-
-; mode 29 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1742 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1743 * 16], m3
-
-; mode 29 [row 8]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1744 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1745 * 16], m3
-
-; mode 30 [row 5]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1802 * 16], m3
-
-; mode 33 [row 2 - first half]
-movu [r0 + 1988 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1803 * 16], m3
-
-; mode 33 [row 2 - second half]
-movu [r0 + 1989 * 16], m3
-
-; mode 30 [row 6]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1804 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1805 * 16], m3
-
-; mode 31 [row 3]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1862 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1863 * 16], m3
-
-; mode 32 [row 3]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1926 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1927 * 16], m3
-
-; mode 28 [row 19]
-movu m6, [r5 + 4 * 16]
-movu m0, [r3 + 4]
-movd m1, [r3 + 5]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 12]
-movd m4, [r3 + 13]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1702 * 16], m3
-
-movu m1, [r3 + 20]
-movd m3, [r3 + 21]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 28]
-movd m5, [r3 + 29]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1703 * 16], m3
-
-; mode 28 [row 20]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1704 * 16], m3
-
-; mode 32 [row 4 - first half]
-movu [r0 + 1928 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1705 * 16], m3
-
-; mode 32 [row 4 - second half]
-movu [r0 + 1929 * 16], m3
-
-; mode 28 [row 21]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1706 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1707 * 16], m3
-
-; mode 28 [row 22]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1708 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1709 * 16], m3
-
-; mode 28 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1710 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1711 * 16], m3
-
-; mode 28 [row 24]
-movu m6, [r5 + 29 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1712 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1713 * 16], m3
-
-; mode 29 [row 10]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1748 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1749 * 16], m3
-
-; mode 29 [row 11]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1750 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1751 * 16], m3
-
-; mode 29 [row 12]
-movu m6, [r5 + 21 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1752 * 16], m3
-
-; mode 30 [row 8 -first half]
-movu [r0 + 1808 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1753 * 16], m3
-
-; mode 30 [row 8 -second half]
-movu [r0 + 1809 * 16], m3
-
-; mode 29 [row 13]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1754 * 16], m3
-
-; mode 32 [row 5 - first half]
-movu [r0 + 1930 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1755 * 16], m3
-
-; mode 32 [row 5 - second half]
-movu [r0 + 1931 * 16], m3
-
-; mode 30 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1806 * 16], m3
-
-; mode 33 [row 3 - first half]
-movu [r0 + 1990 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1807 * 16], m3
-
-; mode 33 [row 3 - second half]
-movu [r0 + 1991 * 16], m3
-
-; mode 31 [row 5]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1866 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1867 * 16], m3
-
-; mode 31 [row 6]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1868 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1869 * 16], m3
-
-; mode 28 [row 25]
-movu m6, [r5 + 2 * 16]
-movu m0, [r3 + 5]
-movd m1, [r3 + 6]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 13]
-movd m4, [r3 + 14]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1714 * 16], m3
-
-movu m1, [r3 + 21]
-movd m3, [r3 + 22]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 29]
-movd m5, [r3 + 30]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1715 * 16], m3
-
-; mode 28 [row 26]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1716 * 16], m3
-
-; mode 29 [row 14 - first half]
-movu [r0 + 1756 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1717 * 16], m3
-
-; mode 29 [row 14 - second half]
-movu [r0 + 1757 * 16], m3
-
-; mode 28 [row 27]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1718 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1719 * 16], m3
-
-; mode 28 [row 28]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1720 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1721 * 16], m3
-
-; mode 28 [row 29]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1722 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1723 * 16], m3
-
-; mode 28 [row 30]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1724 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1725 * 16], m3
-
-; mode 29 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1758 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1759 * 16], m3
-
-; mode 29 [row 16]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1760 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1761 * 16], m3
-
-; mode 30 [row 9]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1810 * 16], m3
-
-; mode 33 [row 4 - first half]
-movu [r0 + 1992 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1811 * 16], m3
-
-; mode 33 [row 4 - second half]
-movu [r0 + 1993 * 16], m3
-
-; mode 30 [row 10]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1812 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1813 * 16], m3
-
-; mode 31 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1870 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1871 * 16], m3
-
-; mode 31 [row 8]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1872 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1873 * 16], m3
-
-; mode 32 [row 6]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1932 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1933 * 16], m3
-
-; mode 30 [row 11]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1814 * 16], m3
-
-; mode 33 [row 5 - first half]
-movu [r0 + 1994 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1815 * 16], m3
-
-; mode 33 [row 5 - second half]
-movu [r0 + 1995 * 16], m3
-
-; mode 28 [row 31]
-movu m0, [r3 + 6]
-movd m1, [r3 + 7]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 14]
-movd m3, [r3 + 15]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 22]
-movd m3, [r3 + 23]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 30]
-movd m5, [r3 + 31]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1726 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1726 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1727 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1727 * 16 + 8], m5
-
-; mode 29 [row 17]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1762 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1763 * 16], m3
-
-; mode 29 [row 18]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1764 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1765 * 16], m3
-
-; mode 29 [row 19]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1766 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1767 * 16], m3
-
-; mode 29 [row 20]
-movu m6, [r5 + 29 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1768 * 16], m3
-
-; mode 32 [row 8 - first halif]
-movu [r0 + 1936 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1769 * 16], m3
-
-; mode 32 [row 8 - second halif]
-movu [r0 + 1937 * 16], m3
-
-; mode 30 [row 12]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1816 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1817 * 16], m3
-
-; mode 30 [row 13]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1818 * 16], m3
-
-; mode 33 [row 6 - first half]
-movu [r0 + 1996 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1819 * 16], m3
-
-; mode 33 [row 6 - second half]
-movu [r0 + 1997 * 16], m3
-
-; mode 31 [row 9]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1874 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1875 * 16], m3
-
-; mode 31 [row 10]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1876 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1877 * 16], m3
-
-; mode 32 [row 7]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1934 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1935 * 16], m3
-
-; mode 29 [row 21]
-movu m6, [r5 + 6 * 16]
-movu m0, [r3 + 7]
-movd m1, [r3 + 8]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 15]
-movd m4, [r3 + 16]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1770 * 16], m3
-
-movu m1, [r3 + 23]
-movd m3, [r3 + 24]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 31]
-movd m5, [r3 + 32]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1771 * 16], m3
-
-; mode 29 [row 22]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1772 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1773 * 16], m3
-
-; mode 29 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1774 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1775 * 16], m3
-
-; mode 30 [row 14]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1820 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1821 * 16], m3
-
-; mode 30 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1822 * 16], m3
-
-; mode 33 [row 7 - first half]
-movu [r0 + 1998 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1823 * 16], m3
-
-; mode 33 [row 7 - second half]
-movu [r0 + 1999 * 16], m3
-
-; mode 30 [row 16]
-movu m6, [r5 + 29 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1824 * 16], m3
-
-; mode 31 [row 12 - first half]
-movu [r0 + 1880 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1825 * 16], m3
-
-; mode 31 [row 12 - second half]
-movu [r0 + 1881 * 16], m3
-
-; mode 31 [row 11]
-movu m6, [r5 + 12 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1878 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1879 * 16], m3
-
-; mode 32 [row 9]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1938 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1939 * 16], m3
-
-; mode 29 [row 24]
-movu m6, [r5 + 1 * 16]
-movu m0, [r3 + 8]
-movd m1, [r3 + 9]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 16]
-movd m4, [r3 + 17]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1776 * 16], m3
-
-movu m1, [r3 + 24]
-movd m3, [r3 + 25]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 32]
-movd m5, [r3 + 33]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1777 * 16], m3
-
-; mode 29 [row 25]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1778 * 16], m3
-
-; mode 30 [row 17 - first half]
-movu [r0 + 1826 * 16], m3
-
-; mode 33 [row 8 - first half]
-movu [r0 + 2000 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1779 * 16], m3
-
-; mode 30 [row 17 - second half]
-movu [r0 + 1827 * 16], m3
-
-; mode 33 [row 8 - second half]
-movu [r0 + 2001 * 16], m3
-
-; mode 29 [row 26]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1780 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1781 * 16], m3
-
-; mode 29 [row 27]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1782 * 16], m3
-
-; mode 32 [row 11 - first half]
-movu [r0 + 1942 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1783 * 16], m3
-
-; mode 32 [row 11 - second half]
-movu [r0 + 1943 * 16], m3
-
-; mode 30 [row 18]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1828 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1829 * 16], m3
-
-; mode 31 [row 13]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1882 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1883 * 16], m3
-
-; mode 31 [row 14]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1884 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1885 * 16], m3
-
-; mode 32 [row 10]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1940 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1941 * 16], m3
-
-; mode 29 [row 28]
-movu m6, [r5 + 5 * 16]
-movu m0, [r3 + 9]
-movd m1, [r3 + 10]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 17]
-movd m4, [r3 + 18]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1784 * 16], m3
-
-movu m1, [r3 + 25]
-movd m3, [r3 + 26]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 33]
-movd m5, [r3 + 34]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1785 * 16], m3
-
-; mode 29 [row 29]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1786 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1787 * 16], m3
-
-; mode 29 [row 30]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1788 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1789 * 16], m3
-
-; mode 30 [row 19]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1830 * 16], m3
-
-; mode 33 [row 9 - first half]
-movu [r0 + 2002 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1831 * 16], m3
-
-; mode 33 [row 9 - second half]
-movu [r0 + 2003 * 16], m3
-
-; mode 30 [row 20]
-movu m6, [r5 + 17 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1832 * 16], m3
-
-; mode 32 [row 12 - first half]
-movu [r0 + 1944 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1833 * 16], m3
-
-; mode 32 [row 12 - second half]
-movu [r0 + 1945 * 16], m3
-
-; mode 30 [row 21]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1834 * 16], m3
-
-; mode 33 [row 10 - first half]
-movu [r0 + 2004 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1835 * 16], m3
-
-; mode 33 [row 10 - second half]
-movu [r0 + 2005 * 16], m3
-
-; mode 31 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1886 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1887 * 16], m3
-
-; mode 29 [row 31]
-movu m0, [r3 + 10]
-movd m1, [r3 + 11]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 18]
-movd m3, [r3 + 19]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 26]
-movd m3, [r3 + 27]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 34]
-movd m5, [r3 + 35]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1790 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1790 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1791 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1791 * 16 + 8], m5
-
-; mode 30 [row 22]
-movu m6, [r5 + 11 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1836 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1837 * 16], m3
-
-; mode 30 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1838 * 16], m3
-
-; mode 33 [row 11 - first half]
-movu [r0 + 2006 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1839 * 16], m3
-
-; mode 33 [row 11 - second half]
-movu [r0 + 2007 * 16], m3
-
-; mode 31 [row 16]
-movu m6, [r5 + 1 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1888 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1889 * 16], m3
-
-; mode 31 [row 17]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1890 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1891 * 16], m3
-
-; mode 32 [row 13]
-movu m6, [r5 + 6 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1946 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1947 * 16], m3
-
-; mode 32 [row 14]
-movu m6, [r5 + 27 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1948 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1949 * 16], m3
-
-; mode 30 [row 24]
-movu m6, [r5 + 5 * 16]
-movu m0, [r3 + 11]
-movd m1, [r3 + 12]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 19]
-movd m4, [r3 + 20]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1840 * 16], m3
-
-movu m1, [r3 + 27]
-movd m3, [r3 + 28]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 35]
-movd m5, [r3 + 36]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1841 * 16], m3
-
-; mode 30 [row 25]
-movu m6, [r5 + 18 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1842 * 16], m3
-
-; mode 33 [row 12 - first half]
-movu [r0 + 2008 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1843 * 16], m3
-
-; mode 33 [row 12 - second half]
-movu [r0 + 2009 * 16], m3
-
-; mode 30 [row 26]
-movu m6, [r5 + 31 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1844 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1845 * 16], m3
-
-; mode 31 [row 18]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1892 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1893 * 16], m3
-
-; mode 31 [row 19]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1894 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1895 * 16], m3
-
-; mode 32 [row 15]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1950 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1951 * 16], m3
-
-; mode 30 [row 27]
-movu m6, [r5 + 12 * 16]
-movu m0, [r3 + 12]
-movd m1, [r3 + 13]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 20]
-movd m4, [r3 + 21]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1846 * 16], m3
-
-; mode 33 [row 13 - first half]
-movu [r0 + 2010 * 16], m3
-
-movu m1, [r3 + 28]
-movd m3, [r3 + 29]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 36]
-movd m5, [r3 + 37]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1847 * 16], m3
-
-; mode 33 [row 13 - second half]
-movu [r0 + 2011 * 16], m3
-
-; mode 30 [row 28]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1848 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1849 * 16], m3
-
-; mode 31 [row 20]
-movu m6, [r5 + 5 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1896 * 16], m3
-
-; mode 32 [row 16 - first half]
-movu [r0 + 1952 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1897 * 16], m3
-
-; mode 32 [row 16 - second half]
-movu [r0 + 1953 * 16], m3
-
-; mode 31 [row 21]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1898 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1899 * 16], m3
-
-; mode 32 [row 17]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1954 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1955 * 16], m3
-
-; mode 30 [row 29]
-movu m6, [r5 + 6 * 16]
-movu m0, [r3 + 13]
-movd m1, [r3 + 14]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 21]
-movd m4, [r3 + 22]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1850 * 16], m3
-
-; mode 33 [row 14 - first half]
-movu [r0 + 2012 * 16], m3
-
-movu m1, [r3 + 29]
-movd m3, [r3 + 30]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 37]
-movd m5, [r3 + 38]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1851 * 16], m3
-
-; mode 33 [row 14 - second half]
-movu [r0 + 2013 * 16], m3
-
-; mode 30 [row 30]
-movu m6, [r5 + 19 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1852 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1853 * 16], m3
-
-; mode 31 [row 22]
-movu m6, [r5 + 7 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1900 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1901 * 16], m3
-
-; mode 31 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1902 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1903 * 16], m3
-
-; mode 32 [row 18]
-movu m6, [r5 + 15 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1956 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1957 * 16], m3
-
-; mode 30 [row 31]
-movu m0, [r3 + 14]
-movd m1, [r3 + 15]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 22]
-movd m3, [r3 + 23]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 30]
-movd m3, [r3 + 31]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 38]
-movd m5, [r3 + 39]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1854 * 16], m5
-
-; mode 33 [row 15 - first eight]
-movh [r0 + 2014 * 16], m5
-
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1854 * 16 + 8], m5
-
-; mode 33 [row 15 - second eight]
-movh [r0 + 2014 * 16 + 8], m5
-
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1855 * 16], m5
-
-; mode 33 [row 15 - third eight]
-movh [r0 + 2015 * 16], m5
-
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1855 * 16 + 8], m5
-
-; mode 33 [row 15 - fourth eight]
-movh [r0 + 2015 * 16 + 8], m5
-
-; mode 31 [row 24]
-movu m6, [r5 + 9 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1904 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1905 * 16], m3
-
-; mode 31 [row 25]
-movu m6, [r5 + 26 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1906 * 16], m3
-
-; mode 33 [row 16 - first half]
-movu [r0 + 2016 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1907 * 16], m3
-
-; mode 33 [row 16 - second half]
-movu [r0 + 2017 * 16], m3
-
-; mode 32 [row 19]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1958 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1959 * 16], m3
-
-; mode 32 [row 20]
-movu m6, [r5 + 25 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1960 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1961 * 16], m3
-
-; mode 31 [row 26]
-movu m6, [r5 + 11 * 16]
-movu m0, [r3 + 15]
-movd m1, [r3 + 16]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 23]
-movd m4, [r3 + 24]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1908 * 16], m3
-
-movu m1, [r3 + 31]
-movd m3, [r3 + 32]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 39]
-movd m5, [r3 + 40]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1909 * 16], m3
-
-; mode 31 [row 27]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1910 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1911 * 16], m3
-
-; mode 32 [row 21]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1962 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1963 * 16], m3
-
-; mode 33 [row 17]
-movu m6, [r5 + 20 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2018 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2019 * 16], m3
-
-; mode 31 [row 28]
-movu m6, [r5 + 13 * 16]
-movu m0, [r3 + 16]
-movd m1, [r3 + 17]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 24]
-movd m4, [r3 + 25]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1912 * 16], m3
-
-movu m1, [r3 + 32]
-movd m3, [r3 + 33]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 40]
-movd m5, [r3 + 41]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1913 * 16], m3
-
-; mode 31 [row 29]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1914 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1915 * 16], m3
-
-; mode 32 [row 22]
-movu m6, [r5 + 3 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1964 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1965 * 16], m3
-
-; mode 32 [row 23]
-movu m6, [r5 + 24 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1966 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1967 * 16], m3
-
-; mode 33 [row 18]
-movu m6, [r5 + 14 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2020 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2021 * 16], m3
-
-; mode 31 [row 30]
-movu m6, [r5 + 15 * 16]
-movu m0, [r3 + 17]
-movd m1, [r3 + 18]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 25]
-movd m4, [r3 + 26]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1916 * 16], m3
-
-movu m1, [r3 + 33]
-movd m3, [r3 + 34]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 41]
-movd m5, [r3 + 42]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1917 * 16], m3
-
-; mode 32 [row 24]
-movu m6, [r5 + 13 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1968 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1969 * 16], m3
-
-; mode 33 [row 19]
-movu m6, [r5 + 8 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2022 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2023 * 16], m3
-
-; mode 31 [row 31]
-movu m0, [r3 + 18]
-movd m1, [r3 + 19]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 26]
-movd m3, [r3 + 27]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 34]
-movd m3, [r3 + 35]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 42]
-movd m5, [r3 + 43]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1918 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1918 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1919 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1919 * 16 + 8], m5
-
-; mode 32 [row 25]
-movu m6, [r5 + 2 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1970 * 16], m3
-
-; mode 33 [row 20 - first half]
-movu [r0 + 2024 * 16], m3
-
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1971 * 16], m3
-
-; mode 33 [row 20 - second half]
-movu [r0 + 2025 * 16], m3
-
-; mode 32 [row 26]
-movu m6, [r5 + 23 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1972 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1973 * 16], m3
-
-; mode 33 [row 21]
-movu m6, [r5 + 28 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2026 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2027 * 16], m3
-
-; mode 32 [row 27]
-movu m6, [r5 + 12 * 16]
-movu m0, [r3 + 19]
-movd m1, [r3 + 20]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 27]
-movd m4, [r3 + 28]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1974 * 16], m3
-
-movu m1, [r3 + 35]
-movd m3, [r3 + 36]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 43]
-movd m5, [r3 + 44]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1975 * 16], m3
-
-; mode 33 [row 22]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2028 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2029 * 16], m3
-
-; mode 32 [row 28]
-movu m6, [r5 + 1 * 16]
-movu m0, [r3 + 20]
-movd m1, [r3 + 21]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 28]
-movd m4, [r3 + 29]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1976 * 16], m3
-
-movu m1, [r3 + 36]
-movd m3, [r3 + 37]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 44]
-movd m5, [r3 + 45]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1977 * 16], m3
-
-; mode 32 [row 29]
-movu m6, [r5 + 22 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1978 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1979 * 16], m3
-
-; mode 33 [row 23]
-movu m6, [r5 + 16 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2030 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2031 * 16], m3
-
-; mode 32 [row 30]
-movu m6, [r5 + 11 * 16]
-movu m0, [r3 + 21]
-movd m1, [r3 + 22]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 29]
-movd m4, [r3 + 30]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1980 * 16], m3
-
-movu m1, [r3 + 37]
-movd m3, [r3 + 38]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 45]
-movd m5, [r3 + 46]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 1981 * 16], m3
-
-; mode 33 [row 24]
-movu m6, [r5 + 10 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2032 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2033 * 16], m3
-
-; mode 32 [row 31]
-movu m0, [r3 + 22]
-movd m1, [r3 + 23]
-palignr m1, m0, 1
-punpcklbw m0, m1
-movu m2, [r3 + 30]
-movd m3, [r3 + 31]
-palignr m3, m2, 1
-punpcklbw m2, m3
-movu m1, [r3 + 38]
-movd m3, [r3 + 39]
-palignr m3, m1, 1
-punpcklbw m1, m3
-movu m4, [r3 + 46]
-movd m5, [r3 + 47]
-palignr m5, m4, 1
-punpcklbw m4, m5
-
-pshufb m5, m0, [tab_S2]
-movh [r0 + 1982 * 16], m5
-pshufb m5, m2, [tab_S2]
-movh [r0 + 1982 * 16 + 8], m5
-pshufb m5, m1, [tab_S2]
-movh [r0 + 1983 * 16], m5
-pshufb m5, m4, [tab_S2]
-movh [r0 + 1983 * 16 + 8], m5
-
-; mode 33 [row 25]
-movu m6, [r5 + 4 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2034 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2035 * 16], m3
-
-; mode 33 [row 26]
-movu m6, [r5 + 30 * 16]
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2036 * 16], m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2037 * 16], m3
-
-; mode 33 [row 27]
-movu m6, [r5 + 24 * 16]
-movu m0, [r3 + 23]
-movd m1, [r3 + 24]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 31]
-movd m4, [r3 + 32]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2038 * 16], m3
-
-movu m1, [r3 + 39]
-movd m3, [r3 + 40]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 47]
-movd m5, [r3 + 48]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2039 * 16], m3
-
-; mode 33 [row 28]
-movu m6, [r5 + 18 * 16]
-movu m0, [r3 + 24]
-movd m1, [r3 + 25]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 32]
-movd m4, [r3 + 33]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2040 * 16], m3
-
-movu m1, [r3 + 40]
-movd m3, [r3 + 41]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 48]
-movd m5, [r3 + 49]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2041 * 16], m3
-
-; mode 33 [row 29]
-movu m6, [r5 + 12 * 16]
-movu m0, [r3 + 25]
-movd m1, [r3 + 26]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 33]
-movd m4, [r3 + 34]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2042 * 16], m3
-
-movu m1, [r3 + 41]
-movd m3, [r3 + 42]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 49]
-movd m5, [r3 + 50]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2043 * 16], m3
-
-; mode 33 [row 30]
-movu m6, [r5 + 6 * 16]
-movu m0, [r3 + 26]
-movd m1, [r3 + 27]
-palignr m1, m0, 1
-punpcklbw m0, m1
-pmaddubsw m3, m0, m6
-pmulhrsw m3, m7
-movu m2, [r3 + 34]
-movd m4, [r3 + 35]
-palignr m4, m2, 1
-punpcklbw m2, m4
-pmaddubsw m5, m2, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2044 * 16], m3
-
-movu m1, [r3 + 42]
-movd m3, [r3 + 43]
-palignr m3, m1, 1
-punpcklbw m1, m3
-pmaddubsw m3, m1, m6
-pmulhrsw m3, m7
-movu m4, [r3 + 50]
-movd m5, [r3 + 51]
-palignr m5, m4, 1
-punpcklbw m4, m5
-pmaddubsw m5, m4, m6
-pmulhrsw m5, m7
-packuswb m3, m5
-movu [r0 + 2045 * 16], m3
-
-; mode 33 [row 31]
-movu m5, [r3 + 27]
-movu [r0 + 2046 * 16], m5
-movu m5, [r3 + 43]
-movu [r0 + 2047 * 16], m5
-
-;mode 34 [row 0]
-movu m0, [r3 + 2]
-movu [r0 + 2048 * 16], m0
-movu m1, [r3 + 18]
-movu [r0 + 2049 * 16], m1
-
-;mode 34 [row 1]
-movu m2, [r3 + 34]
-palignr m3, m1, m0, 1
-movu [r0 + 2050 * 16], m3
-palignr m4, m2, m1, 1
-movu [r0 + 2051 * 16], m4
-
-;mode 34 [row 2]
-palignr m3, m1, m0, 2
-movu [r0 + 2052 * 16], m3
-palignr m4, m2, m1, 2
-movu [r0 + 2053 * 16], m4
-
-;mode 34 [row 3]
-palignr m3, m1, m0, 3
-movu [r0 + 2054 * 16], m3
-palignr m4, m2, m1, 3
-movu [r0 + 2055 * 16], m4
-
-;mode 34 [row 4]
-palignr m3, m1, m0, 4
-movu [r0 + 2056 * 16], m3
-palignr m4, m2, m1, 4
-movu [r0 + 2057 * 16], m4
-
-;mode 34 [row 5]
-palignr m3, m1, m0, 5
-movu [r0 + 2058 * 16], m3
-palignr m4, m2, m1, 5
-movu [r0 + 2059 * 16], m4
-
-;mode 34 [row 6]
-palignr m3, m1, m0, 6
-movu [r0 + 2060 * 16], m3
-palignr m4, m2, m1, 6
-movu [r0 + 2061 * 16], m4
-
-;mode 34 [row 7]
-palignr m3, m1, m0, 7
-movu [r0 + 2062 * 16], m3
-palignr m4, m2, m1, 7
-movu [r0 + 2063 * 16], m4
-
-;mode 34 [row 8]
-palignr m3, m1, m0, 8
-movu [r0 + 2064 * 16], m3
-palignr m4, m2, m1, 8
-movu [r0 + 2065 * 16], m4
-
-;mode 34 [row 9]
-palignr m3, m1, m0, 9
-movu [r0 + 2066 * 16], m3
-palignr m4, m2, m1, 9
-movu [r0 + 2067 * 16], m4
-
-;mode 34 [row 10]
-palignr m3, m1, m0, 10
-movu [r0 + 2068 * 16], m3
-palignr m4, m2, m1, 10
-movu [r0 + 2069 * 16], m4
-
-;mode 34 [row 11]
-palignr m3, m1, m0, 11
-movu [r0 + 2070 * 16], m3
-palignr m4, m2, m1, 11
-movu [r0 + 2071 * 16], m4
-
-;mode 34 [row 12]
-palignr m3, m1, m0, 12
-movu [r0 + 2072 * 16], m3
-palignr m4, m2, m1, 12
-movu [r0 + 2073 * 16], m4
-
-;mode 34 [row 13]
-palignr m3, m1, m0, 13
-movu [r0 + 2074 * 16], m3
-palignr m4, m2, m1, 13
-movu [r0 + 2075 * 16], m4
-
-;mode 34 [row 14]
-palignr m3, m1, m0, 14
-movu [r0 + 2076 * 16], m3
-palignr m4, m2, m1, 14
-movu [r0 + 2077 * 16], m4
-
-;mode 34 [row 15]
-palignr m3, m1, m0, 15
-movu [r0 + 2078 * 16], m3
-palignr m4, m2, m1, 15
-movu [r0 + 2079 * 16], m4
-
-;mode 34 [row 16]
-palignr m3, m1, m0, 16
-movu [r0 + 2080 * 16], m3
-palignr m4, m2, m1, 16
-movu [r0 + 2081 * 16], m4
-
-;mode 34 [row 17]
-movu m0, [r3 + 19]
-movu [r0 + 2082 * 16], m0
-movu m1, [r3 + 35]
-movu [r0 + 2083 * 16], m1
-
-;mode 34 [row 18]
-movu m2, [r3 + 51]
-palignr m3, m1, m0, 1
-movu [r0 + 2084 * 16], m3
-palignr m4, m2, m1, 1
-movu [r0 + 2085 * 16], m4
-
-;mode 34 [row 19]
-palignr m3, m1, m0, 2
-movu [r0 + 2086 * 16], m3
-palignr m4, m2, m1, 2
-movu [r0 + 2087 * 16], m4
-
-;mode 34 [row 20]
-palignr m3, m1, m0, 3
-movu [r0 + 2088 * 16], m3
-palignr m4, m2, m1, 3
-movu [r0 + 2089 * 16], m4
-
-;mode 34 [row 21]
-palignr m3, m1, m0, 4
-movu [r0 + 2090 * 16], m3
-palignr m4, m2, m1, 4
-movu [r0 + 2091 * 16], m4
-
-;mode 34 [row 22]
-palignr m3, m1, m0, 5
-movu [r0 + 2092 * 16], m3
-palignr m4, m2, m1, 5
-movu [r0 + 2093 * 16], m4
-
-;mode 34 [row 23]
-palignr m3, m1, m0, 6
-movu [r0 + 2094 * 16], m3
-palignr m4, m2, m1, 6
-movu [r0 + 2095 * 16], m4
-
-;mode 34 [row 24]
-palignr m3, m1, m0, 7
-movu [r0 + 2096 * 16], m3
-palignr m4, m2, m1, 7
-movu [r0 + 2097 * 16], m4
-
-;mode 34 [row 25]
-palignr m3, m1, m0, 8
-movu [r0 + 2098 * 16], m3
-palignr m4, m2, m1, 8
-movu [r0 + 2099 * 16], m4
-
-;mode 34 [row 26]
-palignr m3, m1, m0, 9
-movu [r0 + 2100 * 16], m3
-palignr m4, m2, m1, 9
-movu [r0 + 2101 * 16], m4
-
-;mode 34 [row 27]
-palignr m3, m1, m0, 10
-movu [r0 + 2102 * 16], m3
-palignr m4, m2, m1, 10
-movu [r0 + 2103 * 16], m4
-
-;mode 34 [row 28]
-palignr m3, m1, m0, 11
-movu [r0 + 2104 * 16], m3
-palignr m4, m2, m1, 11
-movu [r0 + 2105 * 16], m4
-
-;mode 34 [row 29]
-palignr m3, m1, m0, 12
-movu [r0 + 2106 * 16], m3
-palignr m4, m2, m1, 12
-movu [r0 + 2107 * 16], m4
-
-;mode 34 [row 30]
-palignr m3, m1, m0, 13
-movu [r0 + 2108 * 16], m3
-palignr m4, m2, m1, 13
-movu [r0 + 2109 * 16], m4
-
-;mode 34 [row 31]
-palignr m3, m1, m0, 14
-movu [r0 + 2110 * 16], m3
-palignr m4, m2, m1, 14
-movu [r0 + 2111 * 16], m4
-
-RET
+cglobal all_angs_pred_32x32, 3,7,8, 0-4
+ mov r6d, [r1 + 64]
+ mov r3d, [r1]
+ mov [rsp], r6d
+ mov [r1 + 64], r3b
+ mov r3d, [r2]
+ mov r6d, [r2 + 64]
+ mov [r2 + 64], r3b
+
+ lea r3, [r2]
+ lea r4, [r2 + 64]
+ lea r2, [r1 + 64]
+
+ ;mode 2[row 0]
+ movu m0, [r4 + 2]
+ movu [r0 + 0 * 16], m0
+ movu m1, [r4 + 18]
+ movu [r0 + 1 * 16], m1
+
+ ;mode 9 [row 15]
+ movu [r0 + 478 * 16], m0
+ movu [r0 + 479 * 16], m1
+
+ ;mode 2[row 1]
+ movu m2, [r4 + 34]
+ palignr m3, m1, m0, 1
+ movu [r0 + 2 * 16], m3
+ palignr m4, m2, m1, 1
+ movu [r0 + 3 * 16], m4
+
+ ; mode 9 [row 31]
+ movu [r0 + 510 * 16], m3
+ movu [r0 + 511 * 16], m4
+
+ ;mode 2[row 17]
+ movu [r0 + 34 * 16], m4
+ movu m5, [r4 + 35]
+ movu [r0 + 35 * 16], m5
+
+ ;mode 2[row 2]
+ palignr m3, m1, m0, 2
+ movu [r0 + 4 * 16], m3
+ palignr m4, m2, m1, 2
+ movu [r0 + 5 * 16], m4
+
+ ;mode 2[row 18]
+ movu [r0 + 36 * 16], m4
+ movu m6, [r4 + 51]
+ palignr m7, m6, m5, 1
+ movu [r0 + 37 * 16], m7
+
+ ;mode 2[row 3]
+ palignr m3, m1, m0, 3
+ movu [r0 + 6 * 16], m3
+ palignr m4, m2, m1, 3
+ movu [r0 + 7 * 16], m4
+
+ ;mode 2[row 19]
+ movu [r0 + 38 * 16], m4
+ palignr m7, m6, m5, 2
+ movu [r0 + 39 * 16], m7
+
+ ;mode 2[row 4]
+ palignr m3, m1, m0, 4
+ movu [r0 + 8 * 16], m3
+ palignr m4, m2, m1, 4
+ movu [r0 + 9 * 16], m4
+
+ ; mode 8 [row 31]
+ movu [r0 + 446 * 16], m3
+ movu [r0 + 447 * 16], m4
+
+ ;mode 2[row 20]
+ movu [r0 + 40 * 16], m4
+ palignr m7, m6, m5, 3
+ movu [r0 + 41 * 16], m7
+
+ ; mode 4 [row 31]
+ movu [r0 + 190 * 16], m4
+ movu [r0 + 191 * 16], m7
+
+ ;mode 2[row 5]
+ palignr m3, m1, m0, 5
+ movu [r0 + 10 * 16], m3
+ palignr m4, m2, m1, 5
+ movu [r0 + 11 * 16], m4
+
+ ;mode 2[row 21]
+ movu [r0 + 42 * 16], m4
+ palignr m7, m6, m5, 4
+ movu [r0 + 43 * 16], m7
+
+ ;mode 2[row 6]
+ palignr m3, m1, m0, 6
+ movu [r0 + 12 * 16], m3
+ palignr m4, m2, m1, 6
+ movu [r0 + 13 * 16], m4
+
+ ;mode 2[row 22]
+ movu [r0 + 44 * 16], m4
+ palignr m7, m6, m5, 5
+ movu [r0 + 45 * 16], m7
+
+ ;mode 2[row 7]
+ palignr m3, m1, m0, 7
+ movu [r0 + 14 * 16], m3
+ palignr m4, m2, m1, 7
+ movu [r0 + 15 * 16], m4
+
+ ;mode 2[row 23]
+ movu [r0 + 46 * 16], m4
+ palignr m7, m6, m5, 6
+ movu [r0 + 47 * 16], m7
+
+ ;mode 2[row 8]
+ palignr m3, m1, m0, 8
+ movu [r0 + 16 * 16], m3
+ palignr m4, m2, m1, 8
+ movu [r0 + 17 * 16], m4
+
+ ;mode 7[row 31]
+ movu [r0 + 382 * 16], m3
+ movu [r0 + 383 * 16], m4
+
+ ;mode 2[row 24]
+ movu [r0 + 48 * 16], m4
+ palignr m7, m6, m5, 7
+ movu [r0 + 49 * 16], m7
+
+ ;mode 2[row 9]
+ palignr m3, m1, m0, 9
+ movu [r0 + 18 * 16], m3
+ palignr m4, m2, m1, 9
+ movu [r0 + 19 * 16], m4
+
+ ;mode 2[row 25]
+ movu [r0 + 50 * 16], m4
+ palignr m7, m6, m5, 8
+ movu [r0 + 51 * 16], m7
+
+ ; mode 3 [row 31]
+ movu [r0 + 126 * 16], m4
+ movu [r0 + 127 * 16], m7
+
+ ;mode 2[row 10]
+ palignr m3, m1, m0, 10
+ movu [r0 + 20 * 16], m3
+ palignr m4, m2, m1, 10
+ movu [r0 + 21 * 16], m4
+
+ ;mode 2[row 26]
+ movu [r0 + 52 * 16], m4
+ palignr m7, m6, m5, 9
+ movu [r0 + 53 * 16], m7
+
+ ;mode 2[row 11]
+ palignr m3, m1, m0, 11
+ movu [r0 + 22 * 16], m3
+ palignr m4, m2, m1, 11
+ movu [r0 + 23 * 16], m4
+
+ ;mode 2[row 27]
+ movu [r0 + 54 * 16], m4
+ palignr m7, m6, m5, 10
+ movu [r0 + 55 * 16], m7
+
+ ;mode 2[row 12]
+ palignr m3, m1, m0, 12
+ movu [r0 + 24 * 16], m3
+ palignr m4, m2, m1, 12
+ movu [r0 + 25 * 16], m4
+
+ ; mode 6 [row 31]
+ movu [r0 + 318 * 16], m3
+ movu [r0 + 319 * 16], m4
+
+ ; mode 3 [row 15]
+ movu [r0 + 94 * 16], m3
+ movu [r0 + 95 * 16], m4
+
+ ;mode 2[row 28]
+ movu [r0 + 56 * 16], m4
+ palignr m7, m6, m5, 11
+ movu [r0 + 57 * 16], m7
+
+ ;mode 2[row 13]
+ palignr m3, m1, m0, 13
+ movu [r0 + 26 * 16], m3
+ palignr m4, m2, m1, 13
+ movu [r0 + 27 * 16], m4
+
+ ;mode 2[row 29]
+ movu [r0 + 58 * 16], m4
+ palignr m7, m6, m5, 12
+ movu [r0 + 59 * 16], m7
+
+ ;mode 2[row 14]
+ palignr m3, m1, m0, 14
+ movu [r0 + 28 * 16], m3
+ palignr m4, m2, m1, 14
+ movu [r0 + 29 * 16], m4
+
+ ;mode 2[row 30]
+ movu [r0 + 60 * 16], m4
+ palignr m7, m6, m5, 13
+ movu [r0 + 61 * 16], m7
+
+ ;mode 2[row 15]
+ palignr m3, m1, m0, 15
+ movu [r0 + 30 * 16], m3
+ palignr m4, m2, m1, 15
+ movu [r0 + 31 * 16], m4
+
+ ;mode 2[row 31]
+ movu [r0 + 62 * 16], m4
+ palignr m7, m6, m5, 14
+ movu [r0 + 63 * 16], m7
+
+ ;mode 2[row 16]
+ movu [r0 + 32 * 16], m1
+ movu [r0 + 33 * 16], m2
+
+ ; mode 5[row 31]
+ movu [r0 + 254 * 16], m1
+ movu [r0 + 255 * 16], m2
+
+ ; mode 3 [row 0]
+ lea r5, [ang_table]
+ movu m6, [r5 + 26 * 16]
+ movu m7, [pw_1024 ]
+ movu m1, [r4 + 1 ]
+ punpcklbw m1, m0
+ pmaddubsw m0, m1, m6
+ pmulhrsw m0, m7
+ movu m2, [r4 + 9]
+ movd m3, [r4 + 10]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m0, m3
+ movu [r0 + 64 * 16], m0
+
+ ; mode 6 [row 1 - first half]
+ movu [r0 + 258 * 16], m0
+
+ ; mode 9 [row 12 - first half]
+ movu [r0 + 472 * 16], m0
+
+ movu m0, [r4 + 17]
+ movd m3, [r4 + 18]
+ palignr m3, m0, 1
+ punpcklbw m0, m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 25]
+ movd m5, [r4 + 26]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 65 * 16], m3
+
+ ; mode 6 [row 1 - second half]
+ movu [r0 + 259 * 16], m3
+
+ ; mode 9 [row 12 - second half]
+ movu [r0 + 473 * 16], m3
+
+ ; mode 4 [row 0]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 128 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 129 * 16], m3
+
+ ; mode 5 [row 0]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 192 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 193 * 16], m3
+
+ ; mode 6 [row 0]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 256 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 257 * 16], m3
+
+ ; mode 7 [row 0]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 320 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 321 * 16], m3
+
+ ; mode 7 [row 1]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 322 * 16], m3
+
+ ; mode 9 [row 8 - first half]
+ movu [r0 + 464 * 16], m3
+
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 323 * 16], m3
+
+ ; mode 9 [row 8 - second half]
+ movu [r0 + 465 * 16], m3
+
+ ; mode 7 [row 2]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 324 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 325 * 16], m3
+
+ ; mode 8 [row 0]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 384 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 385 * 16], m3
+
+ ; mode 8 [row 1]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 386 * 16], m3
+
+ ; mode 9 [row 4 - first half]
+ movu [r0 + 456 * 16], m3
+
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 387 * 16], m3
+
+ ; mode 9 [row 4 - second half]
+ movu [r0 + 457 * 16], m3
+
+ ; mode 8 [row 2]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 388 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 389 * 16], m3
+
+ ; mode 8 [row 3]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 390 * 16], m3
+
+ ; mode 9 [row 9 - first half]
+ movu [r0 + 466 * 16], m3
+
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 391 * 16], m3
+
+ ; mode 9 [row 9 - second half]
+ movu [r0 + 467 * 16], m3
+
+ ; mode 8 [row 4]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 392 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 393 * 16], m3
+
+ ; mode 8 [row 5]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 394 * 16], m3
+
+ ; mode 9 [row 14 - first half]
+ movu [r0 + 476 * 16], m3
+
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 395 * 16], m3
+
+ ; mode 9 [row 14 - second half]
+ movu [r0 + 477 * 16], m3
+
+ ; mode 9 [row 0]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 448 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 449 * 16], m3
+
+ ; mode 9 [row 1]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 450 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 451 * 16], m3
+
+ ; mode 9 [row 2]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 452 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 453 * 16], m3
+
+ ; mode 9 [row 3]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 454 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 455 * 16], m3
+
+ ; mode 9 [row 5]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 458 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 459 * 16], m3
+
+ ; mode 9 [row 6]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 460 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 461 * 16], m3
+
+ ; mode 9 [row 7]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 462 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 463 * 16], m3
+
+ ; mode 9 [row 10]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 468 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 469 * 16], m3
+
+ ; mode 9 [row 11]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 470 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 471 * 16], m3
+
+ ; mode 9 [row 13]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 474 * 16], m3
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 475 * 16], m3
+
+ ; mode 3 [row 1]
+ movu m6, [r5 + 20 * 16]
+ movu m0, [r4 + 2]
+ movd m1, [r4 + 3]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 10]
+ movd m3, [r4 + 11]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 66 * 16], m1
+
+ ; mode 6 [row 3 - first half]
+ movu [r0 + 262 * 16], m1
+
+ ; mode 9 [row 25 - first half]
+ movu [r0 + 498 * 16], m1
+
+ movu m1, [r4 + 18]
+ movd m3, [r4 + 19]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 26]
+ movd m5, [r4 + 27]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 67 * 16], m3
+
+ ; mode 6 [row 3 - second half]
+ movu [r0 + 263 * 16], m3
+
+ ; mode 9 [row 25 - second half]
+ movu [r0 + 499 * 16], m3
+
+ ; mode 4 [row 1]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 130 * 16], m3
+
+ ; mode 9 [row 20 - first half]
+ movu [r0 + 488 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 131 * 16], m3
+
+ ; mode 9 [row 20 - second half]
+ movu [r0 + 489 * 16], m3
+
+ ; mode 4 [row 2]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 132 * 16], m3
+
+ ; mode 7 [row 6 - first half]
+ movu [r0 + 332 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 133 * 16], m3
+
+ ; mode 7 [row 6 - second half]
+ movu [r0 + 333 * 16], m3
+
+ ; mode 5 [row 1]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 194 * 16], m3
+
+ ; mode 5 [row 1 - first half]
+ movu [r0 + 480 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 195 * 16], m3
+
+ ; mode 5 [row 1 - second half]
+ movu [r0 + 481 * 16], m3
+
+ ; mode 5 [row 2]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 196 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 197 * 16], m3
+
+ ; mode 6 [row 2]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 260 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 261 * 16], m3
+
+ ; mode 7 [row 3]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 326 * 16], m3
+
+ ; mode 9 [row 17 - first half]
+ movu [r0 + 482 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 327 * 16], m3
+
+ ; mode 9 [row 17 - second half]
+ movu [r0 + 483 * 16], m3
+
+ ; mode 7 [row 4]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 328 * 16], m3
+
+ ; mode 8 [row 8 - first half]
+ movu [r0 + 400 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 329 * 16], m3
+
+ ; mode 8 [row 8 - second half]
+ movu [r0 + 401 * 16], m3
+
+ ; mode 7 [row 5]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 330 * 16], m3
+
+ ; mode 9 [row 26 - first half]
+ movu [r0 + 500 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 331 * 16], m3
+
+ ; mode 9 [row 26 - second half]
+ movu [r0 + 501 * 16], m3
+
+ ; mode 8 [row 6]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 396 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 397 * 16], m3
+
+ ; mode 9 [row 18]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 484 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 485 * 16], m3
+
+ ; mode 9 [row 21]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 490 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 491 * 16], m3
+
+ ; mode 9 [row 22]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 492 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 493 * 16], m3
+
+ ; mode 9 [row 23]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 494 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 495 * 16], m3
+
+ ; mode 9 [row 27]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 502 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 503 * 16], m3
+
+ ; mode 9 [row 28]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 504 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 505 * 16], m3
+
+ ; mode 9 [row 30]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 508 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 509 * 16], m3
+
+ ; mode 8 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 398 * 16], m3
+
+ ; mode 9 [row 19 - first half]
+ movu [r0 + 486 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 399 * 16], m3
+
+ ; mode 9 [row 19 - second half]
+ movu [r0 + 487 * 16], m3
+
+ ; mode 8 [row 9]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 402 * 16], m3
+
+ ; mode 9 [row 24 - first half]
+ movu [r0 + 496 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 403 * 16], m3
+
+ ; mode 9 [row 24 - second half]
+ movu [r0 + 497 * 16], m3
+
+ ; mode 8 [row 10]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 404 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 405 * 16], m3
+
+ ; mode 8 [row 11]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 406 * 16], m3
+
+ ; mode 9 [row 29 - first half]
+ movu [r0 + 506 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 407 * 16], m3
+
+ ; mode 9 [row 29 - second half]
+ movu [r0 + 507 * 16], m3
+
+ ; mode 3 [row 2]
+ movu m6, [r5 + 14 * 16]
+ movu m0, [r4 + 3]
+ movd m1, [r4 + 4]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 11]
+ movd m3, [r4 + 12]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 68 * 16], m1
+
+ ; mode 3 [row 2 - first half]
+ movu [r0 + 266 * 16], m1
+
+ movu m1, [r4 + 19]
+ movd m3, [r4 + 20]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 27]
+ movd m5, [r4 + 28]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 69 * 16], m3
+
+ ; mode 3 [row 2 - second half]
+ movu [r0 + 267 * 16], m3
+
+ ; mode 4 [row 3]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 134 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 135 * 16], m3
+
+ ; mode 5 [row 3]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 198 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 199 * 16], m3
+
+ ; mode 5 [row 4]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 200 * 16], m3
+
+ ; mode 8 [row 16 - first half]
+ movu [r0 + 416 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 201 * 16], m3
+
+ ; mode 8 [row 16 - second half]
+ movu [r0 + 417 * 16], m3
+
+ ; mode 6 [row 4]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 264 * 16], m3
+
+ ; mode 6 [row 4 - first half]
+ movu [r0 + 408 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 265 * 16], m3
+
+ ; mode 6 [row 4 - second half]
+ movu [r0 + 409 * 16], m3
+
+ ; mode 6 [row 6]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 268 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 269 * 16], m3
+
+ ; mode 7 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 334 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 335 * 16], m3
+
+ ; mode 7 [row 8]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 336 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 337 * 16], m3
+
+ ; mode 7 [row 9]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 338 * 16], m3
+
+ ; mode 8 [row 17 - first half]
+ movu [r0 + 418 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 339 * 16], m3
+
+ ; mode 8 [row 17 - second half]
+ movu [r0 + 419 * 16], m3
+
+ ; mode 8 [row 13]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 410 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 411 * 16], m3
+
+ ; mode 8 [row 14]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 412 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 413 * 16], m3
+
+ ; mode 8 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 414 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 415 * 16], m3
+
+ ; mode 8 [row 18]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 420 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 421 * 16], m3
+
+ ; mode 3 [row 3]
+ movu m6, [r5 + 8 * 16]
+ movu m0, [r4 + 4]
+ movd m1, [r4 + 5]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 12]
+ movd m3, [r4 + 13]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 70 * 16], m1
+
+ ; mode 6 [row 7 - first half]
+ movu [r0 + 270 * 16], m1
+
+ movu m1, [r4 + 20]
+ movd m3, [r4 + 21]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 28]
+ movd m5, [r4 + 29]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 71 * 16], m3
+
+ ; mode 6 [row 7 - second half]
+ movu [r0 + 271 * 16], m3
+
+ ; mode 4 [row 4]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 136 * 16], m3
+
+ ; mode 4 [row 4 - first half]
+ movu [r0 + 424 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 137 * 16], m3
+
+ ; mode 4 [row 4 - second half]
+ movu [r0 + 425 * 16], m3
+
+ ; mode 4 [row 5]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 138 * 16], m3
+
+ ; mode 7 [row 13 - first half]
+ movu [r0 + 346 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 139 * 16], m3
+
+ ; mode 7 [row 13 - second half]
+ movu [r0 + 347 * 16], m3
+
+ ; mode 5 [row 5]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 202 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 203 * 16], m3
+
+ ; mode 5 [row 6]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 204 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 205 * 16], m3
+
+ ; mode 6 [row 8]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 272 * 16], m3
+
+ ; mode 7 [row 12 - first half]
+ movu [r0 + 344 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 273 * 16], m3
+
+ ; mode 7 [row 12 - second half]
+ movu [r0 + 345 * 16], m3
+
+ ; mode 7 [row 10]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 340 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 341 * 16], m3
+
+ ; mode 7 [row 11]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 342 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 343 * 16], m3
+
+ ; mode 8 [row 19]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 422 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 423 * 16], m3
+
+ ; mode 8 [row 21]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 426 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 427 * 16], m3
+
+ ; mode 8 [row 22]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 428 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 429 * 16], m3
+
+ ; mode 8 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 430 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 431 * 16], m3
+
+ ; mode 8 [row 24]
+ movu m6, [r5 + 29 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 432 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 433 * 16], m3
+
+ ; mode 3 [row 4]
+ movu m6, [r5 + 2 * 16]
+ movu m0, [r4 + 5]
+ movd m1, [r4 + 6]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 13]
+ movd m3, [r4 + 14]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 72 * 16], m1
+
+ ; mode 3 [row 4 - first half]
+ movu [r0 + 274 * 16], m1
+
+ ; mode 8 [row 25 - first half]
+ movu [r0 + 434 * 16], m1
+
+ movu m1, [r4 + 21]
+ movd m3, [r4 + 22]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 29]
+ movd m5, [r4 + 30]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 73 * 16], m3
+
+ ; mode 3 [row 4 - second half]
+ movu [r0 + 275 * 16], m3
+
+ ; mode 8 [row 25 - second half]
+ movu [r0 + 435 * 16], m3
+
+ ; mode 3 [row 5]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 74 * 16], m3
+
+ ; mode 3 [row 5 - first half]
+ movu [r0 + 278 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 75 * 16], m3
+
+ ; mode 3 [row 5 - second half]
+ movu [r0 + 279 * 16], m3
+
+ ; mode 4 [row 6]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 140 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 141 * 16], m3
+
+ ; mode 5 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 206 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 207 * 16], m3
+
+ ; mode 5 [row 8]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 208 * 16], m3
+
+ ; mode 7 [row 16 - first half]
+ movu [r0 + 352 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 209 * 16], m3
+
+ ; mode 7 [row 16 - second half]
+ movu [r0 + 353 * 16], m3
+
+ ; mode 6 [row 10]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 276 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 277 * 16], m3
+
+ ; mode 7 [row 14]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 348 * 16], m3
+
+ ; mode 8 [row 26 - first half]
+ movu [r0 + 436 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 349 * 16], m3
+
+ ; mode 8 [row 26 - second half]
+ movu [r0 + 437 * 16], m3
+
+ ; mode 7 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 350 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 351 * 16], m3
+
+ ; mode 8 [row 27]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 438 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 439 * 16], m3
+
+ ; mode 8 [row 28]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 440 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 441 * 16], m3
+
+ ; mode 8 [row 29]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 442 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 443 * 16], m3
+
+ ; mode 8 [row 30]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 444 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 445 * 16], m3
+
+ ; mode 3 [row 6]
+ movu m6, [r5 + 22 * 16]
+ movu m0, [r4 + 6]
+ movd m1, [r4 + 7]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 14]
+ movd m3, [r4 + 15]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 76 * 16], m1
+
+ ; mode 6 [row 13 - first half]
+ movu [r0 + 282 * 16], m1
+
+ movu m1, [r4 + 22]
+ movd m3, [r4 + 23]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 30]
+ movd m5, [r4 + 31]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 77 * 16], m3
+
+ ; mode 6 [row 13 - second half]
+ movu [r0 + 283 * 16], m3
+
+ ; mode 4 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 142 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 143 * 16], m3
+
+ ; mode 4 [row 8]
+ movu m6, [r5 + 29 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 144 * 16], m3
+
+ ; mode 4 [row 8 - first half]
+ movu [r0 + 360 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 145 * 16], m3
+
+ ; mode 4 [row 8 - second half]
+ movu [r0 + 361 * 16], m3
+
+ ; mode 5 [row 9]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 210 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 211 * 16], m3
+
+ ; mode 5 [row 10]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 212 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 213 * 16], m3
+
+ ; mode 7 [row 17]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 354 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 355 * 16], m3
+
+ ; mode 7 [row 18]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 356 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 357 * 16], m3
+
+ ; mode 7 [row 19]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 358 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 359 * 16], m3
+
+ ; mode 6 [row 12]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 280 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 281 * 16], m3
+
+ ; mode 3 [row 7]
+ movu m6, [r5 + 16 * 16]
+ movu m0, [r4 + 7]
+ movd m1, [r4 + 8]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 15]
+ movd m3, [r4 + 16]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 78 * 16], m1
+
+ ; mode 6 [row 15 - first half]
+ movu [r0 + 286 * 16], m1
+
+ movu m1, [r4 + 23]
+ movd m3, [r4 + 24]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 31]
+ movd m5, [r4 + 32]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 79 * 16], m3
+
+ ; mode 6 [row 15 - second half]
+ movu [r0 + 287 * 16], m3
+
+ ; mode 4 [row 9]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 146 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 147 * 16], m3
+
+ ; mode 5 [row 11]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 214 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 215 * 16], m3
+
+ ; mode 5 [row 12]
+ movu m6, [r5 + 29 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 216 * 16], m3
+
+ ; mode 6 [row 16 - first half]
+ movu [r0 + 288 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 217 * 16], m3
+
+ ; mode 6 [row 16 - second half]
+ movu [r0 + 289 * 16], m3
+
+ ; mode 6 [row 14]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 284 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 285 * 16], m3
+
+ ; mode 7 [row 21]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 362 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 363 * 16], m3
+
+ ; mode 7 [row 22]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 364 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 365 * 16], m3
+
+ ; mode 7 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 366 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 367 * 16], m3
+
+ ; mode 3 [row 8]
+ movu m6, [r5 + 10 * 16]
+ movu m0, [r4 + 8]
+ movd m1, [r4 + 9]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 16]
+ movd m3, [r4 + 17]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 80 * 16], m1
+
+ ; mode 7 [row 25 - first half]
+ movu [r0 + 290 * 16], m1
+
+ ; mode 6 [row 17 - first half]
+ movu [r0 + 370 * 16], m1
+
+ movu m1, [r4 + 24]
+ movd m3, [r4 + 25]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 32]
+ movd m5, [r4 + 33]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 81 * 16], m3
+
+ ; mode 7 [row 25 - second half]
+ movu [r0 + 291 * 16], m3
+
+ ; mode 6 [row 17 - second half]
+ movu [r0 + 371 * 16], m3
+
+ ; mode 4 [row 10]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 148 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 149 * 16], m3
+
+ ; mode 4 [row 11]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 150 * 16], m3
+
+ ; mode 7 [row 27 - first half]
+ movu [r0 + 374 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 151 * 16], m3
+
+ ; mode 7 [row 27 - second half]
+ movu [r0 + 375 * 16], m3
+
+ ; mode 5 [row 13]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 218 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 219 * 16], m3
+
+ ; mode 5 [row 14]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 220 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 221 * 16], m3
+
+ ; mode 6 [row 18]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 292 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 293 * 16], m3
+
+ ; mode 7 [row 24]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 368 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 369 * 16], m3
+
+ ; mode 7 [row 26]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 372 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 373 * 16], m3
+
+ ; mode 3 [row 9]
+ movu m6, [r5 + 4 * 16]
+ movu m0, [r4 + 9]
+ movd m1, [r4 + 10]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 17]
+ movd m3, [r4 + 18]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 82 * 16], m1
+
+ ; mode 6 [row 19 - first half]
+ movu [r0 + 294 * 16], m1
+
+ movu m1, [r4 + 25]
+ movd m3, [r4 + 26]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 33]
+ movd m5, [r4 + 34]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 83 * 16], m3
+
+ ; mode 6 [row 19 - second half]
+ movu [r0 + 295 * 16], m3
+
+ ; mode 4 [row 12]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 152 * 16], m3
+
+ ; mode 4 [row 12 - first half]
+ movu [r0 + 296 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 153 * 16], m3
+
+ ; mode 4 [row 12 - second half]
+ movu [r0 + 297 * 16], m3
+
+ ; mode 3 [row 10]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 84 * 16], m3
+
+ ; mode 6 [row 21 - first half]
+ movu [r0 + 298 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 85 * 16], m3
+
+ ; mode 6 [row 21 - second half]
+ movu [r0 + 299 * 16], m3
+
+ ; mode 5 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 222 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 223 * 16], m3
+
+ ; mode 7 [row 28]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 376 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 377 * 16], m3
+
+ ; mode 7 [row 29]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 378 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 379 * 16], m3
+
+ ; mode 7 [row 30]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 380 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 381 * 16], m3
+
+ ; mode 3 [row 11]
+ movu m6, [r5 + 24 * 16]
+ movu m0, [r4 + 10]
+ movd m1, [r4 + 11]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 18]
+ movd m3, [r4 + 19]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 86 * 16], m1
+
+ ; mode 6 [row 23 - first half]
+ movu [r0 + 302 * 16], m1
+
+ movu m1, [r4 + 26]
+ movd m3, [r4 + 27]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 34]
+ movd m5, [r4 + 35]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 87 * 16], m3
+
+ ; mode 6 [row 23 - second half]
+ movu [r0 + 303 * 16], m3
+
+ ; mode 4 [row 13]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 154 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 155 * 16], m3
+
+ ; mode 4 [row 14]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 156 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 157 * 16], m3
+
+ ; mode 5 [row 16]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 224 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 225 * 16], m3
+
+ ; mode 5 [row 17]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 226 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 227 * 16], m3
+
+ ; mode 6 [row 22]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 300 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 301 * 16], m3
+
+ ; mode 3 [row 12]
+ movu m6, [r5 + 18 * 16]
+ movu m0, [r4 + 11]
+ movd m1, [r4 + 12]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 19]
+ movd m3, [r4 + 20]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 88 * 16], m1
+
+ ; mode 6 [row 25 - first half]
+ movu [r0 + 306 * 16], m1
+
+ movu m1, [r4 + 27]
+ movd m3, [r4 + 28]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 35]
+ movd m5, [r4 + 36]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 89 * 16], m3
+
+ ; mode 6 [row 25 - second half]
+ movu [r0 + 307 * 16], m3
+
+ ; mode 4 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 158 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 159 * 16], m3
+
+ ; mode 5 [row 18]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 228 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 229 * 16], m3
+
+ ; mode 5 [row 19]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 230 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 231 * 16], m3
+
+ ; mode 6 [row 24]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 304 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 305 * 16], m3
+
+ ; mode 6 [row 26]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 308 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 309 * 16], m3
+
+ ; mode 3 [row 13]
+ movu m6, [r5 + 12 * 16]
+ movu m0, [r4 + 12]
+ movd m1, [r4 + 13]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 20]
+ movd m3, [r4 + 21]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 90 * 16], m1
+
+ movu m1, [r4 + 28]
+ movd m3, [r4 + 29]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 36]
+ movd m5, [r4 + 37]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 91 * 16], m3
+
+ ; mode 4 [row 16]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 160 * 16], m3
+
+ ; mode 5 [row 20 - first half]
+ movu [r0 + 232 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 161 * 16], m3
+
+ ; mode 5 [row 20 - second half]
+ movu [r0 + 233 * 16], m3
+
+ ; mode 4 [row 17]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 162 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 163 * 16], m3
+
+ ; mode 5 [row 21]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 234 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 235 * 16], m3
+
+ ; mode 6 [row 27]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 310 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 311 * 16], m3
+
+ ; mode 6 [row 28]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 312 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 313 * 16], m3
+
+ ; mode 3 [row 14]
+ movu m6, [r5 + 6 * 16]
+ movu m0, [r4 + 13]
+ movd m1, [r4 + 14]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 21]
+ movd m3, [r4 + 22]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 92 * 16], m1
+
+ ; mode 6 [row 29 - first half]
+ movu [r0 + 314 * 16], m1
+
+ movu m1, [r4 + 29]
+ movd m3, [r4 + 30]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 37]
+ movd m5, [r4 + 38]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 93 * 16], m3
+
+ ; mode 6 [row 29 - second half]
+ movu [r0 + 315 * 16], m3
+
+ ; mode 4 [row 18]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 164 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 165 * 16], m3
+
+ ; mode 5 [row 22]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 236 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 237 * 16], m3
+
+ ; mode 5 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 238 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 239 * 16], m3
+
+ ; mode 6 [row 30]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 316 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 317 * 16], m3
+
+ ; mode 3 [row 16]
+ movu m6, [r5 + 26 * 16]
+ movu m0, [r4 + 14]
+ movd m1, [r4 + 15]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 22]
+ movd m3, [r4 + 23]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 96 * 16], m1
+
+ ; mode 5 [row 25 - first half]
+ movu [r0 + 242 * 16], m1
+
+ movu m1, [r4 + 30]
+ movd m3, [r4 + 31]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 38]
+ movd m5, [r4 + 39]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 97 * 16], m3
+
+ ; mode 5 [row 25 - second half]
+ movu [r0 + 243 * 16], m3
+
+ ; mode 4 [row 19]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 166 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 167 * 16], m3
+
+ ; mode 4 [row 20]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 168 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 169 * 16], m3
+
+ ; mode 5 [row 24]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 240 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 241 * 16], m3
+
+ ; mode 3 [row 17]
+ movu m6, [r5 + 20 * 16]
+ movu m0, [r4 + 15]
+ movd m1, [r4 + 16]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 23]
+ movd m3, [r4 + 24]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 98 * 16], m1
+
+ movu m1, [r4 + 31]
+ movd m3, [r4 + 32]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 39]
+ movd m5, [r4 + 40]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 99 * 16], m3
+
+ ; mode 4 [row 21]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 170 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 171 * 16], m3
+
+ ; mode 5 [row 26]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 244 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 245 * 16], m3
+
+ ; mode 5 [row 27]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 246 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 247 * 16], m3
+
+ ; mode 3 [row 18]
+ movu m6, [r5 + 14 * 16]
+ movu m0, [r4 + 16]
+ movd m1, [r4 + 17]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 24]
+ movd m3, [r4 + 25]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 100 * 16], m1
+
+ movu m1, [r4 + 32]
+ movd m3, [r4 + 33]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 40]
+ movd m5, [r4 + 41]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 101 * 16], m3
+
+ ; mode 4 [row 22]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 172 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 173 * 16], m3
+
+ ; mode 4 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 174 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 175 * 16], m3
+
+ ; mode 5 [row 28]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 248 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 249 * 16], m3
+
+ ; mode 5 [row 29]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 250 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 251 * 16], m3
+
+ ; mode 3 [row 19]
+ movu m6, [r5 + 8 * 16]
+ movu m0, [r4 + 17]
+ movd m1, [r4 + 18]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 25]
+ movd m3, [r4 + 26]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 102 * 16], m1
+
+ movu m1, [r4 + 33]
+ movd m3, [r4 + 34]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 41]
+ movd m5, [r4 + 42]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 103 * 16], m3
+
+ ; mode 4 [row 24]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 176 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 177 * 16], m3
+
+ ; mode 5 [row 30]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 252 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 253 * 16], m3
+
+ ; mode 3 [row 20]
+ movu m6, [r5 + 2 * 16]
+ movu m0, [r4 + 18]
+ movd m1, [r4 + 19]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 26]
+ movd m3, [r4 + 27]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 104 * 16], m1
+
+ movu m1, [r4 + 34]
+ movd m3, [r4 + 35]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 42]
+ movd m5, [r4 + 43]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 105 * 16], m3
+
+ ; mode 4 [row 25]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 178 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 179 * 16], m3
+
+ ; mode 4 [row 26]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 180 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 181 * 16], m3
+
+ ; mode 3 [row 21]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 106 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 107 * 16], m3
+
+ ; mode 3 [row 22]
+ movu m6, [r5 + 22 * 16]
+ movu m0, [r4 + 19]
+ movd m1, [r4 + 20]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 27]
+ movd m3, [r4 + 28]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 108 * 16], m1
+
+ movu m1, [r4 + 35]
+ movd m3, [r4 + 36]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 43]
+ movd m5, [r4 + 44]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 109 * 16], m3
+
+ ; mode 4 [row 27]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 182 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 183 * 16], m3
+
+ ; mode 3 [row 23]
+ movu m6, [r5 + 16 * 16]
+ movu m0, [r4 + 20]
+ movd m1, [r4 + 21]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 28]
+ movd m3, [r4 + 29]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 110 * 16], m1
+
+ movu m1, [r4 + 36]
+ movd m3, [r4 + 37]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 44]
+ movd m5, [r4 + 45]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 111 * 16], m3
+
+ ; mode 4 [row 28]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 184 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 185 * 16], m3
+
+ ; mode 4 [row 29]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 186 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 187 * 16], m3
+
+ ; mode 3 [row 24]
+ movu m6, [r5 + 10 * 16]
+ movu m0, [r4 + 21]
+ movd m1, [r4 + 22]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 29]
+ movd m3, [r4 + 30]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 112 * 16], m1
+
+ movu m1, [r4 + 37]
+ movd m3, [r4 + 38]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 45]
+ movd m5, [r4 + 46]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 113 * 16], m3
+
+ ; mode 4 [row 30]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 188 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 189 * 16], m3
+
+ ; mode 3 [row 25]
+ movu m6, [r5 + 4 * 16]
+ movu m0, [r4 + 22]
+ movd m1, [r4 + 23]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 30]
+ movd m3, [r4 + 31]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 114 * 16], m1
+
+ movu m1, [r4 + 38]
+ movd m3, [r4 + 39]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 46]
+ movd m5, [r4 + 47]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 115 * 16], m3
+
+ ; mode 3 [row 26]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 116 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 117 * 16], m3
+
+ ; mode 3 [row 27]
+ movu m6, [r5 + 24 * 16]
+ movu m0, [r4 + 23]
+ movd m1, [r4 + 24]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 31]
+ movd m3, [r4 + 32]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 118 * 16], m1
+
+ movu m1, [r4 + 39]
+ movd m3, [r4 + 40]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 47]
+ movd m5, [r4 + 48]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 119 * 16], m3
+
+ ; mode 3 [row 28]
+ movu m6, [r5 + 18 * 16]
+ movu m0, [r4 + 24]
+ movd m1, [r4 + 25]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 32]
+ movd m3, [r4 + 33]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 120 * 16], m1
+
+ movu m1, [r4 + 40]
+ movd m3, [r4 + 41]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 48]
+ movd m5, [r4 + 49]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 121 * 16], m3
+
+ ; mode 3 [row 29]
+ movu m6, [r5 + 12 * 16]
+ movu m0, [r4 + 25]
+ movd m1, [r4 + 26]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 33]
+ movd m3, [r4 + 34]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 122 * 16], m1
+
+ movu m1, [r4 + 41]
+ movd m3, [r4 + 42]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 49]
+ movd m5, [r4 + 50]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 123 * 16], m3
+
+ ; mode 3 [row 30]
+ movu m6, [r5 + 6 * 16]
+ movu m0, [r4 + 26]
+ movd m1, [r4 + 27]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r4 + 34]
+ movd m3, [r4 + 35]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 124 * 16], m1
+
+ movu m1, [r4 + 42]
+ movd m3, [r4 + 43]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r4 + 50]
+ movd m5, [r4 + 51]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 125 * 16], m3
+
+ ; mode 10
+ movu m1, [r2 + 1]
+ movu m2, [r2 + 17]
+ movu [r0 + 512 * 16], m1
+ movu [r0 + 513 * 16], m2
+ movu [r0 + 514 * 16], m1
+ movu [r0 + 515 * 16], m2
+ movu [r0 + 516 * 16], m1
+ movu [r0 + 517 * 16], m2
+ movu [r0 + 518 * 16], m1
+ movu [r0 + 519 * 16], m2
+ movu [r0 + 520 * 16], m1
+ movu [r0 + 521 * 16], m2
+ movu [r0 + 522 * 16], m1
+ movu [r0 + 523 * 16], m2
+ movu [r0 + 524 * 16], m1
+ movu [r0 + 525 * 16], m2
+ movu [r0 + 526 * 16], m1
+ movu [r0 + 527 * 16], m2
+
+ movu [r0 + 528 * 16], m1
+ movu [r0 + 529 * 16], m2
+ movu [r0 + 530 * 16], m1
+ movu [r0 + 531 * 16], m2
+ movu [r0 + 532 * 16], m1
+ movu [r0 + 533 * 16], m2
+ movu [r0 + 534 * 16], m1
+ movu [r0 + 535 * 16], m2
+ movu [r0 + 536 * 16], m1
+ movu [r0 + 537 * 16], m2
+ movu [r0 + 538 * 16], m1
+ movu [r0 + 539 * 16], m2
+ movu [r0 + 540 * 16], m1
+ movu [r0 + 541 * 16], m2
+ movu [r0 + 542 * 16], m1
+ movu [r0 + 543 * 16], m2
+
+ movu [r0 + 544 * 16], m1
+ movu [r0 + 545 * 16], m2
+ movu [r0 + 546 * 16], m1
+ movu [r0 + 547 * 16], m2
+ movu [r0 + 548 * 16], m1
+ movu [r0 + 549 * 16], m2
+ movu [r0 + 550 * 16], m1
+ movu [r0 + 551 * 16], m2
+ movu [r0 + 552 * 16], m1
+ movu [r0 + 553 * 16], m2
+ movu [r0 + 554 * 16], m1
+ movu [r0 + 555 * 16], m2
+ movu [r0 + 556 * 16], m1
+ movu [r0 + 557 * 16], m2
+ movu [r0 + 558 * 16], m1
+ movu [r0 + 559 * 16], m2
+
+ movu [r0 + 560 * 16], m1
+ movu [r0 + 561 * 16], m2
+ movu [r0 + 562 * 16], m1
+ movu [r0 + 563 * 16], m2
+ movu [r0 + 564 * 16], m1
+ movu [r0 + 565 * 16], m2
+ movu [r0 + 566 * 16], m1
+ movu [r0 + 567 * 16], m2
+ movu [r0 + 568 * 16], m1
+ movu [r0 + 569 * 16], m2
+ movu [r0 + 570 * 16], m1
+ movu [r0 + 571 * 16], m2
+ movu [r0 + 572 * 16], m1
+ movu [r0 + 573 * 16], m2
+ movu [r0 + 574 * 16], m1
+ movu [r0 + 575 * 16], m2
+
+ ; mode 11 [row 0]
+ ; m0 = first 8 reference samples; rows 0-14 of mode 11 are interpolated
+ ; from this window further below
+ movu m0, [r4]
+
+ ; mode 11 [row 15 - first half]
+ ; weight for row 15 is 0/32, so the row is a plain copy of the references
+ ; (duplicate back-to-back store of m0 to the same address removed here)
+ movu [r0 + 606 * 16], m0
+
+ ; mode 12 [row 31]
+ ; shift reference window left by 4 and splice in the projected left-edge
+ ; samples (offsets 26/19/13/6 of the second reference array at r3)
+ pslldq m6, m0, 4
+ pinsrb m6, [r3 + 26], 0
+ pinsrb m6, [r3 + 19], 1
+ pinsrb m6, [r3 + 13], 2
+ pinsrb m6, [r3 + 6], 3
+ movu [r0 + 702 * 16], m6
+ movu m6, [r4 + 12]
+ movu [r0 + 703 * 16], m6
+
+ ; mode 11 [row 31]
+ ; shift by 1 and splice in one projected sample ([r3 + 16])
+ pslldq m6, m0, 1
+ pinsrb m6, [r3 + 16], 0
+ movu [r0 + 638 * 16], m6
+ movu m6, [r4 + 15]
+ movu [r0 + 639 * 16], m6
+
+ movd m1, [r4 + 1]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, [r5 + 30 * 16]
+ pmulhrsw m1, m7
+ movu m2, [r4 + 8]
+ movd m3, [r4 + 9]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, [r5 + 30 * 16]
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 576 * 16], m1
+
+ movu m1, [r4 + 16]
+
+ ; mode 11 [row 15 - second half]
+ movu [r0 + 607 * 16], m1
+
+ movd m3, [r4 + 17]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, [r5 + 30 * 16]
+ pmulhrsw m3, m7
+ movu m4, [r4 + 24]
+ movd m5, [r4 + 25]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, [r5 + 30 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 577 * 16], m3
+
+ ; mode 11 [row 1]
+ pmaddubsw m3, m0, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 578 * 16], m3
+ pmaddubsw m3, m1, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 579 * 16], m3
+
+ ; mode 11 [row 2]
+ pmaddubsw m3, m0, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 580 * 16], m3
+ pmaddubsw m3, m1, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 581 * 16], m3
+
+ ; mode 11 [row 3]
+ pmaddubsw m3, m0, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 582 * 16], m3
+ pmaddubsw m3, m1, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 583 * 16], m3
+
+ ; mode 11 [row 4]
+ pmaddubsw m3, m0, [r5 + 22 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 22 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 584 * 16], m3
+
+ ; mode 12 [row 1 - first half]
+ movu [r0 + 642 * 16], m3
+
+ pmaddubsw m3, m1, [r5 + 22 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 22 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 585 * 16], m3
+
+ ; mode 12 [row 1 - second half]
+ movu [r0 + 643 * 16], m3
+
+ ; mode 11 [row 5]
+ pmaddubsw m3, m0, [r5 + 20 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 20 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 586 * 16], m3
+ pmaddubsw m3, m1, [r5 + 20 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 20 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 587 * 16], m3
+
+ ; mode 11 [row 6]
+ pmaddubsw m3, m0, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 588 * 16], m3
+ pmaddubsw m3, m1, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 589 * 16], m3
+
+ ; mode 11 [row 7]
+ pmaddubsw m3, m0, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 16 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 590 * 16], m3
+ pmaddubsw m3, m1, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 16 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 591 * 16], m3
+
+ ; mode 11 [row 8]
+ pmaddubsw m3, m0, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 14 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 592 * 16], m3
+
+ ; mode 13 [row 1 - first half]
+ movu [r0 + 706 * 16], m3
+
+ pmaddubsw m3, m1, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 14 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 593 * 16], m3
+
+ ; mode 13 [row 1 - second half]
+ movu [r0 + 707 * 16], m3
+
+ ; mode 11 [row 9]
+ pmaddubsw m3, m0, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 12 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 594 * 16], m3
+
+ ; mode 12 [row 3 - first half]
+ movu [r0 + 646 * 16], m3
+
+ pmaddubsw m3, m1, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 12 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 595 * 16], m3
+
+ ; mode 12 [row 3 - second half]
+ movu [r0 + 647 * 16], m3
+
+ ; mode 11 [row 10]
+ pmaddubsw m3, m0, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 10 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 596 * 16], m3
+ pmaddubsw m3, m1, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 10 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 597 * 16], m3
+
+ ; mode 11 [row 11]
+ pmaddubsw m3, m0, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 8 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 598 * 16], m3
+ pmaddubsw m3, m1, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 8 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 599 * 16], m3
+
+ ; mode 11 [row 12]
+ pmaddubsw m3, m0, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 600 * 16], m3
+
+ ; mode 14 [row 1 - first half]
+ movu [r0 + 770 * 16], m3
+
+ pmaddubsw m3, m1, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 601 * 16], m3
+
+ ; mode 14 [row 1 - second half]
+ movu [r0 + 771 * 16], m3
+
+ ; mode 11 [row 13]
+ pmaddubsw m3, m0, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 4 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 602 * 16], m3
+ pmaddubsw m3, m1, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 4 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 603 * 16], m3
+
+ ; mode 11 [row 14] - weight 2/32; the result is reused below because
+ ; mode 12 row 5 uses the same weight (mode 12 deltas: 27,22,17,12,7,2)
+ pmaddubsw m3, m0, [r5 + 2 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 2 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 604 * 16], m3
+
+ ; mode 12 [row 5 - first half] (650 = 640 + 2*5; mode 13 row 5 is
+ ; written at 714/715 elsewhere, so the old "mode 13" label was wrong)
+ movu [r0 + 650 * 16], m3
+
+ pmaddubsw m3, m1, [r5 + 2 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 2 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 605 * 16], m3
+
+ ; mode 12 [row 5 - second half]
+ movu [r0 + 651 * 16], m3
+
+ ; mode 12 [row 0]
+ pmaddubsw m3, m0, [r5 + 27 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 27 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 640 * 16], m3
+ pmaddubsw m3, m1, [r5 + 27 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 27 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 641 * 16], m3
+
+ ; mode 12 [row 2]
+ pmaddubsw m3, m0, [r5 + 17 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 17 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 644 * 16], m3
+ pmaddubsw m3, m1, [r5 + 17 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 17 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 645 * 16], m3
+
+ ; mode 12 [row 4]
+ pmaddubsw m3, m0, [r5 + 7 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 7 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 648 * 16], m3
+ pmaddubsw m3, m1, [r5 + 7 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 7 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 649 * 16], m3
+
+ ; mode 13 [row 0]
+ pmaddubsw m3, m0, [r5 + 23 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 23 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 704 * 16], m3
+ pmaddubsw m3, m1, [r5 + 23 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 23 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 705 * 16], m3
+
+ ; mode 13 [row 2]
+ pmaddubsw m3, m0, [r5 + 5 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 5 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 708 * 16], m3
+ pmaddubsw m3, m1, [r5 + 5 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 5 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 709 * 16], m3
+
+ ; mode 14 [row 0]
+ pmaddubsw m3, m0, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 19 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 768 * 16], m3
+ pmaddubsw m3, m1, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 19 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 769 * 16], m3
+
+ ; mode 15 [row 0]
+ pmaddubsw m3, m0, [r5 + 15 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 15 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 832 * 16], m3
+ pmaddubsw m3, m1, [r5 + 15 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 15 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 833 * 16], m3
+
+ ; mode 11 [row 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r3 + 16], 0
+ pmaddubsw m3, m0, [r5 + 30 * 16]
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 8], 1
+ pinsrb m2, [r4 + 7], 0
+ pmaddubsw m5, m2, [r5 + 30 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 608 * 16], m3
+ pslldq m1, 2
+ pinsrb m1, [r4 + 16], 1
+ pinsrb m1, [r4 + 15], 0
+ pmaddubsw m3, m1, [r5 + 30 * 16]
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrb m4, [r4 + 24], 1
+ pinsrb m4, [r4 + 23], 0
+ pmaddubsw m5, m4, [r5 + 30 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 609 * 16], m3
+
+ ; mode 11 [row 17]
+ pmaddubsw m3, m0, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 610 * 16], m3
+ pmaddubsw m3, m1, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 611 * 16], m3
+
+ ; mode 11 [row 18]
+ pmaddubsw m3, m0, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 612 * 16], m3
+ pmaddubsw m3, m1, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 613 * 16], m3
+
+ ; mode 11 [row 19]
+ pmaddubsw m3, m0, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 614 * 16], m3
+ pmaddubsw m3, m1, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 615 * 16], m3
+
+ ; mode 11 [row 20]
+ pmaddubsw m3, m0, [r5 + 22 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 22 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 616 * 16], m3
+ pmaddubsw m3, m1, [r5 + 22 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 22 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 617 * 16], m3
+
+ ; mode 11 [row 21]
+ pmaddubsw m3, m0, [r5 + 20 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 20 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 618 * 16], m3
+ pmaddubsw m3, m1, [r5 + 20 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 20 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 619 * 16], m3
+
+ ; mode 11 [row 22]
+ pmaddubsw m3, m0, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 620 * 16], m3
+ pmaddubsw m3, m1, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 621 * 16], m3
+
+ ; mode 11 [row 23]
+ pmaddubsw m3, m0, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 16 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 622 * 16], m3
+ pmaddubsw m3, m1, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 16 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 623 * 16], m3
+
+ ; mode 11 [row 24]
+ pmaddubsw m3, m0, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 14 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 624 * 16], m3
+ pmaddubsw m3, m1, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 14 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 625 * 16], m3
+
+ ; mode 11 [row 25]
+ pmaddubsw m3, m0, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 12 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 626 * 16], m3
+ pmaddubsw m3, m1, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 12 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 627 * 16], m3
+
+ ; mode 11 [row 26]
+ pmaddubsw m3, m0, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 10 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 628 * 16], m3
+ pmaddubsw m3, m1, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 10 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 629 * 16], m3
+
+ ; mode 11 [row 27]
+ pmaddubsw m3, m0, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 8 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 630 * 16], m3
+ pmaddubsw m3, m1, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 8 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 631 * 16], m3
+
+ ; mode 11 [row 28]
+ pmaddubsw m3, m0, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 632 * 16], m3
+ pmaddubsw m3, m1, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 633 * 16], m3
+
+ ; mode 11 [row 29]
+ pmaddubsw m3, m0, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 4 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 634 * 16], m3
+ pmaddubsw m3, m1, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 4 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 635 * 16], m3
+
+ ; mode 11 [row 30]
+ pmaddubsw m3, m0, [r5 + 2 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 2 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 636 * 16], m3
+ pmaddubsw m3, m1, [r5 + 2 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 2 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 637 * 16], m3
+
+ ; mode 12 [row 6]
+ pinsrb m0, [r3 + 6], 0
+ pmaddubsw m3, m0, [r5 + 29 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 29 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 652 * 16], m3
+ pmaddubsw m3, m1, [r5 + 29 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 29 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 653 * 16], m3
+
+ ; mode 12 [row 7]
+ pmaddubsw m3, m0, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 654 * 16], m3
+ pmaddubsw m3, m1, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 655 * 16], m3
+
+ ; mode 12 [row 8]
+ pmaddubsw m3, m0, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 19 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 656 * 16], m3
+ pmaddubsw m3, m1, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 19 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 657 * 16], m3
+
+ ; mode 12 [row 9]
+ pmaddubsw m3, m0, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 14 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 658 * 16], m3
+ pmaddubsw m3, m1, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 14 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 659 * 16], m3
+
+ ; mode 12 [row 10]
+ pmaddubsw m3, m0, [r5 + 9 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 9 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 660 * 16], m3
+ pmaddubsw m3, m1, [r5 + 9 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 9 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 661 * 16], m3
+
+ ; mode 12 [row 11]
+ pmaddubsw m3, m0, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 4 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 662 * 16], m3
+ pmaddubsw m3, m1, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 4 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 663 * 16], m3
+
+ ; mode 13 [row 3]
+ movu m6, m0
+ pinsrb m6, [r3 + 4], 0
+ pmaddubsw m3, m6, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 710 * 16], m3
+ pmaddubsw m3, m1, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 711 * 16], m3
+
+ ; mode 13 [row 4]
+ pmaddubsw m3, m6, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 19 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 712 * 16], m3
+ pmaddubsw m3, m1, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 19 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 713 * 16], m3
+
+ ; mode 13 [row 5]
+ pmaddubsw m3, m6, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 10 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 714 * 16], m3
+ pmaddubsw m3, m1, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 10 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 715 * 16], m3
+
+ ; mode 13 [row 6]
+ pmaddubsw m3, m6, [r5 + 1 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 1 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 716 * 16], m3
+ pmaddubsw m3, m1, [r5 + 1 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 1 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 717 * 16], m3
+
+ ; mode 14 [row 2]
+ movu m6, m0
+ pinsrb m6, [r4 + 0], 1
+ pinsrb m6, [r3 + 2], 0
+ pmaddubsw m3, m6, [r5 + 25 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 25 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 772 * 16], m3
+ pmaddubsw m3, m1, [r5 + 25 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 25 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 773 * 16], m3
+
+ ; mode 14 [row 3]
+ pmaddubsw m3, m6, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 12 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 774 * 16], m3
+ pmaddubsw m3, m1, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 12 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 775 * 16], m3
+
+ ; mode 15 [row 1]
+ pmaddubsw m3, m6, [r5 + 30 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 30 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 834 * 16], m3
+ pmaddubsw m3, m1, [r5 + 30 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 30 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 835 * 16], m3
+
+ ; mode 15 [row 2]
+ pmaddubsw m3, m6, [r5 + 13 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 13 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 836 * 16], m3
+ pmaddubsw m3, m1, [r5 + 13 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 13 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 837 * 16], m3
+
+ ; mode 15 [row 3]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 2], 1
+ pinsrb m6, [r3 + 4], 0
+ pmaddubsw m3, m6, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 7], 1
+ pinsrb m2, [r4 + 6], 0
+ pmaddubsw m5, m2, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 838 * 16], m3
+ pslldq m1, 2
+ pinsrb m1, [r4 + 15], 1
+ pinsrb m1, [r4 + 14], 0
+ pmaddubsw m3, m1, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrb m4, [r4 + 23], 1
+ pinsrb m4, [r4 + 22], 0
+ pmaddubsw m5, m4, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 839 * 16], m3
+
+ ; mode 15 [row 4]
+ pmaddubsw m3, m6, [r5 + 11 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 11 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 840 * 16], m3
+ pmaddubsw m3, m1, [r5 + 11 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 11 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 841 * 16], m3
+
+ ; mode 15 [row 5, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 4], 1
+ pinsrb m6, [r3 + 6], 0
+ pmaddubsw m3, m6, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 842 * 16], m3
+
+ ; mode 15 [row 6, 0-7]
+ pmaddubsw m3, m6, [r5 + 9 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 844 * 16], m3
+
+ ; mode 15 [row 7, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 6], 1
+ pinsrb m6, [r3 + 8], 0
+ pmaddubsw m3, m6, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 846 * 16], m3
+
+ ; mode 15 [row 8, 0-7]
+ pmaddubsw m3, m6, [r5 + 7 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 848 * 16], m3
+
+ ; mode 15 [row 9, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 8], 1
+ pinsrb m6, [r3 + 9], 0
+ pmaddubsw m3, m6, [r5 + 22 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 850 * 16], m3
+
+ ; mode 15 [row 10, 0-7]
+ pmaddubsw m3, m6, [r5 + 5 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 852 * 16], m3
+
+ ; mode 15 [row 11, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 9], 1
+ pinsrb m6, [r3 + 11], 0
+ pmaddubsw m3, m6, [r5 + 20 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 854 * 16], m3
+
+ ; mode 15 [row 12, 0-7]
+ pmaddubsw m3, m6, [r5 + 3 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 856 * 16], m3
+
+ ; mode 15 [row 13, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 11], 1
+ pinsrb m6, [r3 + 13], 0
+ pmaddubsw m3, m6, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 858 * 16], m3
+
+ ; mode 15 [row 14, 0-7]
+ pmaddubsw m3, m6, [r5 + 1 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 860 * 16], m3
+
+ ; mode 15 [row 15, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 13], 1
+ pinsrb m6, [r3 + 15], 0
+ pmaddubsw m3, m6, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 862 * 16], m3
+
+ ; mode 15 [row 16, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 15], 1
+ pinsrb m6, [r3 + 17], 0
+ pmaddubsw m3, m6, [r5 + 31 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 864 * 16], m3
+
+ ; mode 15 [row 17, 0-7]
+ pmaddubsw m3, m6, [r5 + 14 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 866 * 16], m3
+
+ ; mode 15 [row 18, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 17], 1
+ pinsrb m6, [r3 + 19], 0
+ pmaddubsw m3, m6, [r5 + 29 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 868 * 16], m3
+
+ ; mode 15 [row 19, 0-7]
+ pmaddubsw m3, m6, [r5 + 12 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 870 * 16], m3
+
+ ; mode 15 [row 20, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 19], 1
+ pinsrb m6, [r3 + 21], 0
+ pmaddubsw m3, m6, [r5 + 27 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 872 * 16], m3
+
+ ; mode 15 [row 21, 0-7]
+ pmaddubsw m3, m6, [r5 + 10 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 874 * 16], m3
+
+ ; mode 15 [row 22, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 21], 1
+ pinsrb m6, [r3 + 23], 0
+ pmaddubsw m3, m6, [r5 + 25 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 876 * 16], m3
+
+ ; mode 15 [row 23, 0-7]
+ pmaddubsw m3, m6, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 878 * 16], m3
+
+ ; mode 15 [row 24, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 23], 1
+ pinsrb m6, [r3 + 24], 0
+ pmaddubsw m3, m6, [r5 + 23 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 880 * 16], m3
+
+ ; mode 15 [row 25, 0-7]
+ pmaddubsw m3, m6, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 882 * 16], m3
+
+ ; mode 15 [row 26, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 24], 1
+ pinsrb m6, [r3 + 26], 0
+ pmaddubsw m3, m6, [r5 + 21 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 884 * 16], m3
+
+ ; mode 15 [row 27, 0-7]
+ pmaddubsw m3, m6, [r5 + 4 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 886 * 16], m3
+
+ ; mode 15 [row 28, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 26], 1
+ pinsrb m6, [r3 + 28], 0
+ pmaddubsw m3, m6, [r5 + 19 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 888 * 16], m3
+
+ ; mode 15 [row 29, 0-7]
+ pmaddubsw m3, m6, [r5 + 2 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 890 * 16], m3
+
+ ; mode 15 [row 30, 0-7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 28], 1
+ pinsrb m6, [r3 + 30], 0
+ pmaddubsw m3, m6, [r5 + 17 * 16]
+ pmulhrsw m3, m7
+ packuswb m3, m3
+ movh [r0 + 892 * 16], m3
+
+ ; mode 15 [row 31, 0-7]
+ pshufb m3, m6, [tab_S2]
+ movh [r0 + 894 * 16], m3
+
+ ; mode 12 [row 12]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 6], 1
+ pinsrb m0, [r3 + 13], 0
+ pmaddubsw m3, m0, [r5 + 31 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 31 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 664 * 16], m3
+ pmaddubsw m3, m1, [r5 + 31 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 31 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 665 * 16], m3
+
+ ; mode 12 [row 13]
+ pmaddubsw m3, m0, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 666 * 16], m3
+ pmaddubsw m3, m1, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 667 * 16], m3
+
+ ; mode 12 [row 14]
+ pmaddubsw m3, m0, [r5 + 21 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 21 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 668 * 16], m3
+ pmaddubsw m3, m1, [r5 + 21 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 21 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 669 * 16], m3
+
+ ; mode 12 [row 15]
+ pmaddubsw m3, m0, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 16 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 670 * 16], m3
+ pmaddubsw m3, m1, [r5 + 16 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 16 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 671 * 16], m3
+
+ ; mode 12 [row 16]
+ pmaddubsw m3, m0, [r5 + 11 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 11 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 672 * 16], m3
+ pmaddubsw m3, m1, [r5 + 11 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 11 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 673 * 16], m3
+
+ ; mode 12 [row 17]
+ pmaddubsw m3, m0, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 674 * 16], m3
+ pmaddubsw m3, m1, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 675 * 16], m3
+
+ ; mode 12 [row 18]
+ pmaddubsw m3, m0, [r5 + 1 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 1 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 676 * 16], m3
+ pmaddubsw m3, m1, [r5 + 1 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 1 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 677 * 16], m3
+
+ ; mode 13 [row 7]
+ movu m6, m0
+ pinsrb m6, [r3 + 4], 2
+ pinsrb m6, [r3 + 4], 1
+ pinsrb m6, [r3 + 7], 0
+ pmaddubsw m3, m6, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 718 * 16], m3
+ pmaddubsw m3, m1, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 719 * 16], m3
+
+ ; mode 13 [row 8]
+ pmaddubsw m3, m6, [r5 + 15 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 15 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 720 * 16], m3
+ pmaddubsw m3, m1, [r5 + 15 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 15 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 721 * 16], m3
+
+ ; mode 13 [row 9]
+ pmaddubsw m3, m6, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 722 * 16], m3
+ pmaddubsw m3, m1, [r5 + 6 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 6 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 723 * 16], m3
+
+ ; mode 14 [row 4]
+ pinsrb m6, [r3 + 2], 2
+ pinsrb m6, [r3 + 2], 1
+ pinsrb m6, [r3 + 5], 0
+ pmaddubsw m3, m6, [r5 + 31 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 31 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 776 * 16], m3
+ pmaddubsw m3, m1, [r5 + 31 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 31 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 777 * 16], m3
+
+ ; mode 14 [row 5]
+ pmaddubsw m3, m6, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 778 * 16], m3
+ pmaddubsw m3, m1, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 779 * 16], m3
+
+ ; mode 14 [row 6]
+ pmaddubsw m3, m6, [r5 + 5 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 5 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 780 * 16], m3
+ pmaddubsw m3, m1, [r5 + 5 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 5 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 781 * 16], m3
+
+ ; mode 14 [row 7]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 5], 1
+ pinsrb m6, [r3 + 7], 0
+ pmaddubsw m3, m6, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 5], 0
+ pmaddubsw m5, m2, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 782 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 13], 0
+ pmaddubsw m3, m1, [r5 + 24 * 16]
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 21], 0
+ pmaddubsw m5, m4, [r5 + 24 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 783 * 16], m3
+
+ ; mode 14 [row 8]
+ pmaddubsw m3, m6, [r5 + 11 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 11 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 784 * 16], m3
+ pmaddubsw m3, m1, [r5 + 11 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 11 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 785 * 16], m3
+
+ ; mode 15 [row 5, 8-31]
+ pmaddubsw m5, m2, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movh [r0 + 842 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 26 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 26 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 843 * 16], m3
+
+ ; mode 15 [row 6, 8-31]
+ pmaddubsw m5, m2, [r5 + 9 * 16]
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movh [r0 + 844 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 9 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 9 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 845 * 16], m3
+
+ ; mode 12 [row 19]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 13], 1
+ pinsrb m0, [r3 + 19], 0
+ pmaddubsw m3, m0, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 678 * 16], m3
+ pmaddubsw m3, m1, [r5 + 28 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 28 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 679 * 16], m3
+
+ ; mode 12 [row 20]
+ pmaddubsw m3, m0, [r5 + 23 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 23 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 680 * 16], m3
+ pmaddubsw m3, m1, [r5 + 23 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 23 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 681 * 16], m3
+
+ ; mode 12 [row 21]
+ pmaddubsw m3, m0, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 682 * 16], m3
+ pmaddubsw m3, m1, [r5 + 18 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 18 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 683 * 16], m3
+
+ ; mode 12 [row 22]
+ pmaddubsw m3, m0, [r5 + 13 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 13 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 684 * 16], m3
+ pmaddubsw m3, m1, [r5 + 13 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 13 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 685 * 16], m3
+
+ ; mode 12 [row 23]
+ pmaddubsw m3, m0, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 8 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 686 * 16], m3
+ pmaddubsw m3, m1, [r5 + 8 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 8 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 687 * 16], m3
+
+ ; mode 12 [row 24]
+ pmaddubsw m3, m0, [r5 + 3 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, [r5 + 3 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 688 * 16], m3
+ pmaddubsw m3, m1, [r5 + 3 * 16]
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, [r5 + 3 * 16]
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 689 * 16], m3
+
+ ; mode 13 [row 10]
+ movu m7, m6
+ movu m6, m0
+ pinsrb m6, [r3 + 4], 4
+ pinsrb m6, [r3 + 4], 3
+ pinsrb m6, [r3 + 7], 2
+ pinsrb m6, [r3 + 7], 1
+ pinsrb m6, [r3 + 11], 0
+ pmaddubsw m3, m6, [r5 + 29 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 29 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 724 * 16], m3
+ pmaddubsw m3, m1, [r5 + 29 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 29 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 725 * 16], m3
+
+ ; mode 13 [row 11]
+ pmaddubsw m3, m6, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 726 * 16], m3
+ pmaddubsw m3, m1, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 727 * 16], m3
+
+ ; mode 13 [row 12]
+ pmaddubsw m3, m6, [r5 + 11 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 11 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 728 * 16], m3
+ pmaddubsw m3, m1, [r5 + 11 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 11 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 729 * 16], m3
+
+ ; mode 13 [row 13]
+ pmaddubsw m3, m6, [r5 + 2 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 2 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 730 * 16], m3
+ pmaddubsw m3, m1, [r5 + 2 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 2 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 731 * 16], m3
+
+ ; mode 14 [row 9]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 7], 1
+ pinsrb m7, [r3 + 10], 0
+ pmaddubsw m3, m7, [r5 + 30 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrw m2, [r4 + 4], 0
+ pmaddubsw m5, m2, [r5 + 30 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 786 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 12], 0
+ pmaddubsw m3, m1, [r5 + 30 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrb m4, [r4 + 21], 1
+ pinsrb m4, [r4 + 20], 0
+ pmaddubsw m5, m4, [r5 + 30 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 787 * 16], m3
+
+ ; mode 14 [row 10]
+ pmaddubsw m3, m7, [r5 + 17 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 17 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 788 * 16], m3
+ pmaddubsw m3, m1, [r5 + 17 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 17 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 789 * 16], m3
+
+ ; mode 14 [row 11]
+ pmaddubsw m3, m7, [r5 + 4 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 4 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 790 * 16], m3
+ pmaddubsw m3, m1, [r5 + 4 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 4 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 791 * 16], m3
+
+ movu m6, [pw_1024]
+
+ ; mode 15 [row 7, 8-31]
+ pmaddubsw m5, m2, [r5 + 24 * 16]
+ pmulhrsw m5, m6
+ packuswb m5, m5
+ movh [r0 + 846 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 24 * 16]
+ pmulhrsw m3, m6
+ pmaddubsw m5, m4, [r5 + 24 * 16]
+ pmulhrsw m5, m6
+ packuswb m3, m5
+ movu [r0 + 847 * 16], m3
+
+ ; mode 15 [row 8, 8-31]
+ pmaddubsw m5, m2, [r5 + 7 * 16]
+ pmulhrsw m5, m6
+ packuswb m5, m5
+ movh [r0 + 848 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 7 * 16]
+ pmulhrsw m3, m6
+ pmaddubsw m5, m4, [r5 + 7 * 16]
+ pmulhrsw m5, m6
+ packuswb m3, m5
+ movu [r0 + 849 * 16], m3
+
+ ; mode 12 [row 25]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 19], 1
+ pinsrb m0, [r3 + 26], 0
+ pmaddubsw m3, m0, [r5 + 30 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 30 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 690 * 16], m3
+ pmaddubsw m3, m1, [r5 + 30 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 30 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 691 * 16], m3
+
+ ; mode 12 [row 26]
+ pmaddubsw m3, m0, [r5 + 25 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 25 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 692 * 16], m3
+ pmaddubsw m3, m1, [r5 + 25 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 25 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 693 * 16], m3
+
+ ; mode 12 [row 27]
+ pmaddubsw m3, m0, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 694 * 16], m3
+ pmaddubsw m3, m1, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 695 * 16], m3
+
+ ; mode 12 [row 28]
+ pmaddubsw m3, m0, [r5 + 15 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 15 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 696 * 16], m3
+ pmaddubsw m3, m1, [r5 + 15 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 15 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 697 * 16], m3
+
+ ; mode 12 [row 29]
+ pmaddubsw m3, m0, [r5 + 10 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 10 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 698 * 16], m3
+ pmaddubsw m3, m1, [r5 + 10 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 10 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 699 * 16], m3
+
+ ; mode 12 [row 30]
+ pmaddubsw m3, m0, [r5 + 5 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 5 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 700 * 16], m3
+ pmaddubsw m3, m1, [r5 + 5 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 5 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 701 * 16], m3
+
+ ; mode 13 [row 14]
+ movu m6, m0
+ pinsrb m6, [r3 + 4], 6
+ pinsrb m6, [r3 + 4], 5
+ pinsrb m6, [r3 + 7], 4
+ pinsrb m6, [r3 + 7], 3
+ pinsrb m6, [r3 + 11], 2
+ pinsrb m6, [r3 + 11], 1
+ pinsrb m6, [r3 + 14], 0
+ pmaddubsw m3, m6, [r5 + 25 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 25 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 732 * 16], m3
+ pmaddubsw m3, m1, [r5 + 25 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 25 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 733 * 16], m3
+
+ ; mode 13 [row 15]
+ pmaddubsw m3, m6, [r5 + 16 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 16 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 734 * 16], m3
+ pmaddubsw m3, m1, [r5 + 16 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 16 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 735 * 16], m3
+
+ ; mode 13 [row 16]
+ pmaddubsw m3, m6, [r5 + 7 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 7 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 736 * 16], m3
+ pmaddubsw m3, m1, [r5 + 7 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 7 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 737 * 16], m3
+
+ ; mode 13 [row 17]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 14], 1
+ pinsrb m6, [r3 + 18], 0
+ pmaddubsw m3, m6, [r5 + 30 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrw m2, [r4 + 3], 0
+ pmaddubsw m5, m2, [r5 + 30 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 738 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 11], 0
+ pmaddubsw m3, m1, [r5 + 30 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 19], 0
+ pmaddubsw m5, m4, [r5 + 30 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 739 * 16], m3
+
+ ; mode 13 [row 18]
+ pmaddubsw m3, m6, [r5 + 21 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 21 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 740 * 16], m3
+ pmaddubsw m3, m1, [r5 + 21 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 21 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 741 * 16], m3
+
+ ; mode 13 [row 19]
+ pmaddubsw m3, m6, [r5 + 12 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 12 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 742 * 16], m3
+ pmaddubsw m3, m1, [r5 + 12 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 12 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 743 * 16], m3
+
+ ; mode 13 [row 20]
+ pmaddubsw m3, m6, [r5 + 3 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 3 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 744 * 16], m3
+ pmaddubsw m3, m1, [r5 + 3 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 3 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 745 * 16], m3
+
+ ; mode 14 [row 12]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 10], 1
+ pinsrb m7, [r3 + 12], 0
+ pmaddubsw m3, m7, [r5 + 23 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 23 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 792 * 16], m3
+ pmaddubsw m3, m1, [r5 + 23 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 23 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 793 * 16], m3
+
+ ; mode 14 [row 13]
+ pmaddubsw m3, m7, [r5 + 10 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 10 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 794 * 16], m3
+ pmaddubsw m3, m1, [r5 + 10 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 10 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 795 * 16], m3
+
+ ; mode 15 [row 9, 8-31]
+ pmaddubsw m5, m2, [r5 + 22 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movu [r0 + 850 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 22 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 22 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 851 * 16], m3
+
+ ; mode 15 [row 10, 8-31]
+ pmaddubsw m5, m2, [r5 + 5 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movu [r0 + 852 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 5 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 5 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 853 * 16], m3
+
+ ; mode 13 [row 21]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 18], 1
+ pinsrb m6, [r3 + 21], 0
+ pmaddubsw m3, m6, [r5 + 26 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrw m2, [r4 + 2], 0
+ pmaddubsw m5, m2, [r5 + 26 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 746 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 10], 0
+ pmaddubsw m3, m1, [r5 + 26 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 18], 0
+ pmaddubsw m5, m4, [r5 + 26 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 747 * 16], m3
+
+ ; mode 13 [row 22]
+ pmaddubsw m3, m6, [r5 + 17 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 17 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 748 * 16], m3
+ pmaddubsw m3, m1, [r5 + 17 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 17 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 749 * 16], m3
+
+ ; mode 13 [row 23]
+ pmaddubsw m3, m6, [r5 + 8 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 8 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 750 * 16], m3
+ pmaddubsw m3, m1, [r5 + 8 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 8 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 751 * 16], m3
+
+ ; mode 14 [row 14]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 12], 1
+ pinsrb m7, [r3 + 15], 0
+ pmaddubsw m3, m7, [r5 + 29 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 29 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 796 * 16], m3
+ pmaddubsw m3, m1, [r5 + 29 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 29 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 797 * 16], m3
+
+ ; mode 14 [row 15]
+ pmaddubsw m3, m7, [r5 + 16 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 16 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 798 * 16], m3
+ pmaddubsw m3, m1, [r5 + 16 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 16 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 799 * 16], m3
+
+ ; mode 14 [row 16]
+ pmaddubsw m3, m7, [r5 + 3 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 3 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 800 * 16], m3
+ pmaddubsw m3, m1, [r5 + 3 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 3 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 801 * 16], m3
+
+ ; mode 15 [row 11, 8-31]
+ pmaddubsw m5, m2, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 854 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 855 * 16], m3
+
+ ; mode 15 [row 12, 8-31]
+ pmaddubsw m5, m2, [r5 + 3 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 856 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 3 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 3 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 857 * 16], m3
+
+ ; mode 13 [row 24]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 21], 1
+ pinsrb m6, [r3 + 25], 0
+ pmaddubsw m3, m6, [r5 + 31 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrw m2, [r4 + 1], 0
+ pmaddubsw m5, m2, [r5 + 31 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 752 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 9], 0
+ pmaddubsw m3, m1, [r5 + 31 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 17], 0
+ pmaddubsw m5, m4, [r5 + 31 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 753 * 16], m3
+
+ ; mode 13 [row 25]
+ pmaddubsw m3, m6, [r5 + 22 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 22 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 754 * 16], m3
+ pmaddubsw m3, m1, [r5 + 22 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 22 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 755 * 16], m3
+
+ ; mode 13 [row 26]
+ pmaddubsw m3, m6, [r5 + 13 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 13 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 756 * 16], m3
+ pmaddubsw m3, m1, [r5 + 13 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 13 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 757 * 16], m3
+
+ ; mode 13 [row 27]
+ pmaddubsw m3, m6, [r5 + 4 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 4 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 758 * 16], m3
+ pmaddubsw m3, m1, [r5 + 4 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 4 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 759 * 16], m3
+
+ ; mode 14 [row 17]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 15], 1
+ pinsrb m7, [r3 + 17], 0
+ pmaddubsw m3, m7, [r5 + 22 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 22 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 802 * 16], m3
+ pmaddubsw m3, m1, [r5 + 22 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 22 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 803 * 16], m3
+
+ ; mode 14 [row 18]
+ pmaddubsw m3, m7, [r5 + 9 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 9 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 804 * 16], m3
+ pmaddubsw m3, m1, [r5 + 9 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 9 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 805 * 16], m3
+
+ ; mode 15 [row 13, 8-31]
+ pmaddubsw m5, m2, [r5 + 18 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 858 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 18 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 18 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 859 * 16], m3
+
+ ; mode 15 [row 14, 8-31]
+ pmaddubsw m5, m2, [r5 + 1 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 860 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 1 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 1 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 861 * 16], m3
+
+ ; mode 13 [row 28]
+ pslldq m6, 2
+ pinsrb m6, [r3 + 25], 1
+ pinsrb m6, [r3 + 28], 0
+ pmaddubsw m3, m6, [r5 + 27 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrw m2, [r4 + 0], 0
+ pmaddubsw m5, m2, [r5 + 27 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 760 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 8], 0
+ pmaddubsw m3, m1, [r5 + 27 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 16], 0
+ pmaddubsw m5, m4, [r5 + 27 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 761 * 16], m3
+
+ ; mode 13 [row 29]
+ pmaddubsw m3, m6, [r5 + 18 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 18 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 762 * 16], m3
+ pmaddubsw m3, m1, [r5 + 18 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 18 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 763 * 16], m3
+
+ ; mode 13 [row 30]
+ pmaddubsw m3, m6, [r5 + 9 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 9 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 764 * 16], m3
+ pmaddubsw m3, m1, [r5 + 9 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 9 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 765 * 16], m3
+
+ ; mode 14 [row 19]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 17], 1
+ pinsrb m7, [r3 + 20], 0
+ pmaddubsw m3, m7, [r5 + 28 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 28 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 806 * 16], m3
+ pmaddubsw m3, m1, [r5 + 28 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 28 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 807 * 16], m3
+
+ ; mode 14 [row 20]
+ pmaddubsw m3, m7, [r5 + 15 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 15 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 808 * 16], m3
+ pmaddubsw m3, m1, [r5 + 15 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 15 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 809 * 16], m3
+
+ ; mode 14 [row 21]
+ pmaddubsw m3, m7, [r5 + 2 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 2 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 810 * 16], m3
+ pmaddubsw m3, m1, [r5 + 2 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 2 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 811 * 16], m3
+
+ ; mode 15 [row 15, 8-31]
+ pmaddubsw m5, m2, [r5 + 16 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 862 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 16 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 16 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 863 * 16], m3
+
+ ; mode 14 [row 22]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 20], 1
+ pinsrb m7, [r3 + 22], 0
+ pmaddubsw m3, m7, [r5 + 21 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrb m2, [r4 + 0], 1
+ pinsrb m2, [r3 + 2], 0
+ pmaddubsw m5, m2, [r5 + 21 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 812 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 7], 0
+ pmaddubsw m3, m1, [r5 + 21 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 15], 0
+ pmaddubsw m5, m4, [r5 + 21 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 813 * 16], m3
+
+ ; mode 14 [row 23]
+ pmaddubsw m3, m7, [r5 + 8 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 8 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 814 * 16], m3
+ pmaddubsw m3, m1, [r5 + 8 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 8 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 815 * 16], m3
+
+ ; mode 15 [row 16, 8-31]
+ pmaddubsw m5, m2, [r5 + 31 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 864 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 31 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 31 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 865 * 16], m3
+
+ ; mode 15 [row 17, 8-31]
+ pmaddubsw m5, m2, [r5 + 14 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 866 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 14 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 14 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 867 * 16], m3
+
+ ; mode 14 [row 24]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 22], 1
+ pinsrb m7, [r3 + 25], 0
+ pmaddubsw m3, m7, [r5 + 27 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrb m2, [r3 + 2], 1
+ pinsrb m2, [r3 + 5], 0
+ pmaddubsw m5, m2, [r5 + 27 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 816 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 6], 0
+ pmaddubsw m3, m1, [r5 + 27 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 14], 0
+ pmaddubsw m5, m4, [r5 + 27 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 817 * 16], m3
+
+ ; mode 14 [row 25]
+ pmaddubsw m3, m7, [r5 + 14 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 14 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 818 * 16], m3
+ pmaddubsw m3, m1, [r5 + 14 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 14 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 819 * 16], m3
+
+ ; mode 14 [row 26]
+ pmaddubsw m3, m7, [r5 + 1 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 1 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 820 * 16], m3
+ pmaddubsw m3, m1, [r5 + 1 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 1 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 821 * 16], m3
+
+ ; mode 15 [row 18, 8-31]
+ pinsrb m2, [r3 + 4], 0
+ pmaddubsw m5, m2, [r5 + 29 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 868 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 29 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 29 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 869 * 16], m3
+
+ ; mode 15 [row 19, 8-31]
+ pmaddubsw m5, m2, [r5 + 12 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 870 * 16 + 8], m5
+ pmaddubsw m3, m1, [r5 + 12 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 12 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 871 * 16], m3
+
+ ; mode 15 [row 20 - 8 to 15]
+ pslldq m3, m2, 2
+ pinsrb m3, [r3 + 4], 1
+ pinsrb m3, [r3 + 6], 0
+ pmaddubsw m5, m3, [r5 + 27 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 872 * 16 + 8], m5
+
+ ; mode 15 [row 21 - 8 to 15]
+ pmaddubsw m5, m3, [r5 + 10 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 874 * 16 + 8], m5
+
+ ; mode 15 [row 22 - 8 to 15]
+ pslldq m3, 2
+ pinsrb m3, [r3 + 6], 1
+ pinsrb m3, [r3 + 8], 0
+ pmaddubsw m5, m3, [r5 + 25 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 876 * 16 + 8], m5
+
+ ; mode 15 [row 23 - 8 to 15]
+ pmaddubsw m5, m3, [r5 + 8 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 878 * 16 + 8], m5
+
+ ; mode 15 [row 24 - 8 to 15]
+ pslldq m3, 2
+ pinsrb m3, [r3 + 8], 1
+ pinsrb m3, [r3 + 9], 0
+ pmaddubsw m5, m3, [r5 + 23 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 880 * 16 + 8], m5
+
+ ; mode 15 [row 25 - 8 to 15]
+ pmaddubsw m5, m3, [r5 + 6 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 882 * 16 + 8], m5
+
+ ; mode 15 [row 26 - 8 to 15]
+ pslldq m3, 2
+ pinsrb m3, [r3 + 9], 1
+ pinsrb m3, [r3 + 11], 0
+ pmaddubsw m5, m3, [r5 + 21 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 884 * 16 + 8], m5
+
+ ; mode 15 [row 27 - 8 to 15]
+ pmaddubsw m5, m3, [r5 + 4 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 886 * 16 + 8], m5
+
+ ; mode 15 [row 28 - 8 to 15]
+ pslldq m3, 2
+ pinsrb m3, [r3 + 11], 1
+ pinsrb m3, [r3 + 13], 0
+ pmaddubsw m5, m3, [r5 + 19 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 888 * 16 + 8], m5
+
+ ; mode 15 [row 29 - 8 to 15]
+ pmaddubsw m5, m3, [r5 + 2 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 890 * 16 + 8], m5
+
+ ; mode 15 [row 30 - 8 to 15]
+ pslldq m3, 2
+ pinsrb m3, [r3 + 13], 1
+ pinsrb m3, [r3 + 15], 0
+ pmaddubsw m5, m3, [r5 + 17 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m5, m5
+ movh [r0 + 892 * 16 + 8], m5
+
+ ; mode 15 [row 31, 8 to 15]
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 894 * 16 + 8], m5
+
+ ; mode 14 [row 27]
+ pinsrb m2, [r3 + 5], 0
+ pslldq m7, 2
+ pinsrb m7, [r3 + 25], 1
+ pinsrb m7, [r3 + 27], 0
+ pmaddubsw m3, m7, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrb m2, [r3 + 5], 1
+ pinsrb m2, [r3 + 7], 0
+ pmaddubsw m5, m2, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 822 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 5], 0
+ pmaddubsw m3, m1, [r5 + 20 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 13], 0
+ pmaddubsw m5, m4, [r5 + 20 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 823 * 16], m3
+
+ ; mode 15 [row 20 - 16 to 31]
+ pmaddubsw m3, m1, [r5 + 27 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 27 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 873 * 16], m3
+
+ ; mode 15 [row 21 - 16 to 31]
+ pmaddubsw m3, m1, [r5 + 10 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 10 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 875 * 16], m3
+
+ ; mode 14 [row 28]
+ pmaddubsw m3, m7, [r5 + 7 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 7 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 824 * 16], m3
+ pmaddubsw m3, m1, [r5 + 7 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 7 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 825 * 16], m3
+
+ ; mode 14 [row 29]
+ pslldq m7, 2
+ pinsrb m7, [r3 + 27], 1
+ pinsrb m7, [r3 + 30], 0
+ pmaddubsw m3, m7, [r5 + 26 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m2, 2
+ pinsrb m2, [r3 + 7], 1
+ pinsrb m2, [r3 + 10], 0
+ pmaddubsw m5, m2, [r5 + 26 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 826 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 4], 0
+ pmaddubsw m3, m1, [r5 + 26 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 12], 0
+ pmaddubsw m5, m4, [r5 + 26 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 827 * 16], m3
+
+ ; mode 14 [row 30]
+ pmaddubsw m3, m7, [r5 + 13 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m2, [r5 + 13 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 828 * 16], m3
+ pmaddubsw m3, m1, [r5 + 13 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 13 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 829 * 16], m3
+
+ ; mode 15 [row 22]
+ pmaddubsw m3, m1, [r5 + 25 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 25 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 877 * 16], m3
+
+ ; mode 15 [row 23]
+ pmaddubsw m3, m1, [r5 + 8 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 8 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 879 * 16], m3
+
+ ; mode 14 [row 31]
+ pshufb m3, m7, [tab_S2]
+ movh [r0 + 830 * 16], m3
+ pshufb m3, m2, [tab_S2]
+ movh [r0 + 830 * 16 + 8], m3
+ pshufb m3, m1, [tab_S2]
+ movh [r0 + 831 * 16], m3
+ pshufb m3, m4, [tab_S2]
+ movh [r0 + 831 * 16 + 8], m3
+
+ ; mode 13 [row 31]
+ pshufb m0, m6, [tab_S2]
+ movh [r0 + 766 * 16], m0
+ movh m0, [r4]
+ movh [r0 + 766 * 16 + 8], m0
+ movu m0, [r4 + 8]
+ movu [r0 + 767 * 16], m0
+
+ ; mode 15 [row 24]
+ pslldq m1, 2
+ pinsrw m1, [r4 + 3], 0
+ pmaddubsw m3, m1, [r5 + 23 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 11], 0
+ pmaddubsw m5, m4, [r5 + 23 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 881 * 16], m3
+
+ ; mode 15 [row 25]
+ pmaddubsw m3, m1, [r5 + 6 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 6 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 883 * 16], m3
+
+ ; mode 15 [row 26]
+ pslldq m1, 2
+ pinsrw m1, [r4 + 2], 0
+ pmaddubsw m3, m1, [r5 + 21 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 10], 0
+ pmaddubsw m5, m4, [r5 + 21 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 885 * 16], m3
+
+ ; mode 15 [row 27]
+ pmaddubsw m3, m1, [r5 + 4 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 4 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 887 * 16], m3
+
+ ; mode 15 [row 28]
+ pslldq m1, 2
+ pinsrw m1, [r4 + 1], 0
+ pmaddubsw m3, m1, [r5 + 19 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 9], 0
+ pmaddubsw m5, m4, [r5 + 19 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 889 * 16], m3
+
+ ; mode 15 [row 29]
+ pmaddubsw m3, m1, [r5 + 2 * 16]
+ pmulhrsw m3, [pw_1024]
+ pmaddubsw m5, m4, [r5 + 2 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 891 * 16], m3
+
+ ; mode 15 [row 30]
+ pslldq m1, 2
+ pinsrw m1, [r4 + 0], 0
+ pmaddubsw m3, m1, [r5 + 17 * 16]
+ pmulhrsw m3, [pw_1024]
+ pslldq m4, 2
+ pinsrw m4, [r4 + 8], 0
+ pmaddubsw m5, m4, [r5 + 17 * 16]
+ pmulhrsw m5, [pw_1024]
+ packuswb m3, m5
+ movu [r0 + 893 * 16], m3
+
+ ; mode 15 [row 31]
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 895 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 895 * 16 + 8], m5
+
+ ; mode 16 [row 0]
+ movu m6, [r5 + 11 * 16]
+ movu m7, [pw_1024]
+ movh m0, [r4 ]
+ movh m1, [r4 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movh m2, [r4 + 8]
+ movh m3, [r4 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 896 * 16], m1
+
+ movh m1, [r4 + 16]
+ movh m3, [r4 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movh m4, [r4 + 24]
+ movh m5, [r4 + 25]
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 897 * 16], m3
+
+ ; mode16 [row 1]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4], 1
+ pinsrb m0, [r3 + 2], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 898 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 15], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 23], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 899 * 16], m3
+
+ ; mode16 [row 2]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 900 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 901 * 16], m3
+
+ ; mode16 [row 3]
+ movu m6, [r5 + 12 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 2], 1
+ pinsrb m0, [r3 + 3], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 902 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 14], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 22], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 903 * 16], m3
+
+ ; mode16 [row 4]
+ movu m6, [r5 + 23 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 3], 1
+ pinsrb m0, [r3 + 5], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 904 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 13], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 21], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 905 * 16], m3
+
+ ; mode16 [row 5]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 906 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 907 * 16], m3
+
+ ; mode16 [row 6]
+ movu m6, [r5 + 13 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 5], 1
+ pinsrb m0, [r3 + 6], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 5], 1
+ pinsrb m2, [r4 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 908 * 16], m3
+ pslldq m1, 2
+ pinsrw m1, [r4 + 12], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 20], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 909 * 16], m3
+
+ ; mode16 [row 7]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 6], 1
+ pinsrb m0, [r3 + 8], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 910 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 11], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 19], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 911 * 16], m3
+
+ ; mode16 [row 8]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 912 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 913 * 16], m3
+
+ ; mode16 [row 9]
+ movu m6, [r5 + 14 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 8], 1
+ pinsrb m0, [r3 + 9], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 914 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 10], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 18], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 915 * 16], m3
+
+ ; mode16 [row 10]
+ movu m6, [r5 + 25 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 9], 1
+ pinsrb m0, [r3 + 11], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 916 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 9], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrb m4, [r4 + 18], 1
+ pinsrb m4, [r4 + 17], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 917 * 16], m3
+
+ ; mode16 [row 11]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 918 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 919 * 16], m3
+
+ ; mode16 [row 12]
+ movu m6, [r5 + 15 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 11], 1
+ pinsrb m0, [r3 + 12], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 920 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 8], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 16], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 921 * 16], m3
+
+ ; mode16 [row 13]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 12], 1
+ pinsrb m0, [r3 + 14], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 0], 1
+ pinsrb m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 922 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 7], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 15], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 923 * 16], m3
+
+ ; mode16 [row 14]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 924 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 925 * 16], m3
+
+ ; mode16 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 14], 1
+ pinsrb m0, [r3 + 15], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 2], 1
+ pinsrb m2, [r3 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 926 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 6], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 14], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 927 * 16], m3
+
+ ; mode16 [row 16]
+ movu m6, [r5 + 27 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 15], 1
+ pinsrb m0, [r3 + 17], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 3], 1
+ pinsrb m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 928 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 5], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 13], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 929 * 16], m3
+
+ ; mode16 [row 17]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 930 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 931 * 16], m3
+
+ ; mode16 [row 18]
+ movu m6, [r5 + 17 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 17], 1
+ pinsrb m0, [r3 + 18], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 5], 1
+ pinsrb m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 932 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 4], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 12], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 933 * 16], m3
+
+ ; mode16 [row 19]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 18], 1
+ pinsrb m0, [r3 + 20], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 6], 1
+ pinsrb m2, [r3 + 8], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 934 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 3], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 11], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 935 * 16], m3
+
+ ; mode16 [row 20]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 936 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 937 * 16], m3
+
+ ; mode16 [row 21]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 20], 1
+ pinsrb m0, [r3 + 21], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 8], 1
+ pinsrb m2, [r3 + 9], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 938 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 2], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 10], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 939 * 16], m3
+
+ ; mode16 [row 22]
+ movu m6, [r5 + 29 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 21], 1
+ pinsrb m0, [r3 + 23], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 9], 1
+ pinsrb m2, [r3 + 11], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 940 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 1], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 9], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 941 * 16], m3
+
+ ; mode16 [row 23]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 942 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 943 * 16], m3
+
+ ; mode16 [row 24]
+ movu m6, [r5 + 19 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 23], 1
+ pinsrb m0, [r3 + 24], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 11], 1
+ pinsrb m2, [r3 + 12], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 944 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 0], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 8], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 945 * 16], m3
+
+ ; mode16 [row 25]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 24], 1
+ pinsrb m0, [r3 + 26], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 12], 1
+ pinsrb m2, [r3 + 14], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 946 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r4 + 0], 1
+ pinsrb m1, [r3 + 2], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 7], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 947 * 16], m3
+
+ ; mode16 [row 26]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 948 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 949 * 16], m3
+
+ ; mode16 [row 27]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 26], 1
+ pinsrb m0, [r3 + 27], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 14], 1
+ pinsrb m2, [r3 + 15], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 950 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 2], 1
+ pinsrb m1, [r3 + 3], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 6], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 951 * 16], m3
+
+ ; mode16 [row 28]
+ movu m6, [r5 + 31 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 27], 1
+ pinsrb m0, [r3 + 29], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 15], 1
+ pinsrb m2, [r3 + 17], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 952 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 3], 1
+ pinsrb m1, [r3 + 5], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 5], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 953 * 16], m3
+
+ ; mode16 [row 29]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 954 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 955 * 16], m3
+
+ ; mode16 [row 30]
+ movu m6, [r5 + 21 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 29], 1
+ pinsrb m0, [r3 + 30], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 17], 1
+ pinsrb m2, [r3 + 18], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 956 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 5], 1
+ pinsrb m1, [r3 + 6], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 4], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 957 * 16], m3
+
+ ; mode16 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 958 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 958 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 959 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 959 * 16 + 8], m5
+
+ ; mode 17 [row 0]
+ movu m6, [r5 + 6 * 16]
+ movu m7, [pw_1024]
+ movh m0, [r4 ]
+ movh m1, [r4 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movh m2, [r4 + 8]
+ movh m3, [r4 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 960 * 16], m1
+
+ movh m1, [r4 + 16]
+ movh m3, [r4 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movh m4, [r4 + 24]
+ movh m5, [r4 + 25]
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 961 * 16], m3
+
+ ; mode17 [row 1]
+ movu m6, [r5 + 12 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 0], 1
+ pinsrb m0, [r3 + 1], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 962 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 15], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 23], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 963 * 16], m3
+
+ ; mode17 [row 2]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 1], 1
+ pinsrb m0, [r3 + 2], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 964 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 14], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 22], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 965 * 16], m3
+
+ ; mode17 [row 3]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 2], 1
+ pinsrb m0, [r3 + 4], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 966 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 13], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 21], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 967 * 16], m3
+
+ ; mode17 [row 4]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 4], 1
+ pinsrb m0, [r3 + 5], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 968 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 12], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 20], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 969 * 16], m3
+
+ ; mode17 [row 5]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 970 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 971 * 16], m3
+
+ ; mode17 [row 6]
+ movu m6, [r5 + 10 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 5], 1
+ pinsrb m0, [r3 + 6], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 972 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 11], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 19], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 973 * 16], m3
+
+ ; mode17 [row 7]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 6], 1
+ pinsrb m0, [r3 + 7], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 974 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 10], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 18], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 975 * 16], m3
+
+ ; mode17 [row 8]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 7], 1
+ pinsrb m0, [r3 + 9], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 976 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 9], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 17], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 977 * 16], m3
+
+ ; mode17 [row 9]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 9], 1
+ pinsrb m0, [r3 + 10], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrw m2, [r4 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 978 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 8], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 16], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 979 * 16], m3
+
+ ; mode17 [row 10]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 980 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 981 * 16], m3
+
+ ; mode17 [row 11]
+ movu m6, [r5 + 8 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 10], 1
+ pinsrb m0, [r3 + 11], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 0], 1
+ pinsrb m2, [r3 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 982 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 7], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 15], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 983 * 16], m3
+
+ ; mode17 [row 12]
+ movu m6, [r5 + 14 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 11], 1
+ pinsrb m0, [r3 + 12], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 1], 1
+ pinsrb m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 984 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 6], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 14], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 985 * 16], m3
+
+ ; mode17 [row 13]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 12], 1
+ pinsrb m0, [r3 + 14], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 2], 1
+ pinsrb m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 986 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 5], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 13], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 987 * 16], m3
+
+ ; mode17 [row 14]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 14], 1
+ pinsrb m0, [r3 + 15], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 4], 1
+ pinsrb m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 988 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 4], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 12], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 989 * 16], m3
+
+ ; mode17 [row 15]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 990 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 990 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 991 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 991 * 16 + 8], m5
+
+ ; mode17 [row 16]
+ movu m6, [r5 + 6 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 15], 1
+ pinsrb m0, [r3 + 16], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 5], 1
+ pinsrb m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 992 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 3], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 11], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 993 * 16], m3
+
+ ; mode17 [row 17]
+ movu m6, [r5 + 12 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 16], 1
+ pinsrb m0, [r3 + 17], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 6], 1
+ pinsrb m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 994 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 2], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 10], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 995 * 16], m3
+
+ ; mode17 [row 18]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 17], 1
+ pinsrb m0, [r3 + 18], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 7], 1
+ pinsrb m2, [r3 + 9], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 996 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 1], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 9], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 997 * 16], m3
+
+ ; mode17 [row 19]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 18], 1
+ pinsrb m0, [r3 + 20], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 9], 1
+ pinsrb m2, [r3 + 10], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 998 * 16], m3
+
+ pslldq m1, 2
+ pinsrw m1, [r4 + 0], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 8], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 999 * 16], m3
+
+ ; mode17 [row 20]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 20], 1
+ pinsrb m0, [r3 + 21], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 10], 1
+ pinsrb m2, [r3 + 11], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1000 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r4 + 0], 1
+ pinsrb m1, [r3 + 1], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ ;pinsrb m4, [r4 + 8], 1
+ ;pinsrb m4, [r4 + 7], 0
+ pinsrw m4, [r4 + 7], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1001 * 16], m3
+
+ ; mode17 [row 21]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1002 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1003 * 16], m3
+
+ ; mode17 [row 22]
+ movu m6, [r5 + 10 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 21], 1
+ pinsrb m0, [r3 + 22], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 11], 1
+ pinsrb m2, [r3 + 12], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1004 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 1], 1
+ pinsrb m1, [r3 + 2], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 6], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1005 * 16], m3
+
+ ; mode17 [row 23]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 22], 1
+ pinsrb m0, [r3 + 23], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 12], 1
+ pinsrb m2, [r3 + 14], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1006 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 2], 1
+ pinsrb m1, [r3 + 4], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 5], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1007 * 16], m3
+
+ ; mode17 [row 24]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 23], 1
+ pinsrb m0, [r3 + 25], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 14], 1
+ pinsrb m2, [r3 + 15], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1008 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 4], 1
+ pinsrb m1, [r3 + 5], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 4], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1009 * 16], m3
+
+ ; mode17 [row 25]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 25], 1
+ pinsrb m0, [r3 + 26], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 15], 1
+ pinsrb m2, [r3 + 16], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1010 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 5], 1
+ pinsrb m1, [r3 + 6], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 3], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1011 * 16], m3
+
+ ; mode17 [row 26]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1012 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1013 * 16], m3
+
+ ; mode17 [row 27]
+ movu m6, [r5 + 8 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 26], 1
+ pinsrb m0, [r3 + 27], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 16], 1
+ pinsrb m2, [r3 + 17], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1014 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 6], 1
+ pinsrb m1, [r3 + 7], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 2], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1015 * 16], m3
+
+ ; mode17 [row 28]
+ movu m6, [r5 + 14 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 27], 1
+ pinsrb m0, [r3 + 28], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 17], 1
+ pinsrb m2, [r3 + 18], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1016 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 7], 1
+ pinsrb m1, [r3 + 9], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 1], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1017 * 16], m3
+
+ ; mode17 [row 29]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 28], 1
+ pinsrb m0, [r3 + 30], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 18], 1
+ pinsrb m2, [r3 + 20], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1018 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 9], 1
+ pinsrb m1, [r3 + 10], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrw m4, [r4 + 0], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1019 * 16], m3
+
+ ; mode17 [row 30]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r3 + 30], 1
+ pinsrb m0, [r3 + 31], 0
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 20], 1
+ pinsrb m2, [r3 + 21], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1020 * 16], m3
+
+ pslldq m1, 2
+ pinsrb m1, [r3 + 10], 1
+ pinsrb m1, [r3 + 11], 0
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pslldq m4, 2
+ pinsrb m4, [r4 + 0], 1
+ pinsrb m4, [r3 + 1], 0
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1021 * 16], m3
+
+ ; mode17 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1022 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1022 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1023 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1023 * 16 + 8], m5
+
+ ;mode 18[row 0]
+ movu m0, [r3]
+ movu [r0 + 1024 * 16], m0
+ movu m1, [r3 + 16]
+ movu [r0 + 1025 * 16], m1
+
+ ;mode 18[row 1]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 1], 0
+ movu [r0 + 1026 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 15], 0
+ movu [r0 + 1027 * 16], m1
+
+ ;mode 18[row 2]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 2], 0
+ movu [r0 + 1028 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 14], 0
+ movu [r0 + 1029 * 16], m1
+
+ ;mode 18[row 3]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 3], 0
+ movu [r0 + 1030 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 13], 0
+ movu [r0 + 1031 * 16], m1
+
+ ;mode 18[row 4]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 4], 0
+ movu [r0 + 1032 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 12], 0
+ movu [r0 + 1033 * 16], m1
+
+ ;mode 18[row 5]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 5], 0
+ movu [r0 + 1034 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 11], 0
+ movu [r0 + 1035 * 16], m1
+
+ ;mode 18[row 6]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 6], 0
+ movu [r0 + 1036 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 10], 0
+ movu [r0 + 1037 * 16], m1
+
+ ;mode 18[row 7]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 7], 0
+ movu [r0 + 1038 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 9], 0
+ movu [r0 + 1039 * 16], m1
+
+ ;mode 18[row 8]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 8], 0
+ movu [r0 + 1040 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 8], 0
+ movu [r0 + 1041 * 16], m1
+
+ ;mode 18[row 9]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 9], 0
+ movu [r0 + 1042 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 7], 0
+ movu [r0 + 1043 * 16], m1
+
+ ;mode 18[row 10]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 10], 0
+ movu [r0 + 1044 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 6], 0
+ movu [r0 + 1045 * 16], m1
+
+ ;mode 18[row 11]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 11], 0
+ movu [r0 + 1046 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 5], 0
+ movu [r0 + 1047 * 16], m1
+
+ ;mode 18[row 12]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 12], 0
+ movu [r0 + 1048 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 4], 0
+ movu [r0 + 1049 * 16], m1
+
+ ;mode 18[row 13]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 13], 0
+ movu [r0 + 1050 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 3], 0
+ movu [r0 + 1051 * 16], m1
+
+ ;mode 18[row 14]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 14], 0
+ movu [r0 + 1052 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 2], 0
+ movu [r0 + 1053 * 16], m1
+
+ ;mode 18[row 15]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 15], 0
+ movu [r0 + 1054 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 1], 0
+ movu [r0 + 1055 * 16], m1
+
+ ;mode 18[row 16]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 16], 0
+ movu [r0 + 1056 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r3 + 0], 0
+ movu [r0 + 1057 * 16], m1
+
+ ;mode 18[row 17]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 17], 0
+ movu [r0 + 1058 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 1], 0
+ movu [r0 + 1059 * 16], m1
+
+ ;mode 18[row 18]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 18], 0
+ movu [r0 + 1060 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 2], 0
+ movu [r0 + 1061 * 16], m1
+
+ ;mode 18[row 19]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 19], 0
+ movu [r0 + 1062 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 3], 0
+ movu [r0 + 1063 * 16], m1
+
+ ;mode 18[row 20]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 20], 0
+ movu [r0 + 1064 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 4], 0
+ movu [r0 + 1065 * 16], m1
+
+ ;mode 18[row 21]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 21], 0
+ movu [r0 + 1066 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 5], 0
+ movu [r0 + 1067 * 16], m1
+
+ ;mode 18[row 22]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 22], 0
+ movu [r0 + 1068 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 6], 0
+ movu [r0 + 1069 * 16], m1
+
+ ;mode 18[row 23]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 23], 0
+ movu [r0 + 1070 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 7], 0
+ movu [r0 + 1071 * 16], m1
+
+ ;mode 18[row 24]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 24], 0
+ movu [r0 + 1072 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 8], 0
+ movu [r0 + 1073 * 16], m1
+
+ ;mode 18[row 25]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 25], 0
+ movu [r0 + 1074 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 9], 0
+ movu [r0 + 1075 * 16], m1
+
+ ;mode 18[row 26]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 26], 0
+ movu [r0 + 1076 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 10], 0
+ movu [r0 + 1077 * 16], m1
+
+ ;mode 18[row 27]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 27], 0
+ movu [r0 + 1078 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 11], 0
+ movu [r0 + 1079 * 16], m1
+
+ ;mode 18[row 28]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 28], 0
+ movu [r0 + 1080 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 12], 0
+ movu [r0 + 1081 * 16], m1
+
+ ;mode 18[row 29]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 29], 0
+ movu [r0 + 1082 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 13], 0
+ movu [r0 + 1083 * 16], m1
+
+ ;mode 18[row 30]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 30], 0
+ movu [r0 + 1084 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 14], 0
+ movu [r0 + 1085 * 16], m1
+
+ ;mode 18[row 31]
+ pslldq m0, 1
+ pinsrb m0, [r4 + 31], 0
+ movu [r0 + 1086 * 16], m0
+ pslldq m1, 1
+ pinsrb m1, [r4 + 15], 0
+ movu [r0 + 1087 * 16], m1
+
+ ; mode 19 [row 0]
+ movu m6, [r5 + 6 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 1088 * 16], m1
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1089 * 16], m4
+
+ ; mode 19 [row 1]
+ movu m6, [r5 + 12 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 1], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1090 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1091 * 16], m4
+
+ ; mode 19 [row 2]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 1], 1
+ pinsrb m0, [r4 + 2], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1092 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 14], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 22], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1093 * 16], m4
+
+ ; mode 19 [row 3]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 2], 1
+ pinsrb m0, [r4 + 4], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1094 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 13], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 21], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1095 * 16], m4
+
+ ; mode 19 [row 4]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 4], 1
+ pinsrb m0, [r4 + 5], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1096 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 12], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 20], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1097 * 16], m4
+
+ ; mode 19 [row 5]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1098 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1099 * 16], m4
+
+ ; mode 19 [row 6]
+ movu m6, [r5 + 10 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 5], 1
+ pinsrb m0, [r4 + 6], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1100 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 11], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 19], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1101 * 16], m4
+
+ ; mode 19 [row 7]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 6], 1
+ pinsrb m0, [r4 + 7], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1102 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 10], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 18], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1103 * 16], m4
+
+ ; mode 19 [row 8]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 7], 1
+ pinsrb m0, [r4 + 9], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1104 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 9], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 17], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1105 * 16], m4
+
+ ; mode 19 [row 9]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 9], 1
+ pinsrb m0, [r4 + 10], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1106 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 8], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 16], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1107 * 16], m4
+
+ ; mode 19 [row 10]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1108 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1109 * 16], m4
+
+ ; mode 19 [row 11]
+ movu m6, [r5 + 8 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 10], 1
+ pinsrb m0, [r4 + 11], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 0], 1
+ pinsrb m2, [r4 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1110 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 7], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 15], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1111 * 16], m4
+
+ ; mode 19 [row 12]
+ movu m6, [r5 + 14 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 11], 1
+ pinsrb m0, [r4 + 12], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 1], 1
+ pinsrb m2, [r4 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1112 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 6], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 14], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1113 * 16], m4
+
+ ; mode 19 [row 13]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 12], 1
+ pinsrb m0, [r4 + 14], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 2], 1
+ pinsrb m2, [r4 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1114 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 5], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 13], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1115 * 16], m4
+
+ ; mode 19 [row 14]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 14], 1
+ pinsrb m0, [r4 + 15], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 4], 1
+ pinsrb m2, [r4 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1116 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 4], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 12], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1117 * 16], m4
+
+ ; mode19 [row 15]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1118 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1118 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1119 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1119 * 16 + 8], m5
+
+ ; mode 19 [row 16]
+ movu m6, [r5 + 6 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 15], 1
+ pinsrb m0, [r4 + 16], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 5], 1
+ pinsrb m2, [r4 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1120 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 3], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 11], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1121 * 16], m4
+
+ ; mode 19 [row 17]
+ movu m6, [r5 + 12 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 16], 1
+ pinsrb m0, [r4 + 17], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 6], 1
+ pinsrb m2, [r4 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1122 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 2], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 10], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1123 * 16], m4
+
+ ; mode 19 [row 18]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 17], 1
+ pinsrb m0, [r4 + 18], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 7], 1
+ pinsrb m2, [r4 + 9], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1124 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 1], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 9], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1125 * 16], m4
+
+ ; mode 19 [row 19]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 18], 1
+ pinsrb m0, [r4 + 20], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 9], 1
+ pinsrb m2, [r4 + 10], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1126 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 0], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 8], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1127 * 16], m4
+
+ ; mode 19 [row 20]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 20], 1
+ pinsrb m0, [r4 + 21], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 10], 1
+ pinsrb m2, [r4 + 11], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1128 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 0], 1
+ pinsrb m1, [r4 + 1], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r3 + 8], 1
+ pinsrb m3, [r3 + 7], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1129 * 16], m4
+
+ ; mode 19 [row 21]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1130 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1131 * 16], m4
+
+ ; mode 19 [row 22]
+ movu m6, [r5 + 10 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 21], 1
+ pinsrb m0, [r4 + 22], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 11], 1
+ pinsrb m2, [r4 + 12], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1132 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 1], 1
+ pinsrb m1, [r4 + 2], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 6], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1133 * 16], m4
+
+ ; mode 19 [row 23]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 22], 1
+ pinsrb m0, [r4 + 23], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 12], 1
+ pinsrb m2, [r4 + 14], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1134 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 2], 1
+ pinsrb m1, [r4 + 4], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 5], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1135 * 16], m4
+
+ ; mode 19 [row 24]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 23], 1
+ pinsrb m0, [r4 + 25], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 14], 1
+ pinsrb m2, [r4 + 15], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1136 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 4], 1
+ pinsrb m1, [r4 + 5], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 4], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1137 * 16], m4
+
+ ; mode 19 [row 25]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 25], 1
+ pinsrb m0, [r4 + 26], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 15], 1
+ pinsrb m2, [r4 + 16], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1138 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 5], 1
+ pinsrb m1, [r4 + 6], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 3], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1139 * 16], m4
+
+ ; mode 19 [row 26]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1140 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1141 * 16], m4
+
+ ; mode 19 [row 27]
+ movu m6, [r5 + 8 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 26], 1
+ pinsrb m0, [r4 + 27], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 16], 1
+ pinsrb m2, [r4 + 17], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1142 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 6], 1
+ pinsrb m1, [r4 + 7], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 2], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1143 * 16], m4
+
+ ; mode 19 [row 28]
+ movu m6, [r5 + 14 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 27], 1
+ pinsrb m0, [r4 + 28], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 17], 1
+ pinsrb m2, [r4 + 18], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1144 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 7], 1
+ pinsrb m1, [r4 + 9], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 1], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1145 * 16], m4
+
+ ; mode 19 [row 29]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 28], 1
+ pinsrb m0, [r4 + 30], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 18], 1
+ pinsrb m2, [r4 + 20], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1146 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 9], 1
+ pinsrb m1, [r4 + 10], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 0], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1147 * 16], m4
+
+ ; mode 19 [row 30]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 30], 1
+ pinsrb m0, [r4 + 31], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 20], 1
+ pinsrb m2, [r4 + 21], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1148 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 10], 1
+ pinsrb m1, [r4 + 11], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r4 + 0], 1
+ pinsrb m3, [r4 + 1], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1149 * 16], m4
+
+ ; mode19 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1150 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1150 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1151 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1151 * 16 + 8], m5
+
+ ; mode 20 [row 0]
+ movu m6, [r5 + 11 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 1152 * 16], m1
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1153 * 16], m4
+
+ ; mode 20 [row 1]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 2], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1154 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1155 * 16], m4
+
+ ; mode 20 [row 2]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1156 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1157 * 16], m4
+
+ ; mode 20 [row 3]
+ movu m6, [r5 + 12 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 2], 1
+ pinsrb m0, [r4 + 3], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1158 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 14], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 22], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1159 * 16], m4
+
+ ; mode 20 [row 4]
+ movu m6, [r5 + 23 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 3], 1
+ pinsrb m0, [r4 + 5], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1160 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 13], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 21], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1161 * 16], m4
+
+ ; mode 20 [row 5]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1162 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1163 * 16], m4
+
+ ; mode 20 [row 6]
+ movu m6, [r5 + 13 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 5], 1
+ pinsrb m0, [r4 + 6], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1164 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 12], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 20], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1165 * 16], m4
+
+ ; mode 20 [row 7]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 6], 1
+ pinsrb m0, [r4 + 8], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1166 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 11], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 19], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1167 * 16], m4
+
+ ; mode 20 [row 8]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1168 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1169 * 16], m4
+
+ ; mode 20 [row 9]
+ movu m6, [r5 + 14 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 8], 1
+ pinsrb m0, [r4 + 9], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 3], 1
+ pinsrb m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1170 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 10], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 18], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1171 * 16], m4
+
+ ; mode 20 [row 10]
+ movu m6, [r5 + 25 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 9], 1
+ pinsrb m0, [r4 + 11], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1172 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 9], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 17], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1173 * 16], m4
+
+ ; mode 20 [row 11]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1174 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1175 * 16], m4
+
+ ; mode 20 [row 12]
+ movu m6, [r5 + 15 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 11], 1
+ pinsrb m0, [r4 + 12], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r3 + 1], 1
+ pinsrb m2, [r3 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1176 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 8], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 16], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1177 * 16], m4
+
+ ; mode 20 [row 13]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 12], 1
+ pinsrb m0, [r4 + 14], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 0], 1
+ pinsrb m2, [r4 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1178 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 7], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 15], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1179 * 16], m4
+
+ ; mode 20 [row 14]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1180 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1181 * 16], m4
+
+ ; mode 20 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 14], 1
+ pinsrb m0, [r4 + 15], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 2], 1
+ pinsrb m2, [r4 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1182 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 6], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 14], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1183 * 16], m4
+
+ ; mode 20 [row 16]
+ movu m6, [r5 + 27 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 15], 1
+ pinsrb m0, [r4 + 17], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 3], 1
+ pinsrb m2, [r4 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1184 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 5], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 13], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1185 * 16], m4
+
+ ; mode 20 [row 17]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1186 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1187 * 16], m4
+
+ ; mode 20 [row 18]
+ movu m6, [r5 + 17 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 17], 1
+ pinsrb m0, [r4 + 18], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 5], 1
+ pinsrb m2, [r4 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1188 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 4], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 12], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1189 * 16], m4
+
+ ; mode 20 [row 19]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 18], 1
+ pinsrb m0, [r4 + 20], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 6], 1
+ pinsrb m2, [r4 + 8], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1190 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 3], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 11], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1191 * 16], m4
+
+ ; mode 20 [row 20]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1192 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1193 * 16], m4
+
+ ; mode 20 [row 21]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 20], 1
+ pinsrb m0, [r4 + 21], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 8], 1
+ pinsrb m2, [r4 + 9], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1194 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 2], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 10], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1195 * 16], m4
+
+ ; mode 20 [row 22]
+ movu m6, [r5 + 29 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 21], 1
+ pinsrb m0, [r4 + 23], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 9], 1
+ pinsrb m2, [r4 + 11], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1196 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 1], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 9], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1197 * 16], m4
+
+ ; mode 20 [row 23]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1198 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1199 * 16], m4
+
+ ; mode 20 [row 24]
+ movu m6, [r5 + 19 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 23], 1
+ pinsrb m0, [r4 + 24], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 11], 1
+ pinsrb m2, [r4 + 12], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1200 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 0], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 8], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1201 * 16], m4
+
+ ; mode 20 [row 25]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 24], 1
+ pinsrb m0, [r4 + 26], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 12], 1
+ pinsrb m2, [r4 + 14], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1202 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 0], 1
+ pinsrb m1, [r4 + 2], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 7], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1203 * 16], m4
+
+ ; mode 20 [row 26]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1204 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1205 * 16], m4
+
+ ; mode 20 [row 27]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 26], 1
+ pinsrb m0, [r4 + 27], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 14], 1
+ pinsrb m2, [r4 + 15], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1206 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 2], 1
+ pinsrb m1, [r4 + 3], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 6], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1207 * 16], m4
+
+ ; mode 20 [row 28]
+ movu m6, [r5 + 31 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 27], 1
+ pinsrb m0, [r4 + 29], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 15], 1
+ pinsrb m2, [r4 + 17], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1208 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 3], 1
+ pinsrb m1, [r4 + 5], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 5], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1209 * 16], m4
+
+ ; mode 20 [row 29]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1210 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1211 * 16], m4
+
+ ; mode 20 [row 30]
+ movu m6, [r5 + 21 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 29], 1
+ pinsrb m0, [r4 + 30], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 17], 1
+ pinsrb m2, [r4 + 18], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1212 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r4 + 5], 1
+ pinsrb m1, [r4 + 6], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 4], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1213 * 16], m4
+
+ ; mode 20 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1214 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1214 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1215 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1215 * 16 + 8], m5
+
+ ; mode 21 [row 0]
+ movu m6, [r5 + 15 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 1216 * 16], m1
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1217 * 16], m4
+
+ ; mode 21 [row 1]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 2], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1218 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1219 * 16], m4
+
+ ; mode 21 [row 2]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1220 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1221 * 16], m4
+
+ ; mode 21 [row 3]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 2], 1
+ pinsrb m0, [r4 + 4], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1222 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 14], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 22], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1223 * 16], m4
+
+ ; mode 21 [row 4]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1224 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1225 * 16], m4
+
+ ; mode 21 [row 5]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 4], 1
+ pinsrb m0, [r4 + 6], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1226 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 13], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 21], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1227 * 16], m4
+
+ ; mode 21 [row 6]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1228 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1229 * 16], m4
+
+ ; mode 21 [row 7]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 6], 1
+ pinsrb m0, [r4 + 8], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1230 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 12], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 20], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1231 * 16], m4
+
+ ; mode 21 [row 8]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1232 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1233 * 16], m4
+
+ ; mode 21 [row 9]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 8], 1
+ pinsrb m0, [r4 + 9], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1234 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 11], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 19], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1235 * 16], m4
+
+ ; mode 21 [row 10]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1236 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1237 * 16], m4
+
+ ; mode 21 [row 11]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 9], 1
+ pinsrb m0, [r4 + 11], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1238 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 10], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 18], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1239 * 16], m4
+
+ ; mode 21 [row 12]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1240 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1241 * 16], m4
+
+ ; mode 21 [row 13]
+ movu m6, [r5 + 18 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 11], 1
+ pinsrb m0, [r4 + 13], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1242 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 9], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 17], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1243 * 16], m4
+
+ ; mode 21 [row 14]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1244 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1245 * 16], m4
+
+ ; mode 21 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 13], 1
+ pinsrb m0, [r4 + 15], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1246 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 8], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 16], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1247 * 16], m4
+
+ ; mode 21 [row 16]
+ movu m6, [r5 + 31 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 15], 1
+ pinsrb m0, [r4 + 17], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 0], 1
+ pinsrb m2, [r4 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1248 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 7], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 15], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1249 * 16], m4
+
+ ; mode 21 [row 17]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1250 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1251 * 16], m4
+
+ ; mode 21 [row 18]
+ movu m6, [r5 + 29 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 17], 1
+ pinsrb m0, [r4 + 19], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 2], 1
+ pinsrb m2, [r4 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1252 * 16], m4
+ pslldq m1, 2
+ pinsrb m1, [r3 + 7], 1
+ pinsrb m1, [r3 + 6], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrb m3, [r3 + 15], 1
+ pinsrb m3, [r3 + 14], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1253 * 16], m4
+
+ ; mode 21 [row 19]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1254 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1255 * 16], m4
+
+ ; mode 21 [row 20]
+ movu m6, [r5 + 27 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 19], 1
+ pinsrb m0, [r4 + 21], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 4], 1
+ pinsrb m2, [r4 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1256 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 5], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 13], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1257 * 16], m4
+
+ ; mode 21 [row 21]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1258 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1259 * 16], m4
+
+ ; mode 21 [row 22]
+ movu m6, [r5 + 25 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 21], 1
+ pinsrb m0, [r4 + 23], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 6], 1
+ pinsrb m2, [r4 + 8], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1260 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 4], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 12], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1261 * 16], m4
+
+ ; mode 21 [row 23]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1262 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1263 * 16], m4
+
+ ; mode 21 [row 24]
+ movu m6, [r5 + 23 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 23], 1
+ pinsrb m0, [r4 + 24], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 8], 1
+ pinsrb m2, [r4 + 9], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1264 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 3], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 11], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1265 * 16], m4
+
+ ; mode 21 [row 25]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1266 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1267 * 16], m4
+
+ ; mode 21 [row 26]
+ movu m6, [r5 + 21 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 24], 1
+ pinsrb m0, [r4 + 26], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 9], 1
+ pinsrb m2, [r4 + 11], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1268 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 2], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 10], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1269 * 16], m4
+
+ ; mode 21 [row 27]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1270 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1271 * 16], m4
+
+ ; mode 21 [row 28]
+ movu m6, [r5 + 19 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 26], 1
+ pinsrb m0, [r4 + 28], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 11], 1
+ pinsrb m2, [r4 + 13], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1272 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 1], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 9], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1273 * 16], m4
+
+ ; mode 21 [row 29]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1274 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1275 * 16], m4
+
+ ; mode 21 [row 30]
+ movu m6, [r5 + 17 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 28], 1
+ pinsrb m0, [r4 + 30], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 13], 1
+ pinsrb m2, [r4 + 15], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1276 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 0], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 8], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1277 * 16], m4
+
+ ; mode 21 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1278 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1278 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1279 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1279 * 16 + 8], m5
+
+ ; mode 22 [row 0]
+ movu m6, [r5 + 19 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 1280 * 16], m1
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1281 * 16], m4
+
+ ; mode 22 [row 1]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1282 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1283 * 16], m4
+
+ ; mode 22 [row 2]
+ movu m6, [r5 + 25 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 2], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1284 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1285 * 16], m4
+
+ ; mode 22 [row 3]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1286 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1287 * 16], m4
+
+ ; mode 22 [row 4]
+ movu m6, [r5 + 31 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 2], 1
+ pinsrb m0, [r4 + 5], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1288 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 14], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 22], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1289 * 16], m4
+
+ ; mode 22 [row 5]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1290 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1291 * 16], m4
+
+ ; mode 22 [row 6]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1292 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1293 * 16], m4
+
+ ; mode 22 [row 7]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 5], 1
+ pinsrb m0, [r4 + 7], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1294 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 13], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 21], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1295 * 16], m4
+
+ ; mode 22 [row 8]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1296 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1297 * 16], m4
+
+ ; mode 22 [row 9]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 7], 1
+ pinsrb m0, [r4 + 10], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1298 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 12], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 20], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1299 * 16], m4
+
+ ; mode 22 [row 10]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1300 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1301 * 16], m4
+
+ ; mode 22 [row 11]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1302 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1303 * 16], m4
+
+ ; mode 22 [row 12]
+ movu m6, [r5 + 23 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 10], 1
+ pinsrb m0, [r4 + 12], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1304 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 11], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 19], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1305 * 16], m4
+
+ ; mode 22 [row 13]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1306 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1307 * 16], m4
+
+ ; mode 22 [row 14]
+ movu m6, [r5 + 29 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 12], 1
+ pinsrb m0, [r4 + 15], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1308 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 10], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 18], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1309 * 16], m4
+
+ ; mode 22 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1310 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1311 * 16], m4
+
+ ; mode 22 [row 16]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1312 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1313 * 16], m4
+
+ ; mode 22 [row 17]
+ movu m6, [r5 + 22 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 15], 1
+ pinsrb m0, [r4 + 17], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1314 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 9], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 17], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1315 * 16], m4
+
+ ; mode 22 [row 18]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1316 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1317 * 16], m4
+
+ ; mode 22 [row 19]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 17], 1
+ pinsrb m0, [r4 + 20], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1318 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 8], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 16], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1319 * 16], m4
+
+ ; mode 22 [row 20]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1320 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1321 * 16], m4
+
+ ; mode 22 [row 21]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1322 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1323 * 16], m4
+
+ ; mode 22 [row 22]
+ movu m6, [r5 + 21 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 20], 1
+ pinsrb m0, [r4 + 22], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 0], 1
+ pinsrb m2, [r4 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1324 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 7], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 15], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1325 * 16], m4
+
+ ; mode 22 [row 23]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1326 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1327 * 16], m4
+
+ ; mode 22 [row 24]
+ movu m6, [r5 + 27 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 22], 1
+ pinsrb m0, [r4 + 25], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 2], 1
+ pinsrb m2, [r4 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1328 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 6], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 14], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1329 * 16], m4
+
+ ; mode 22 [row 25]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1330 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1331 * 16], m4
+
+ ; mode 22 [row 26]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1332 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1333 * 16], m4
+
+ ; mode 22 [row 27]
+ movu m6, [r5 + 20 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 25], 1
+ pinsrb m0, [r4 + 27], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 5], 1
+ pinsrb m2, [r4 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1334 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 5], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 13], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1335 * 16], m4
+
+ ; mode 22 [row 28]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1336 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1337 * 16], m4
+
+ ; mode 22 [row 29]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 27], 1
+ pinsrb m0, [r4 + 30], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrb m2, [r4 + 7], 1
+ pinsrb m2, [r4 + 10], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1338 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 4], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 12], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1339 * 16], m4
+
+ ; mode 22 [row 30]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1340 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1341 * 16], m4
+
+ ; mode 22 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1342 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1342 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1343 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1343 * 16 + 8], m5
+
+ ; mode 23 [row 0]
+ movu m6, [r5 + 23 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m1, m0, m6
+ pmulhrsw m1, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m3, m2, m6
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movu [r0 + 1344 * 16], m1
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1345 * 16], m4
+
+ ; mode 23 [row 1]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1346 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1347 * 16], m4
+
+ ; mode 23 [row 2]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1348 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1349 * 16], m4
+
+ ; mode 23 [row 3]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 4], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1350 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1351 * 16], m4
+
+ ; mode 23 [row 4]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1352 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1353 * 16], m4
+
+ ; mode 23 [row 5]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1354 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1355 * 16], m4
+
+ ; mode 23 [row 6]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1356 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1357 * 16], m4
+
+ ; mode 23 [row 7]
+ movu m6, [r5 + 24 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 4], 1
+ pinsrb m0, [r4 + 7], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1358 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 14], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 22], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1359 * 16], m4
+
+ ; mode 23 [row 8]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1360 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1361 * 16], m4
+
+ ; mode 23 [row 9]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1362 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1363 * 16], m4
+
+ ; mode 23 [row 10]
+ movu m6, [r5 + 29 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 7], 1
+ pinsrb m0, [r4 + 11], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1364 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 13], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 21], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1365 * 16], m4
+
+ ; mode 23 [row 11]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1366 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1367 * 16], m4
+
+ ; mode 23 [row 12]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1368 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1369 * 16], m4
+
+ ; mode 23 [row 13]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1370 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1371 * 16], m4
+
+ ; mode 23 [row 14]
+ movu m6, [r5 + 25 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 11], 1
+ pinsrb m0, [r4 + 14], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1372 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 12], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 20], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1373 * 16], m4
+
+ ; mode 23 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1374 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1375 * 16], m4
+
+ ; mode 23 [row 16]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1376 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1377 * 16], m4
+
+ ; mode 23 [row 17]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 14], 1
+ pinsrb m0, [r4 + 18], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 3], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1378 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 11], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 19], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1379 * 16], m4
+
+ ; mode 23 [row 18]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1380 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1381 * 16], m4
+
+ ; mode 23 [row 19]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1382 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1383 * 16], m4
+
+ ; mode 23 [row 20]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1384 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1385 * 16], m4
+
+ ; mode 23 [row 21]
+ movu m6, [r5 + 26 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 18], 1
+ pinsrb m0, [r4 + 21], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 2], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1386 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 10], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 18], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1387 * 16], m4
+
+ ; mode 23 [row 22]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1388 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1389 * 16], m4
+
+ ; mode 23 [row 23]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1390 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1391 * 16], m4
+
+ ; mode 23 [row 24]
+ movu m6, [r5 + 31 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 21], 1
+ pinsrb m0, [r4 + 25], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 1], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1392 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 9], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 17], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1393 * 16], m4
+
+ ; mode 23 [row 25]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1394 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1395 * 16], m4
+
+ ; mode 23 [row 26]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1396 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1397 * 16], m4
+
+ ; mode 23 [row 27]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1398 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1399 * 16], m4
+
+ ; mode 23 [row 28]
+ movu m6, [r5 + 27 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 25], 1
+ pinsrb m0, [r4 + 28], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 0], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1400 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 8], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 16], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1401 * 16], m4
+
+ ; mode 23 [row 29]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1402 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1403 * 16], m4
+
+ ; mode 23 [row 30]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1404 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1405 * 16], m4
+
+    ; mode 23 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1406 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1406 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1407 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1407 * 16 + 8], m5
+
+ ; mode 24 [row 0]
+ movu m6, [r5 + 27 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1408 * 16], m4
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1409 * 16], m4
+
+ ; mode 24 [row 1]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1410 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1411 * 16], m4
+
+ ; mode 24 [row 2]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1412 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1413 * 16], m4
+
+ ; mode 24 [row 3]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1414 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1415 * 16], m4
+
+ ; mode 24 [row 4]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1416 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1417 * 16], m4
+
+ ; mode 24 [row 5]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1418 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1419 * 16], m4
+
+ ; mode 24 [row 6]
+ movu m6, [r5 + 29 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 6], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1420 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1421 * 16], m4
+
+ ; mode 24 [row 7]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1422 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1423 * 16], m4
+
+ ; mode 24 [row 8]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1424 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1425 * 16], m4
+
+ ; mode 24 [row 9]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1426 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1427 * 16], m4
+
+ ; mode 24 [row 10]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1428 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1429 * 16], m4
+
+ ; mode 24 [row 11]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1430 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1431 * 16], m4
+
+ ; mode 24 [row 12]
+ movu m6, [r5 + 31 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 6], 1
+ pinsrb m0, [r4 + 13], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 6], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1432 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 14], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 22], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1433 * 16], m4
+
+ ; mode 24 [row 13]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1434 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1435 * 16], m4
+
+ ; mode 24 [row 14]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1436 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1437 * 16], m4
+
+ ; mode 24 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1438 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1439 * 16], m4
+
+ ; mode 24 [row 16]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1440 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1441 * 16], m4
+
+ ; mode 24 [row 17]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1442 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1443 * 16], m4
+
+ ; mode 24 [row 18]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1444 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1445 * 16], m4
+
+ ; mode 24 [row 19]
+ movu m6, [r5 + 28 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 13], 1
+ pinsrb m0, [r4 + 19], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 5], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1446 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 13], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 21], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1447 * 16], m4
+
+ ; mode 24 [row 20]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1448 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1449 * 16], m4
+
+ ; mode 24 [row 21]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1450 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1451 * 16], m4
+
+ ; mode 24 [row 22]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1452 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1453 * 16], m4
+
+ ; mode 24 [row 23]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1454 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1455 * 16], m4
+
+ ; mode 24 [row 24]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1456 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1457 * 16], m4
+
+ ; mode 24 [row 25]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 19], 1
+ pinsrb m0, [r4 + 26], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 4], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1458 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 12], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 20], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1459 * 16], m4
+
+ ; mode 24 [row 26]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1460 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1461 * 16], m4
+
+ ; mode 24 [row 27]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1462 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1463 * 16], m4
+
+ ; mode 24 [row 28]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1464 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1465 * 16], m4
+
+ ; mode 24 [row 29]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1466 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1467 * 16], m4
+
+ ; mode 24 [row 30]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1468 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1469 * 16], m4
+
+ ; mode 24 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1470 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1470 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1471 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1471 * 16 + 8], m5
+
+ ; mode 25 [row 0]
+ movu m6, [r5 + 30 * 16]
+ movu m0, [r3 ]
+ movu m1, [r3 + 1 ]
+ punpcklbw m0, m1
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ movu m2, [r3 + 8]
+ movu m3, [r3 + 9]
+ punpcklbw m2, m3
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1472 * 16], m4
+
+ movu m1, [r3 + 16]
+ movu m3, [r3 + 17]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 24]
+ movu m5, [r3 + 25]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1473 * 16], m4
+
+ ; mode 25 [row 1]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1474 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1475 * 16], m4
+
+ ; mode 25 [row 2]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1476 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1477 * 16], m4
+
+ ; mode 25 [row 3]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1478 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1479 * 16], m4
+
+ ; mode 25 [row 4]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1480 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1481 * 16], m4
+
+ ; mode 25 [row 5]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1482 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1483 * 16], m4
+
+ ; mode 25 [row 6]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1484 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1485 * 16], m4
+
+ ; mode 25 [row 7]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1486 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1487 * 16], m4
+
+ ; mode 25 [row 8]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1488 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1489 * 16], m4
+
+ ; mode 25 [row 9]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1490 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1491 * 16], m4
+
+ ; mode 25 [row 10]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1492 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1493 * 16], m4
+
+ ; mode 25 [row 11]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1494 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1495 * 16], m4
+
+ ; mode 25 [row 12]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1496 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1497 * 16], m4
+
+ ; mode 25 [row 13]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1498 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1499 * 16], m4
+
+ ; mode 25 [row 14]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1500 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1501 * 16], m4
+
+ ; mode 25 [row 15]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1502 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1502 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1503 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1503 * 16 + 8], m5
+
+ ; mode 25 [row 16]
+ movu m6, [r5 + 30 * 16]
+ pslldq m0, 2
+ pinsrb m0, [r4 + 0], 1
+ pinsrb m0, [r4 + 16], 0
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pslldq m2, 2
+ pinsrw m2, [r3 + 7], 0
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1504 * 16], m4
+ pslldq m1, 2
+ pinsrw m1, [r3 + 15], 0
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pslldq m3, 2
+ pinsrw m3, [r3 + 23], 0
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1505 * 16], m4
+
+ ; mode 25 [row 17]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1506 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1507 * 16], m4
+
+ ; mode 25 [row 18]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1508 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1509 * 16], m4
+
+ ; mode 25 [row 19]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1510 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1511 * 16], m4
+
+ ; mode 25 [row 20]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1512 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1513 * 16], m4
+
+ ; mode 25 [row 21]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1514 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1515 * 16], m4
+
+ ; mode 25 [row 22]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1516 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1517 * 16], m4
+
+ ; mode 25 [row 23]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1518 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1519 * 16], m4
+
+ ; mode 25 [row 24]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1520 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1521 * 16], m4
+
+ ; mode 25 [row 25]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1522 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1523 * 16], m4
+
+ ; mode 25 [row 26]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1524 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1525 * 16], m4
+
+ ; mode 25 [row 27]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1526 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1527 * 16], m4
+
+ ; mode 25 [row 28]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1528 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1529 * 16], m4
+
+ ; mode 25 [row 29]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1530 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1531 * 16], m4
+
+ ; mode 25 [row 30]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1532 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1533 * 16], m4
+
+ ; mode 25 [row 31]
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1534 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1534 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1535 * 16], m5
+ pshufb m5, m3, [tab_S2]
+ movh [r0 + 1535 * 16 + 8], m5
+
+ ; mode 26
+ movu m1, [r1 + 1]
+ movu m2, [r1 + 17]
+ movu [r0 + 1536 * 16], m1
+ movu [r0 + 1537 * 16], m2
+ movu [r0 + 1538 * 16], m1
+ movu [r0 + 1539 * 16], m2
+ movu [r0 + 1540 * 16], m1
+ movu [r0 + 1541 * 16], m2
+ movu [r0 + 1542 * 16], m1
+ movu [r0 + 1543 * 16], m2
+ movu [r0 + 1544 * 16], m1
+ movu [r0 + 1545 * 16], m2
+ movu [r0 + 1546 * 16], m1
+ movu [r0 + 1547 * 16], m2
+ movu [r0 + 1548 * 16], m1
+ movu [r0 + 1549 * 16], m2
+ movu [r0 + 1550 * 16], m1
+ movu [r0 + 1551 * 16], m2
+
+ movu [r0 + 1552 * 16], m1
+ movu [r0 + 1553 * 16], m2
+ movu [r0 + 1554 * 16], m1
+ movu [r0 + 1555 * 16], m2
+ movu [r0 + 1556 * 16], m1
+ movu [r0 + 1557 * 16], m2
+ movu [r0 + 1558 * 16], m1
+ movu [r0 + 1559 * 16], m2
+ movu [r0 + 1560 * 16], m1
+ movu [r0 + 1561 * 16], m2
+ movu [r0 + 1562 * 16], m1
+ movu [r0 + 1563 * 16], m2
+ movu [r0 + 1564 * 16], m1
+ movu [r0 + 1565 * 16], m2
+ movu [r0 + 1566 * 16], m1
+ movu [r0 + 1567 * 16], m2
+
+ movu [r0 + 1568 * 16], m1
+ movu [r0 + 1569 * 16], m2
+ movu [r0 + 1570 * 16], m1
+ movu [r0 + 1571 * 16], m2
+ movu [r0 + 1572 * 16], m1
+ movu [r0 + 1573 * 16], m2
+ movu [r0 + 1574 * 16], m1
+ movu [r0 + 1575 * 16], m2
+ movu [r0 + 1576 * 16], m1
+ movu [r0 + 1577 * 16], m2
+ movu [r0 + 1578 * 16], m1
+ movu [r0 + 1579 * 16], m2
+ movu [r0 + 1580 * 16], m1
+ movu [r0 + 1581 * 16], m2
+ movu [r0 + 1582 * 16], m1
+ movu [r0 + 1583 * 16], m2
+
+ movu [r0 + 1584 * 16], m1
+ movu [r0 + 1585 * 16], m2
+ movu [r0 + 1586 * 16], m1
+ movu [r0 + 1587 * 16], m2
+ movu [r0 + 1588 * 16], m1
+ movu [r0 + 1589 * 16], m2
+ movu [r0 + 1590 * 16], m1
+ movu [r0 + 1591 * 16], m2
+ movu [r0 + 1592 * 16], m1
+ movu [r0 + 1593 * 16], m2
+ movu [r0 + 1594 * 16], m1
+ movu [r0 + 1595 * 16], m2
+ movu [r0 + 1596 * 16], m1
+ movu [r0 + 1597 * 16], m2
+ movu [r0 + 1598 * 16], m1
+ movu [r0 + 1599 * 16], m2
+
+ ; mode 27 [row 0]
+ movu m6, [r5 + 2 * 16]
+ movu m0, [r3 + 1 ]
+ movu m1, [r3 + 2 ]
+ punpcklbw m0, m1
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ movu m2, [r3 + 9]
+ movu m3, [r3 + 10]
+ punpcklbw m2, m3
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1600 * 16], m4
+
+ movu m1, [r3 + 17]
+ movu m3, [r3 + 18]
+ punpcklbw m1, m3
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ movu m3, [r3 + 25]
+ movu m5, [r3 + 26]
+ punpcklbw m3, m5
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1601 * 16], m4
+
+ ; mode 27 [row 1]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1602 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1603 * 16], m4
+
+ ; mode 27 [row 2]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1604 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1605 * 16], m4
+
+ ; mode 27 [row 3]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1606 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1607 * 16], m4
+
+ ; mode 27 [row 4]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1608 * 16], m4
+
+    ; mode 28 [row 1 - first half]
+ movu [r0 + 1666 * 16], m4
+
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1609 * 16], m4
+
+ ; mode 28 [row 1 - second half]
+ movu [r0 + 1667 * 16], m4
+
+ ; mode 27 [row 5]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1610 * 16], m4
+
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1611 * 16], m4
+
+ ; mode 27 [row 6]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1612 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1613 * 16], m4
+
+ ; mode 27 [row 7]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1614 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1615 * 16], m4
+
+ ; mode 27 [row 8]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1616 * 16], m4
+
+ ; mode 29 [row 1 - first half]
+ movu [r0 + 1730 * 16], m4
+
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1617 * 16], m4
+
+ ; mode 29 [row 1 - second half]
+ movu [r0 + 1731 * 16], m4
+
+ ; mode 27 [row 9]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1618 * 16], m4
+
+    ; mode 28 [row 3 - first half]
+ movu [r0 + 1670 * 16], m4
+
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1619 * 16], m4
+
+    ; mode 28 [row 3 - second half]
+ movu [r0 + 1671 * 16], m4
+
+ ; mode 27 [row 10]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1620 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1621 * 16], m4
+
+ ; mode 27 [row 11]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1622 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1623 * 16], m4
+
+ ; mode 27 [row 12]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1624 * 16], m4
+
+ ; mode 30 [row 1 - first half]
+ movu [r0 + 1794 * 16], m4
+
+ ; mode 33 [row 0 - first half]
+ movu [r0 + 1984 * 16], m4
+
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1625 * 16], m4
+
+ ; mode 30 [row 1 - second half]
+ movu [r0 + 1795 * 16], m4
+
+ ; mode 33 [row 0 - second half]
+ movu [r0 + 1985 * 16], m4
+
+ ; mode 27 [row 13]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1626 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1627 * 16], m4
+
+ ; mode 27 [row 14]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1628 * 16], m4
+
+    ; mode 28 [row 5 - first half]
+ movu [r0 + 1674 * 16], m4
+
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1629 * 16], m4
+
+    ; mode 28 [row 5 - second half]
+ movu [r0 + 1675 * 16], m4
+
+ ; mode 28 [row 0]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1664 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1665 * 16], m4
+
+ ; mode 28 [row 2]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1668 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1669 * 16], m4
+
+ ; mode 28 [row 4]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1672 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1673 * 16], m4
+
+ ; mode 30 [row 0]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1792 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1793 * 16], m4
+
+ ; mode 29 [row 0]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1728 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1729 * 16], m4
+
+ ; mode 29 [row 2]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1732 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1733 * 16], m4
+
+ ; mode 31 [row 0]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1856 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1857 * 16], m4
+
+ ; mode 32 [row 0]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m4, m0, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1920 * 16], m4
+ pmaddubsw m4, m1, m6
+ pmulhrsw m4, m7
+ pmaddubsw m5, m3, m6
+ pmulhrsw m5, m7
+ packuswb m4, m5
+ movu [r0 + 1921 * 16], m4
+
+ ; mode 27 [row 15]
+ movu m0, [r3 + 2]
+ movd m1, [r3 + 3]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 10]
+ movd m3, [r3 + 11]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 18]
+ movd m3, [r3 + 19]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 26]
+ movd m5, [r3 + 27]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1630 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1630 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1631 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1631 * 16 + 8], m5
+
+ ; mode 27 [row 16]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1632 * 16], m3
+
+ ; mode 31 [row 1 - first half]
+ movu [r0 + 1858 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1633 * 16], m3
+
+ ; mode 31 [row 1 - second half]
+ movu [r0 + 1859 * 16], m3
+
+ ; mode 27 [row 17]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1634 * 16], m3
+
+ ; mode 29 [row 3 - first half]
+ movu [r0 + 1734 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1635 * 16], m3
+
+ ; mode 29 [row 3 - second half]
+ movu [r0 + 1735 * 16], m3
+
+ ; mode 27 [row 18]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1636 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1637 * 16], m3
+
+ ; mode 27 [row 19]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1638 * 16], m3
+
+ ; mode 28 [row 7 - first half]
+ movu [r0 + 1678 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1639 * 16], m3
+
+ ; mode 28 [row 7 - second half]
+ movu [r0 + 1679 * 16], m3
+
+ ; mode 27 [row 20]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1640 * 16], m3
+
+ ; mode 32 [row 1 - first half]
+ movu [r0 + 1922 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1641 * 16], m3
+
+ ; mode 32 [row 1 - second half]
+ movu [r0 + 1923 * 16], m3
+
+ ; mode 27 [row 21]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1642 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1643 * 16], m3
+
+ ; mode 27 [row 22]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1644 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1645 * 16], m3
+
+ ; mode 27 [row 23]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1646 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1647 * 16], m3
+
+ ; mode 27 [row 24]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1648 * 16], m3
+
+ ; mode 28 [row 9 - first half]
+ movu [r0 + 1682 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1649 * 16], m3
+
+ ; mode 28 [row 9 - second half]
+ movu [r0 + 1683 * 16], m3
+
+ ; mode 27 [row 25]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1650 * 16], m3
+
+ ; mode 30 [row 3 - first half]
+ movu [r0 + 1798 * 16], m3
+
+ ; mode 33 [row 1 - first half]
+ movu [r0 + 1986 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1651 * 16], m3
+
+ ; mode 30 [row 3 - second half]
+ movu [r0 + 1799 * 16], m3
+
+ ; mode 33 [row 1 - second half]
+ movu [r0 + 1987 * 16], m3
+
+ ; mode 27 [row 26]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1652 * 16], m3
+
+ ; mode 29 [row 5 - first half]
+ movu [r0 + 1738 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1653 * 16], m3
+
+ ; mode 29 [row 5 - second half]
+ movu [r0 + 1739 * 16], m3
+
+ ; mode 27 [row 27]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1654 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1655 * 16], m3
+
+ ; mode 27 [row 28]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1656 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1657 * 16], m3
+
+ ; mode 27 [row 29]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1658 * 16], m3
+
+ ; mode 28 [row 11 - first half]
+ movu [r0 + 1686 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1659 * 16], m3
+
+ ; mode 28 [row 11 - second half]
+ movu [r0 + 1687 * 16], m3
+
+ ; mode 27 [row 30]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1660 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1661 * 16], m3
+
+ ; mode 28 [row 6]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1676 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1677 * 16], m3
+
+ ; mode 28 [row 8]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1680 * 16], m3
+
+ ; mode 29 [row 4 - first half]
+ movu [r0 + 1736 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1681 * 16], m3
+
+ ; mode 29 [row 4 - second half]
+ movu [r0 + 1737 * 16], m3
+
+ ; mode 28 [row 10]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1684 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1685 * 16], m3
+
+ ; mode 29 [row 6]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1740 * 16], m3
+
+ ; mode 32 [row 2 - first half]
+ movu [r0 + 1924 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1741 * 16], m3
+
+ ; mode 32 [row 2 - second half]
+ movu [r0 + 1925 * 16], m3
+
+ ; mode 30 [row 2]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1796 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1797 * 16], m3
+
+ ; mode 31 [row 2]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1860 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1861 * 16], m3
+
+    ; mode 27 [row 31]
+ movu m0, [r3 + 3]
+ movd m1, [r3 + 4]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 11]
+ movd m3, [r3 + 12]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 19]
+ movd m3, [r3 + 20]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 27]
+ movd m5, [r3 + 28]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1662 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1662 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1663 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1663 * 16 + 8], m5
+
+ ; mode 28 [row 12]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1688 * 16], m3
+
+ ; mode 30 [row 4 - first half]
+ movu [r0 + 1800 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1689 * 16], m3
+
+ ; mode 30 [row 4 - second half]
+ movu [r0 + 1801 * 16], m3
+
+ ; mode 28 [row 13]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1690 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1691 * 16], m3
+
+ ; mode 28 [row 14]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1692 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1693 * 16], m3
+
+ ; mode 28 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1694 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1695 * 16], m3
+
+ ; mode 28 [row 16]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1696 * 16], m3
+
+ ; mode 31 [row 4 - first half]
+ movu [r0 + 1864 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1697 * 16], m3
+
+ ; mode 31 [row 4 - second half]
+ movu [r0 + 1865 * 16], m3
+
+ ; mode 28 [row 17]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1698 * 16], m3
+
+ ; mode 29 [row 9 - first half]
+ movu [r0 + 1746 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1699 * 16], m3
+
+ ; mode 29 [row 9 - second half]
+ movu [r0 + 1747 * 16], m3
+
+ ; mode 28 [row 18]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1700 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1701 * 16], m3
+
+ ; mode 29 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1742 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1743 * 16], m3
+
+ ; mode 29 [row 8]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1744 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1745 * 16], m3
+
+ ; mode 30 [row 5]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1802 * 16], m3
+
+ ; mode 33 [row 2 - first half]
+ movu [r0 + 1988 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1803 * 16], m3
+
+ ; mode 33 [row 2 - second half]
+ movu [r0 + 1989 * 16], m3
+
+ ; mode 30 [row 6]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1804 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1805 * 16], m3
+
+ ; mode 31 [row 3]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1862 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1863 * 16], m3
+
+ ; mode 32 [row 3]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1926 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1927 * 16], m3
+
+ ; mode 28 [row 19]
+ movu m6, [r5 + 4 * 16]
+ movu m0, [r3 + 4]
+ movd m1, [r3 + 5]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 12]
+ movd m4, [r3 + 13]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1702 * 16], m3
+
+ movu m1, [r3 + 20]
+ movd m3, [r3 + 21]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 28]
+ movd m5, [r3 + 29]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1703 * 16], m3
+
+ ; mode 28 [row 20]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1704 * 16], m3
+
+ ; mode 32 [row 4 - first half]
+ movu [r0 + 1928 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1705 * 16], m3
+
+ ; mode 32 [row 4 - second half]
+ movu [r0 + 1929 * 16], m3
+
+ ; mode 28 [row 21]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1706 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1707 * 16], m3
+
+ ; mode 28 [row 22]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1708 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1709 * 16], m3
+
+ ; mode 28 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1710 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1711 * 16], m3
+
+ ; mode 28 [row 24]
+ movu m6, [r5 + 29 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1712 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1713 * 16], m3
+
+ ; mode 29 [row 10]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1748 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1749 * 16], m3
+
+ ; mode 29 [row 11]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1750 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1751 * 16], m3
+
+ ; mode 29 [row 12]
+ movu m6, [r5 + 21 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1752 * 16], m3
+
+    ; mode 30 [row 8 - first half]
+ movu [r0 + 1808 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1753 * 16], m3
+
+    ; mode 30 [row 8 - second half]
+ movu [r0 + 1809 * 16], m3
+
+ ; mode 29 [row 13]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1754 * 16], m3
+
+ ; mode 32 [row 5 - first half]
+ movu [r0 + 1930 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1755 * 16], m3
+
+ ; mode 32 [row 5 - second half]
+ movu [r0 + 1931 * 16], m3
+
+ ; mode 30 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1806 * 16], m3
+
+ ; mode 33 [row 3 - first half]
+ movu [r0 + 1990 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1807 * 16], m3
+
+ ; mode 33 [row 3 - second half]
+ movu [r0 + 1991 * 16], m3
+
+ ; mode 31 [row 5]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1866 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1867 * 16], m3
+
+ ; mode 31 [row 6]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1868 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1869 * 16], m3
+
+ ; mode 28 [row 25]
+ movu m6, [r5 + 2 * 16]
+ movu m0, [r3 + 5]
+ movd m1, [r3 + 6]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 13]
+ movd m4, [r3 + 14]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1714 * 16], m3
+
+ movu m1, [r3 + 21]
+ movd m3, [r3 + 22]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 29]
+ movd m5, [r3 + 30]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1715 * 16], m3
+
+ ; mode 28 [row 26]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1716 * 16], m3
+
+ ; mode 29 [row 14 - first half]
+ movu [r0 + 1756 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1717 * 16], m3
+
+ ; mode 29 [row 14 - second half]
+ movu [r0 + 1757 * 16], m3
+
+ ; mode 28 [row 27]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1718 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1719 * 16], m3
+
+ ; mode 28 [row 28]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1720 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1721 * 16], m3
+
+ ; mode 28 [row 29]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1722 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1723 * 16], m3
+
+ ; mode 28 [row 30]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1724 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1725 * 16], m3
+
+ ; mode 29 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1758 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1759 * 16], m3
+
+ ; mode 29 [row 16]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1760 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1761 * 16], m3
+
+ ; mode 30 [row 9]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1810 * 16], m3
+
+ ; mode 33 [row 4 - first half]
+ movu [r0 + 1992 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1811 * 16], m3
+
+ ; mode 33 [row 4 - second half]
+ movu [r0 + 1993 * 16], m3
+
+ ; mode 30 [row 10]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1812 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1813 * 16], m3
+
+ ; mode 31 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1870 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1871 * 16], m3
+
+ ; mode 31 [row 8]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1872 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1873 * 16], m3
+
+ ; mode 32 [row 6]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1932 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1933 * 16], m3
+
+ ; mode 30 [row 11]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1814 * 16], m3
+
+ ; mode 33 [row 5 - first half]
+ movu [r0 + 1994 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1815 * 16], m3
+
+ ; mode 33 [row 5 - second half]
+ movu [r0 + 1995 * 16], m3
+
+ ; mode 28 [row 31]
+ movu m0, [r3 + 6]
+ movd m1, [r3 + 7]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 14]
+ movd m3, [r3 + 15]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 22]
+ movd m3, [r3 + 23]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 30]
+ movd m5, [r3 + 31]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1726 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1726 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1727 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1727 * 16 + 8], m5
+
+ ; mode 29 [row 17]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1762 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1763 * 16], m3
+
+ ; mode 29 [row 18]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1764 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1765 * 16], m3
+
+ ; mode 29 [row 19]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1766 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1767 * 16], m3
+
+ ; mode 29 [row 20]
+ movu m6, [r5 + 29 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1768 * 16], m3
+
+ ; mode 32 [row 8 - first half]
+ movu [r0 + 1936 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1769 * 16], m3
+
+ ; mode 32 [row 8 - second half]
+ movu [r0 + 1937 * 16], m3
+
+ ; mode 30 [row 12]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1816 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1817 * 16], m3
+
+ ; mode 30 [row 13]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1818 * 16], m3
+
+ ; mode 33 [row 6 - first half]
+ movu [r0 + 1996 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1819 * 16], m3
+
+ ; mode 33 [row 6 - second half]
+ movu [r0 + 1997 * 16], m3
+
+ ; mode 31 [row 9]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1874 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1875 * 16], m3
+
+ ; mode 31 [row 10]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1876 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1877 * 16], m3
+
+ ; mode 32 [row 7]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1934 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1935 * 16], m3
+
+ ; mode 29 [row 21]
+ movu m6, [r5 + 6 * 16]
+ movu m0, [r3 + 7]
+ movd m1, [r3 + 8]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 15]
+ movd m4, [r3 + 16]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1770 * 16], m3
+
+ movu m1, [r3 + 23]
+ movd m3, [r3 + 24]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 31]
+ movd m5, [r3 + 32]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1771 * 16], m3
+
+ ; mode 29 [row 22]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1772 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1773 * 16], m3
+
+ ; mode 29 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1774 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1775 * 16], m3
+
+ ; mode 30 [row 14]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1820 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1821 * 16], m3
+
+ ; mode 30 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1822 * 16], m3
+
+ ; mode 33 [row 7 - first half]
+ movu [r0 + 1998 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1823 * 16], m3
+
+ ; mode 33 [row 7 - second half]
+ movu [r0 + 1999 * 16], m3
+
+ ; mode 30 [row 16]
+ movu m6, [r5 + 29 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1824 * 16], m3
+
+ ; mode 31 [row 12 - first half]
+ movu [r0 + 1880 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1825 * 16], m3
+
+ ; mode 31 [row 12 - second half]
+ movu [r0 + 1881 * 16], m3
+
+ ; mode 31 [row 11]
+ movu m6, [r5 + 12 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1878 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1879 * 16], m3
+
+ ; mode 32 [row 9]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1938 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1939 * 16], m3
+
+ ; mode 29 [row 24]
+ movu m6, [r5 + 1 * 16]
+ movu m0, [r3 + 8]
+ movd m1, [r3 + 9]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 16]
+ movd m4, [r3 + 17]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1776 * 16], m3
+
+ movu m1, [r3 + 24]
+ movd m3, [r3 + 25]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 32]
+ movd m5, [r3 + 33]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1777 * 16], m3
+
+ ; mode 29 [row 25]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1778 * 16], m3
+
+ ; mode 30 [row 17 - first half]
+ movu [r0 + 1826 * 16], m3
+
+ ; mode 33 [row 8 - first half]
+ movu [r0 + 2000 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1779 * 16], m3
+
+ ; mode 30 [row 17 - second half]
+ movu [r0 + 1827 * 16], m3
+
+ ; mode 33 [row 8 - second half]
+ movu [r0 + 2001 * 16], m3
+
+ ; mode 29 [row 26]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1780 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1781 * 16], m3
+
+ ; mode 29 [row 27]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1782 * 16], m3
+
+ ; mode 32 [row 11 - first half]
+ movu [r0 + 1942 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1783 * 16], m3
+
+ ; mode 32 [row 11 - second half]
+ movu [r0 + 1943 * 16], m3
+
+ ; mode 30 [row 18]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1828 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1829 * 16], m3
+
+ ; mode 31 [row 13]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1882 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1883 * 16], m3
+
+ ; mode 31 [row 14]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1884 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1885 * 16], m3
+
+ ; mode 32 [row 10]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1940 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1941 * 16], m3
+
+ ; mode 29 [row 28]
+ movu m6, [r5 + 5 * 16]
+ movu m0, [r3 + 9]
+ movd m1, [r3 + 10]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 17]
+ movd m4, [r3 + 18]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1784 * 16], m3
+
+ movu m1, [r3 + 25]
+ movd m3, [r3 + 26]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 33]
+ movd m5, [r3 + 34]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1785 * 16], m3
+
+ ; mode 29 [row 29]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1786 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1787 * 16], m3
+
+ ; mode 29 [row 30]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1788 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1789 * 16], m3
+
+ ; mode 30 [row 19]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1830 * 16], m3
+
+ ; mode 33 [row 9 - first half]
+ movu [r0 + 2002 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1831 * 16], m3
+
+ ; mode 33 [row 9 - second half]
+ movu [r0 + 2003 * 16], m3
+
+ ; mode 30 [row 20]
+ movu m6, [r5 + 17 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1832 * 16], m3
+
+ ; mode 32 [row 12 - first half]
+ movu [r0 + 1944 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1833 * 16], m3
+
+ ; mode 32 [row 12 - second half]
+ movu [r0 + 1945 * 16], m3
+
+ ; mode 30 [row 21]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1834 * 16], m3
+
+ ; mode 33 [row 10 - first half]
+ movu [r0 + 2004 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1835 * 16], m3
+
+ ; mode 33 [row 10 - second half]
+ movu [r0 + 2005 * 16], m3
+
+ ; mode 31 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1886 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1887 * 16], m3
+
+ ; mode 29 [row 31]
+ movu m0, [r3 + 10]
+ movd m1, [r3 + 11]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 18]
+ movd m3, [r3 + 19]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 26]
+ movd m3, [r3 + 27]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 34]
+ movd m5, [r3 + 35]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1790 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1790 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1791 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1791 * 16 + 8], m5
+
+ ; mode 30 [row 22]
+ movu m6, [r5 + 11 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1836 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1837 * 16], m3
+
+ ; mode 30 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1838 * 16], m3
+
+ ; mode 33 [row 11 - first half]
+ movu [r0 + 2006 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1839 * 16], m3
+
+ ; mode 33 [row 11 - second half]
+ movu [r0 + 2007 * 16], m3
+
+ ; mode 31 [row 16]
+ movu m6, [r5 + 1 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1888 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1889 * 16], m3
+
+ ; mode 31 [row 17]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1890 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1891 * 16], m3
+
+ ; mode 32 [row 13]
+ movu m6, [r5 + 6 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1946 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1947 * 16], m3
+
+ ; mode 32 [row 14]
+ movu m6, [r5 + 27 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1948 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1949 * 16], m3
+
+ ; mode 30 [row 24]
+ movu m6, [r5 + 5 * 16]
+ movu m0, [r3 + 11]
+ movd m1, [r3 + 12]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 19]
+ movd m4, [r3 + 20]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1840 * 16], m3
+
+ movu m1, [r3 + 27]
+ movd m3, [r3 + 28]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 35]
+ movd m5, [r3 + 36]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1841 * 16], m3
+
+ ; mode 30 [row 25]
+ movu m6, [r5 + 18 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1842 * 16], m3
+
+ ; mode 33 [row 12 - first half]
+ movu [r0 + 2008 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1843 * 16], m3
+
+ ; mode 33 [row 12 - second half]
+ movu [r0 + 2009 * 16], m3
+
+ ; mode 30 [row 26]
+ movu m6, [r5 + 31 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1844 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1845 * 16], m3
+
+ ; mode 31 [row 18]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1892 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1893 * 16], m3
+
+ ; mode 31 [row 19]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1894 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1895 * 16], m3
+
+ ; mode 32 [row 15]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1950 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1951 * 16], m3
+
+ ; mode 30 [row 27]
+ movu m6, [r5 + 12 * 16]
+ movu m0, [r3 + 12]
+ movd m1, [r3 + 13]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 20]
+ movd m4, [r3 + 21]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1846 * 16], m3
+
+ ; mode 33 [row 13 - first half]
+ movu [r0 + 2010 * 16], m3
+
+ movu m1, [r3 + 28]
+ movd m3, [r3 + 29]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 36]
+ movd m5, [r3 + 37]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1847 * 16], m3
+
+ ; mode 33 [row 13 - second half]
+ movu [r0 + 2011 * 16], m3
+
+ ; mode 30 [row 28]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1848 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1849 * 16], m3
+
+ ; mode 31 [row 20]
+ movu m6, [r5 + 5 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1896 * 16], m3
+
+ ; mode 32 [row 16 - first half]
+ movu [r0 + 1952 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1897 * 16], m3
+
+ ; mode 32 [row 16 - second half]
+ movu [r0 + 1953 * 16], m3
+
+ ; mode 31 [row 21]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1898 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1899 * 16], m3
+
+ ; mode 32 [row 17]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1954 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1955 * 16], m3
+
+ ; mode 30 [row 29]
+ movu m6, [r5 + 6 * 16]
+ movu m0, [r3 + 13]
+ movd m1, [r3 + 14]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 21]
+ movd m4, [r3 + 22]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1850 * 16], m3
+
+ ; mode 33 [row 14 - first half]
+ movu [r0 + 2012 * 16], m3
+
+ movu m1, [r3 + 29]
+ movd m3, [r3 + 30]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 37]
+ movd m5, [r3 + 38]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1851 * 16], m3
+
+ ; mode 33 [row 14 - second half]
+ movu [r0 + 2013 * 16], m3
+
+ ; mode 30 [row 30]
+ movu m6, [r5 + 19 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1852 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1853 * 16], m3
+
+ ; mode 31 [row 22]
+ movu m6, [r5 + 7 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1900 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1901 * 16], m3
+
+ ; mode 31 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1902 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1903 * 16], m3
+
+ ; mode 32 [row 18]
+ movu m6, [r5 + 15 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1956 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1957 * 16], m3
+
+ ; mode 30 [row 31]
+ movu m0, [r3 + 14]
+ movd m1, [r3 + 15]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 22]
+ movd m3, [r3 + 23]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 30]
+ movd m3, [r3 + 31]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 38]
+ movd m5, [r3 + 39]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1854 * 16], m5
+
+ ; mode 33 [row 15 - first eight bytes]
+ movh [r0 + 2014 * 16], m5
+
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1854 * 16 + 8], m5
+
+ ; mode 33 [row 15 - second eight bytes]
+ movh [r0 + 2014 * 16 + 8], m5
+
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1855 * 16], m5
+
+ ; mode 33 [row 15 - third eight bytes]
+ movh [r0 + 2015 * 16], m5
+
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1855 * 16 + 8], m5
+
+ ; mode 33 [row 15 - fourth eight bytes]
+ movh [r0 + 2015 * 16 + 8], m5
+
+ ; mode 31 [row 24]
+ movu m6, [r5 + 9 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1904 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1905 * 16], m3
+
+ ; mode 31 [row 25]
+ movu m6, [r5 + 26 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1906 * 16], m3
+
+ ; mode 33 [row 16 - first half]
+ movu [r0 + 2016 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1907 * 16], m3
+
+ ; mode 33 [row 16 - second half]
+ movu [r0 + 2017 * 16], m3
+
+ ; mode 32 [row 19]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1958 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1959 * 16], m3
+
+ ; mode 32 [row 20]
+ movu m6, [r5 + 25 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1960 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1961 * 16], m3
+
+ ; mode 31 [row 26]
+ movu m6, [r5 + 11 * 16]
+ movu m0, [r3 + 15]
+ movd m1, [r3 + 16]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 23]
+ movd m4, [r3 + 24]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1908 * 16], m3
+
+ movu m1, [r3 + 31]
+ movd m3, [r3 + 32]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 39]
+ movd m5, [r3 + 40]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1909 * 16], m3
+
+ ; mode 31 [row 27]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1910 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1911 * 16], m3
+
+ ; mode 32 [row 21]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1962 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1963 * 16], m3
+
+ ; mode 33 [row 17]
+ movu m6, [r5 + 20 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2018 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2019 * 16], m3
+
+ ; mode 31 [row 28]
+ movu m6, [r5 + 13 * 16]
+ movu m0, [r3 + 16]
+ movd m1, [r3 + 17]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 24]
+ movd m4, [r3 + 25]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1912 * 16], m3
+
+ movu m1, [r3 + 32]
+ movd m3, [r3 + 33]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 40]
+ movd m5, [r3 + 41]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1913 * 16], m3
+
+ ; mode 31 [row 29]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1914 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1915 * 16], m3
+
+ ; mode 32 [row 22]
+ movu m6, [r5 + 3 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1964 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1965 * 16], m3
+
+ ; mode 32 [row 23]
+ movu m6, [r5 + 24 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1966 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1967 * 16], m3
+
+ ; mode 33 [row 18]
+ movu m6, [r5 + 14 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2020 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2021 * 16], m3
+
+ ; mode 31 [row 30]
+ movu m6, [r5 + 15 * 16]
+ movu m0, [r3 + 17]
+ movd m1, [r3 + 18]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 25]
+ movd m4, [r3 + 26]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1916 * 16], m3
+
+ movu m1, [r3 + 33]
+ movd m3, [r3 + 34]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 41]
+ movd m5, [r3 + 42]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1917 * 16], m3
+
+ ; mode 32 [row 24]
+ movu m6, [r5 + 13 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1968 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1969 * 16], m3
+
+ ; mode 33 [row 19]
+ movu m6, [r5 + 8 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2022 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2023 * 16], m3
+
+ ; mode 31 [row 31]
+ movu m0, [r3 + 18]
+ movd m1, [r3 + 19]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 26]
+ movd m3, [r3 + 27]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 34]
+ movd m3, [r3 + 35]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 42]
+ movd m5, [r3 + 43]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1918 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1918 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1919 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1919 * 16 + 8], m5
+
+ ; mode 32 [row 25]
+ movu m6, [r5 + 2 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1970 * 16], m3
+
+ ; mode 33 [row 20 - first half]
+ movu [r0 + 2024 * 16], m3
+
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1971 * 16], m3
+
+ ; mode 33 [row 20 - second half]
+ movu [r0 + 2025 * 16], m3
+
+ ; mode 32 [row 26]
+ movu m6, [r5 + 23 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1972 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1973 * 16], m3
+
+ ; mode 33 [row 21]
+ movu m6, [r5 + 28 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2026 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2027 * 16], m3
+
+ ; mode 32 [row 27]
+ movu m6, [r5 + 12 * 16]
+ movu m0, [r3 + 19]
+ movd m1, [r3 + 20]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 27]
+ movd m4, [r3 + 28]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1974 * 16], m3
+
+ movu m1, [r3 + 35]
+ movd m3, [r3 + 36]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 43]
+ movd m5, [r3 + 44]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1975 * 16], m3
+
+ ; mode 33 [row 22]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2028 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2029 * 16], m3
+
+ ; mode 32 [row 28]
+ movu m6, [r5 + 1 * 16]
+ movu m0, [r3 + 20]
+ movd m1, [r3 + 21]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 28]
+ movd m4, [r3 + 29]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1976 * 16], m3
+
+ movu m1, [r3 + 36]
+ movd m3, [r3 + 37]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 44]
+ movd m5, [r3 + 45]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1977 * 16], m3
+
+ ; mode 32 [row 29]
+ movu m6, [r5 + 22 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1978 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1979 * 16], m3
+
+ ; mode 33 [row 23]
+ movu m6, [r5 + 16 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2030 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2031 * 16], m3
+
+ ; mode 32 [row 30]
+ movu m6, [r5 + 11 * 16]
+ movu m0, [r3 + 21]
+ movd m1, [r3 + 22]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 29]
+ movd m4, [r3 + 30]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1980 * 16], m3
+
+ movu m1, [r3 + 37]
+ movd m3, [r3 + 38]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 45]
+ movd m5, [r3 + 46]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 1981 * 16], m3
+
+ ; mode 33 [row 24]
+ movu m6, [r5 + 10 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2032 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2033 * 16], m3
+
+ ; mode 32 [row 31]
+ movu m0, [r3 + 22]
+ movd m1, [r3 + 23]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ movu m2, [r3 + 30]
+ movd m3, [r3 + 31]
+ palignr m3, m2, 1
+ punpcklbw m2, m3
+ movu m1, [r3 + 38]
+ movd m3, [r3 + 39]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ movu m4, [r3 + 46]
+ movd m5, [r3 + 47]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+
+ pshufb m5, m0, [tab_S2]
+ movh [r0 + 1982 * 16], m5
+ pshufb m5, m2, [tab_S2]
+ movh [r0 + 1982 * 16 + 8], m5
+ pshufb m5, m1, [tab_S2]
+ movh [r0 + 1983 * 16], m5
+ pshufb m5, m4, [tab_S2]
+ movh [r0 + 1983 * 16 + 8], m5
+
+ ; mode 33 [row 25]
+ movu m6, [r5 + 4 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2034 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2035 * 16], m3
+
+ ; mode 33 [row 26]
+ movu m6, [r5 + 30 * 16]
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2036 * 16], m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2037 * 16], m3
+
+ ; mode 33 [row 27]
+ movu m6, [r5 + 24 * 16]
+ movu m0, [r3 + 23]
+ movd m1, [r3 + 24]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 31]
+ movd m4, [r3 + 32]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2038 * 16], m3
+
+ movu m1, [r3 + 39]
+ movd m3, [r3 + 40]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 47]
+ movd m5, [r3 + 48]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2039 * 16], m3
+
+ ; mode 33 [row 28]
+ movu m6, [r5 + 18 * 16]
+ movu m0, [r3 + 24]
+ movd m1, [r3 + 25]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 32]
+ movd m4, [r3 + 33]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2040 * 16], m3
+
+ movu m1, [r3 + 40]
+ movd m3, [r3 + 41]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 48]
+ movd m5, [r3 + 49]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2041 * 16], m3
+
+ ; mode 33 [row 29]
+ movu m6, [r5 + 12 * 16]
+ movu m0, [r3 + 25]
+ movd m1, [r3 + 26]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 33]
+ movd m4, [r3 + 34]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2042 * 16], m3
+
+ movu m1, [r3 + 41]
+ movd m3, [r3 + 42]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 49]
+ movd m5, [r3 + 50]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2043 * 16], m3
+
+ ; mode 33 [row 30]
+ movu m6, [r5 + 6 * 16]
+ movu m0, [r3 + 26]
+ movd m1, [r3 + 27]
+ palignr m1, m0, 1
+ punpcklbw m0, m1
+ pmaddubsw m3, m0, m6
+ pmulhrsw m3, m7
+ movu m2, [r3 + 34]
+ movd m4, [r3 + 35]
+ palignr m4, m2, 1
+ punpcklbw m2, m4
+ pmaddubsw m5, m2, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2044 * 16], m3
+
+ movu m1, [r3 + 42]
+ movd m3, [r3 + 43]
+ palignr m3, m1, 1
+ punpcklbw m1, m3
+ pmaddubsw m3, m1, m6
+ pmulhrsw m3, m7
+ movu m4, [r3 + 50]
+ movd m5, [r3 + 51]
+ palignr m5, m4, 1
+ punpcklbw m4, m5
+ pmaddubsw m5, m4, m6
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ movu [r0 + 2045 * 16], m3
+
+ ; mode 33 [row 31]
+ movu m5, [r3 + 27]
+ movu [r0 + 2046 * 16], m5
+ movu m5, [r3 + 43]
+ movu [r0 + 2047 * 16], m5
+
+ ;mode 34 [row 0]
+ movu m0, [r3 + 2]
+ movu [r0 + 2048 * 16], m0
+ movu m1, [r3 + 18]
+ movu [r0 + 2049 * 16], m1
+
+ ;mode 34 [row 1]
+ movu m2, [r3 + 34]
+ palignr m3, m1, m0, 1
+ movu [r0 + 2050 * 16], m3
+ palignr m4, m2, m1, 1
+ movu [r0 + 2051 * 16], m4
+
+ ;mode 34 [row 2]
+ palignr m3, m1, m0, 2
+ movu [r0 + 2052 * 16], m3
+ palignr m4, m2, m1, 2
+ movu [r0 + 2053 * 16], m4
+
+ ;mode 34 [row 3]
+ palignr m3, m1, m0, 3
+ movu [r0 + 2054 * 16], m3
+ palignr m4, m2, m1, 3
+ movu [r0 + 2055 * 16], m4
+
+ ;mode 34 [row 4]
+ palignr m3, m1, m0, 4
+ movu [r0 + 2056 * 16], m3
+ palignr m4, m2, m1, 4
+ movu [r0 + 2057 * 16], m4
+
+ ;mode 34 [row 5]
+ palignr m3, m1, m0, 5
+ movu [r0 + 2058 * 16], m3
+ palignr m4, m2, m1, 5
+ movu [r0 + 2059 * 16], m4
+
+ ;mode 34 [row 6]
+ palignr m3, m1, m0, 6
+ movu [r0 + 2060 * 16], m3
+ palignr m4, m2, m1, 6
+ movu [r0 + 2061 * 16], m4
+
+ ;mode 34 [row 7]
+ palignr m3, m1, m0, 7
+ movu [r0 + 2062 * 16], m3
+ palignr m4, m2, m1, 7
+ movu [r0 + 2063 * 16], m4
+
+ ;mode 34 [row 8]
+ palignr m3, m1, m0, 8
+ movu [r0 + 2064 * 16], m3
+ palignr m4, m2, m1, 8
+ movu [r0 + 2065 * 16], m4
+
+ ;mode 34 [row 9]
+ palignr m3, m1, m0, 9
+ movu [r0 + 2066 * 16], m3
+ palignr m4, m2, m1, 9
+ movu [r0 + 2067 * 16], m4
+
+ ;mode 34 [row 10]
+ palignr m3, m1, m0, 10
+ movu [r0 + 2068 * 16], m3
+ palignr m4, m2, m1, 10
+ movu [r0 + 2069 * 16], m4
+
+ ;mode 34 [row 11]
+ palignr m3, m1, m0, 11
+ movu [r0 + 2070 * 16], m3
+ palignr m4, m2, m1, 11
+ movu [r0 + 2071 * 16], m4
+
+ ;mode 34 [row 12]
+ palignr m3, m1, m0, 12
+ movu [r0 + 2072 * 16], m3
+ palignr m4, m2, m1, 12
+ movu [r0 + 2073 * 16], m4
+
+ ;mode 34 [row 13]
+ palignr m3, m1, m0, 13
+ movu [r0 + 2074 * 16], m3
+ palignr m4, m2, m1, 13
+ movu [r0 + 2075 * 16], m4
+
+ ;mode 34 [row 14]
+ palignr m3, m1, m0, 14
+ movu [r0 + 2076 * 16], m3
+ palignr m4, m2, m1, 14
+ movu [r0 + 2077 * 16], m4
+
+ ;mode 34 [row 15]
+ palignr m3, m1, m0, 15
+ movu [r0 + 2078 * 16], m3
+ palignr m4, m2, m1, 15
+ movu [r0 + 2079 * 16], m4
+
+ ;mode 34 [row 16]
+ palignr m3, m1, m0, 16
+ movu [r0 + 2080 * 16], m3
+ palignr m4, m2, m1, 16
+ movu [r0 + 2081 * 16], m4
+
+ ;mode 34 [row 17]
+ movu m0, [r3 + 19]
+ movu [r0 + 2082 * 16], m0
+ movu m1, [r3 + 35]
+ movu [r0 + 2083 * 16], m1
+
+ mov r2d, r6d
+ mov [r4], r2b
+ mov r2d, [rsp]
+ mov [r1 + 64], r2b
+
+ ;mode 34 [row 18]
+ movu m2, [r3 + 51]
+ palignr m3, m1, m0, 1
+ movu [r0 + 2084 * 16], m3
+ palignr m4, m2, m1, 1
+ movu [r0 + 2085 * 16], m4
+
+ ;mode 34 [row 19]
+ palignr m3, m1, m0, 2
+ movu [r0 + 2086 * 16], m3
+ palignr m4, m2, m1, 2
+ movu [r0 + 2087 * 16], m4
+
+ ;mode 34 [row 20]
+ palignr m3, m1, m0, 3
+ movu [r0 + 2088 * 16], m3
+ palignr m4, m2, m1, 3
+ movu [r0 + 2089 * 16], m4
+
+ ;mode 34 [row 21]
+ palignr m3, m1, m0, 4
+ movu [r0 + 2090 * 16], m3
+ palignr m4, m2, m1, 4
+ movu [r0 + 2091 * 16], m4
+
+ ;mode 34 [row 22]
+ palignr m3, m1, m0, 5
+ movu [r0 + 2092 * 16], m3
+ palignr m4, m2, m1, 5
+ movu [r0 + 2093 * 16], m4
+
+ ;mode 34 [row 23]
+ palignr m3, m1, m0, 6
+ movu [r0 + 2094 * 16], m3
+ palignr m4, m2, m1, 6
+ movu [r0 + 2095 * 16], m4
+
+ ;mode 34 [row 24]
+ palignr m3, m1, m0, 7
+ movu [r0 + 2096 * 16], m3
+ palignr m4, m2, m1, 7
+ movu [r0 + 2097 * 16], m4
+
+ ;mode 34 [row 25]
+ palignr m3, m1, m0, 8
+ movu [r0 + 2098 * 16], m3
+ palignr m4, m2, m1, 8
+ movu [r0 + 2099 * 16], m4
+
+ ;mode 34 [row 26]
+ palignr m3, m1, m0, 9
+ movu [r0 + 2100 * 16], m3
+ palignr m4, m2, m1, 9
+ movu [r0 + 2101 * 16], m4
+
+ ;mode 34 [row 27]
+ palignr m3, m1, m0, 10
+ movu [r0 + 2102 * 16], m3
+ palignr m4, m2, m1, 10
+ movu [r0 + 2103 * 16], m4
+
+ ;mode 34 [row 28]
+ palignr m3, m1, m0, 11
+ movu [r0 + 2104 * 16], m3
+ palignr m4, m2, m1, 11
+ movu [r0 + 2105 * 16], m4
+
+ ;mode 34 [row 29]
+ palignr m3, m1, m0, 12
+ movu [r0 + 2106 * 16], m3
+ palignr m4, m2, m1, 12
+ movu [r0 + 2107 * 16], m4
+
+ ;mode 34 [row 30]
+ palignr m3, m1, m0, 13
+ movu [r0 + 2108 * 16], m3
+ palignr m4, m2, m1, 13
+ movu [r0 + 2109 * 16], m4
+
+ ;mode 34 [row 31]
+ palignr m3, m1, m0, 14
+ movu [r0 + 2110 * 16], m3
+ palignr m4, m2, m1, 14
+ movu [r0 + 2111 * 16], m4
+ RET
diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm
index 52fc42c..c968cd3 100644
--- a/source/common/x86/ipfilter8.asm
+++ b/source/common/x86/ipfilter8.asm
@@ -32,6 +32,13 @@ tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
ALIGN 32
+const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
+
+ALIGN 32
+const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
+ dd 2, 3, 3, 4, 4, 5, 5, 6
+
+ALIGN 32
tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
@@ -42,7 +49,6 @@ tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
-tab_c_512: times 8 dw 512
tab_c_526336: times 4 dd 8192*64+2048
tab_ChromaCoeff: db 0, 64, 0, 0
@@ -123,13 +129,63 @@ tab_LumaCoeffVer: times 8 db 0, 0
times 8 db 58, -10
times 8 db 4, -1
-tab_c_128: times 16 db 0x80
+ALIGN 32
+tab_LumaCoeffVer_32: times 16 db 0, 0
+ times 16 db 0, 64
+ times 16 db 0, 0
+ times 16 db 0, 0
+
+ times 16 db -1, 4
+ times 16 db -10, 58
+ times 16 db 17, -5
+ times 16 db 1, 0
+
+ times 16 db -1, 4
+ times 16 db -11, 40
+ times 16 db 40, -11
+ times 16 db 4, -1
+
+ times 16 db 0, 1
+ times 16 db -5, 17
+ times 16 db 58, -10
+ times 16 db 4, -1
+
+ALIGN 32
+tab_ChromaCoeffVer_32: times 16 db 0, 64
+ times 16 db 0, 0
+
+ times 16 db -2, 58
+ times 16 db 10, -2
+
+ times 16 db -4, 54
+ times 16 db 16, -2
+
+ times 16 db -6, 46
+ times 16 db 28, -4
+
+ times 16 db -4, 36
+ times 16 db 36, -4
+
+ times 16 db -4, 28
+ times 16 db 46, -6
+
+ times 16 db -2, 16
+ times 16 db 54, -4
+
+ times 16 db -2, 10
+ times 16 db 58, -2
+
tab_c_64_n64: times 8 db 64, -64
+const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+ALIGN 32
+interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
SECTION .text
-cextern idct4_shuf1
+cextern pb_128
cextern pw_1
cextern pw_512
cextern pw_2000
@@ -171,7 +227,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 2
@@ -203,7 +259,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 4
@@ -235,7 +291,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mov r5d, 16/2
@@ -285,7 +341,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
FILTER_H4_w4_2 t0, t1, t2
@@ -313,7 +369,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 2
@@ -345,7 +401,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 4
@@ -377,7 +433,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
%rep 8
@@ -409,7 +465,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
%endif
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mov r5d, 32/2
@@ -423,6 +479,9 @@ jnz .loop
RET
+ALIGN 32
+const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
+
%macro FILTER_H4_w6 3
movu %1, [srcq - 1]
@@ -606,7 +665,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
mov r5d, %2
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mova Tm1, [tab_Tm + 16]
@@ -662,7 +721,7 @@ movd coef2, [tab_ChromaCoeff + r4 * 4]
mov r5d, %2
pshufd coef2, coef2, 0
-mova t2, [tab_c_512]
+mova t2, [pw_512]
mova Tm0, [tab_Tm]
mova Tm1, [tab_Tm + 16]
@@ -749,7 +808,7 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
punpcklqdq m3, m3
%ifidn %3, pp
- mova m2, [tab_c_512]
+ mova m2, [pw_512]
%else
mova m2, [pw_2000]
%endif
@@ -845,7 +904,7 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6
pmulhrsw m3, [pw_512]
vextracti128 xm4, m3, 1
packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
- pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
+ pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0]
lea r0, [r3 * 3]
movd [r2], xm3
@@ -854,7 +913,664 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6
pextrd [r2+r0], xm3, 3
RET
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastq m0, [r5 + r4 * 8]
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+ mova m1, [tab_Lm]
+ mova m2, [tab_Lm + 32]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1, m2 - shuffle order table
+
+ sub r0, 3
+ lea r5, [r1 * 3]
+ lea r4, [r3 * 3]
+
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m3, m2
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ phaddw m3, m4
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+
+ phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
+ pmulhrsw m3, [pw_512]
+
+ ; Row 2
+ vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m6, m5, m2
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddubsw m6, m0
+ phaddw m5, m6
+
+ phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
+ vextracti128 xm4, m3, 1
+ punpcklwd xm5, xm3, xm4
+
+ movq [r2], xm5
+ movhps [r2 + r3], xm5
+
+ punpckhwd xm5, xm3, xm4
+ movq [r2 + r3 * 2], xm5
+ movhps [r2 + r4], xm5
+ RET
+
+%macro IPFILTER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastq m0, [r5 + r4 * 8]
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+ mova m1, [tab_Lm]
+ mova m2, [tab_Lm + 32]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1, m2 - shuffle order table
+
+ sub r0, 3
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+ mov r4d, %2 / 4
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m3, m2
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ phaddw m3, m4
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+
+ phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
+ pmulhrsw m3, [pw_512]
+
+ ; Row 2
+ vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m6, m5, m2
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddubsw m6, m0
+ phaddw m5, m6
+
+ phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
+ vextracti128 xm4, m3, 1
+ punpcklwd xm5, xm3, xm4
+
+ movq [r2], xm5
+ movhps [r2 + r3], xm5
+
+ punpckhwd xm5, xm3, xm4
+ movq [r2 + r3 * 2], xm5
+ movhps [r2 + r6], xm5
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+IPFILTER_LUMA_AVX2_8xN 8, 8
+IPFILTER_LUMA_AVX2_8xN 8, 16
+IPFILTER_LUMA_AVX2_8xN 8, 32
+
+%macro IPFILTER_LUMA_AVX2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0 , m1 interpolate coeff
+ ; m2 , m2 shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, %2/2
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row0
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2+r3], xm5
+ lea r0, [r0 + r1 * 2]
+ lea r2, [r2 + r3 * 2]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+%macro IPFILTER_LUMA_32x_avx2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0 , m1 interpolate coeff
+ ; m2 , m2 shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, %2
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + 16]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 24]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2 + 16], xm5
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+%macro IPFILTER_LUMA_64x_avx2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0 , m1 interpolate coeff
+ ; m2 , m2 shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, %2
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + 16]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 24]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2 + 16], xm5
+
+ vbroadcasti128 m4, [r0 + 32]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 40]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5
+ pmulhrsw m4, [pw_512]
+ vbroadcasti128 m2, [r0 + 48]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 56]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2 +32], xm4
+ movu [r2 + 48], xm5
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_48x64, 4,6,8
+ sub r0, 3
+ mov r4d, r4m
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8]
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+ movu m3, [tab_Tm + 16]
+ vpbroadcastd m7, [pw_1]
+
+ ; register map
+ ; m0 , m1 interpolate coeff
+ ; m2 , m2 shuffle order table
+ ; m7 - pw_1
+
+ mov r4d, 64
+.loop:
+ ; Row 0
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 8]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
+ pmulhrsw m4, [pw_512]
+
+ vbroadcasti128 m2, [r0 + 16]
+ pshufb m5, m2, m3
+ pshufb m2, [tab_Tm]
+ pmaddubsw m2, m0
+ pmaddubsw m5, m1
+ paddw m2, m5
+ pmaddwd m2, m7
+ vbroadcasti128 m5, [r0 + 24]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m2, m5
+ pmulhrsw m2, [pw_512]
+ packuswb m4, m2
+ vpermq m4, m4, 11011000b
+ vextracti128 xm5, m4, 1
+ pshufd xm4, xm4, 11011000b
+ pshufd xm5, xm5, 11011000b
+ movu [r2], xm4
+ movu [r2 + 16], xm5
+
+ vbroadcasti128 m4, [r0 + 32]
+ pshufb m5, m4, m3
+ pshufb m4, [tab_Tm]
+ pmaddubsw m4, m0
+ pmaddubsw m5, m1
+ paddw m4, m5
+ pmaddwd m4, m7
+ vbroadcasti128 m5, [r0 + 40]
+ pshufb m6, m5, m3
+ pshufb m5, [tab_Tm]
+ pmaddubsw m5, m0
+ pmaddubsw m6, m1
+ paddw m5, m6
+ pmaddwd m5, m7
+ packssdw m4, m5
+ pmulhrsw m4, [pw_512]
+ packuswb m4, m4
+ vpermq m4, m4, 11011000b
+ pshufd xm4, xm4, 11011000b
+ movu [r2 + 32], xm4
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+ dec r4d
+ jnz .loop
+ RET
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_4x4, 4,6,6
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ vpbroadcastd m2, [pw_1]
+ vbroadcasti128 m1, [tab_Tm]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ dec r0
+
+ ; Row 0-1
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ vinserti128 m3, m3, [r0 + r1], 1
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+
+ ; Row 2-3
+ lea r0, [r0 + r1 * 2]
+ vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ vinserti128 m4, m4, [r0 + r1], 1
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+
+ packssdw m3, m4
+ pmulhrsw m3, [pw_512]
+ vextracti128 xm4, m3, 1
+ packuswb xm3, xm4
+
+ lea r0, [r3 * 3]
+ movd [r2], xm3
+ pextrd [r2+r3], xm3, 2
+ pextrd [r2+r3*2], xm3, 1
+ pextrd [r2+r0], xm3, 3
+ RET
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_32x32, 4,6,7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ mova m1, [interp4_horiz_shuf1]
+ vpbroadcastd m2, [pw_1]
+ mova m6, [pw_512]
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ dec r0
+ mov r4d, 32
+
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+ vbroadcasti128 m4, [r0 + 4]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ packssdw m3, m4
+ pmulhrsw m3, m6
+
+ vbroadcasti128 m4, [r0 + 16]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ vbroadcasti128 m5, [r0 + 20]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ packssdw m4, m5
+ pmulhrsw m4, m6
+
+ packuswb m3, m4
+ vpermq m3, m3, 11011000b
+
+ movu [r2], m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ dec r4d
+ jnz .loop
+ RET
+
+
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ mova m6, [pw_512]
+ mova m1, [interp4_horiz_shuf1]
+ vpbroadcastd m2, [pw_1]
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ dec r0
+ mov r4d, 8
+
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+ vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ packssdw m3, m4
+ pmulhrsw m3, m6
+
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ packssdw m4, m5
+ pmulhrsw m4, m6
+
+ packuswb m3, m4
+ vpermq m3, m3, 11011000b
+
+ vextracti128 xm4, m3, 1
+ movu [r2], xm3
+ movu [r2 + r3], xm4
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ dec r4d
+ jnz .loop
+ RET
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
@@ -863,6 +1579,91 @@ cglobal interp_8tap_horiz_pp_4x4, 4,6,6
IPFILTER_LUMA 12, 16, pp
IPFILTER_LUMA 4, 16, pp
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_8x8, 4,6,6
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 4]
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ movu m1, [tab_Tm]
+ vpbroadcastd m2, [pw_1]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ sub r0, 1
+ mov r4d, 2
+
+.loop:
+ ; Row 0
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+
+ ; Row 1
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ packssdw m3, m4
+ pmulhrsw m3, [pw_512]
+ lea r0, [r0 + r1 * 2]
+
+ ; Row 2
+ vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ packssdw m4, m5
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
+ mova m5, [interp_4tap_8x8_horiz_shuf]
+ vpermd m3, m5, m3
+ vextracti128 xm4, m3, 1
+ movq [r2], xm3
+ movhps [r2 + r3], xm3
+ lea r2, [r2 + r3 * 2]
+ movq [r2], xm4
+ movhps [r2 + r3], xm4
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1*2]
+ dec r4d
+ jnz .loop
+ RET
+
+ IPFILTER_LUMA_AVX2 16, 4
+ IPFILTER_LUMA_AVX2 16, 8
+ IPFILTER_LUMA_AVX2 16, 12
+ IPFILTER_LUMA_AVX2 16, 16
+ IPFILTER_LUMA_AVX2 16, 32
+ IPFILTER_LUMA_AVX2 16, 64
+
+ IPFILTER_LUMA_32x_avx2 32 , 8
+ IPFILTER_LUMA_32x_avx2 32 , 16
+ IPFILTER_LUMA_32x_avx2 32 , 24
+ IPFILTER_LUMA_32x_avx2 32 , 32
+ IPFILTER_LUMA_32x_avx2 32 , 64
+
+ IPFILTER_LUMA_64x_avx2 64 , 64
+ IPFILTER_LUMA_64x_avx2 64 , 48
+ IPFILTER_LUMA_64x_avx2 64 , 32
+ IPFILTER_LUMA_64x_avx2 64 , 16
+
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
@@ -1016,7 +1817,7 @@ IPFILTER_LUMA_PP_W8 64, 64
;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse4
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
%define coef m7
%define stk_buf rsp
@@ -1040,7 +1841,7 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
mov r4, rsp
.loopH:
- FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
+ FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
psubw m1, [pw_2000]
mova [r4], m1
@@ -1108,7 +1909,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4]
lea r4, [r1 * 3]
lea r5, [r0 + 4 * r1]
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
movd m2, [r0]
movd m3, [r0 + r1]
@@ -1181,7 +1982,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4]
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
mov r4d, %2
lea r5, [3 * r1]
@@ -1289,7 +2090,7 @@ pmaddubsw m3, m0
phaddw m2, m3
-pmulhrsw m2, [tab_c_512]
+pmulhrsw m2, [pw_512]
packuswb m2, m2
movd [r2], m2
pextrd [r2 + r3], m2, 1
@@ -1313,7 +2114,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4]
%endif
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
lea r5, [r0 + 4 * r1]
lea r4, [r1 * 3]
@@ -1369,6 +2170,51 @@ pextrd [r2 + r3], m2, 3
RET
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 4-tap vertical chroma interpolation, 4x4 block.
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
+    mov r4d, r4m
+    shl r4d, 6                              ; coeffIdx * 64: each filter entry is 2 * mmsize bytes
+    sub r0, r1                              ; step back one row; the 4-tap filter reads row -1
+
+%ifdef PIC
+    lea r5, [tab_ChromaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea r4, [r1 * 3]
+
+    ; gather the seven source rows needed to produce four output rows
+    movd xm1, [r0]
+    pinsrd xm1, [r0 + r1], 1
+    pinsrd xm1, [r0 + r1 * 2], 2
+    pinsrd xm1, [r0 + r4], 3                ; m1 = row[3 2 1 0]
+    lea r0, [r0 + r1 * 4]
+    movd xm2, [r0]
+    pinsrd xm2, [r0 + r1], 1
+    pinsrd xm2, [r0 + r1 * 2], 2            ; m2 = row[x 6 5 4]
+    vinserti128 m1, m1, xm2, 1              ; m1 = row[x 6 5 4 3 2 1 0]
+    ; replicate row pairs so each output row sees the two rows its taps need
+    mova m2, [interp4_vpp_shuf1]
+    vpermd m0, m2, m1                       ; m0 = row[4 3 3 2 2 1 1 0]
+    mova m2, [interp4_vpp_shuf1 + mmsize]
+    vpermd m1, m2, m1                       ; m1 = row[6 5 5 4 4 3 3 2]
+
+    mova m2, [interp4_vpp_shuf]
+    pshufb m0, m0, m2
+    pshufb m1, m1, m2
+    pmaddubsw m0, [r5]                      ; taps 0/1
+    pmaddubsw m1, [r5 + mmsize]             ; taps 2/3
+    paddw m0, m1                            ; m0 = WORD ROW[3 2 1 0]
+    pmulhrsw m0, [pw_512]                   ; round-shift by 6 (pmulhrsw by 512)
+    vextracti128 xm1, m0, 1
+    packuswb xm0, xm1                       ; clip to unsigned pixel range
+    lea r5, [r3 * 3]
+    movd [r2], xm0
+    pextrd [r2 + r3], xm0, 1
+    pextrd [r2 + r3 * 2], xm0, 2
+    pextrd [r2 + r5], xm0, 3
+    RET
+
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -1388,7 +2234,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4]
pshufb m0, [tab_Cm]
-mova m1, [tab_c_512]
+mova m1, [pw_512]
mov r4d, %2
@@ -1590,7 +2436,7 @@ pmaddubsw m4, m5
paddw m0, m4
-mova m4, [tab_c_512]
+mova m4, [pw_512]
pmulhrsw m0, m4
packuswb m0, m0
@@ -2495,7 +3341,7 @@ movd m5, [tab_ChromaCoeff + r4 * 4]
pshufb m6, m5, [tab_Vm]
pshufb m5, [tab_Vm + 16]
-mova m4, [tab_c_512]
+mova m4, [pw_512]
lea r5, [r1 * 3]
mov r4d, %2
@@ -2573,6 +3419,84 @@ FILTER_V4_W8_H8_H16_H32 8, 32
FILTER_V4_W8_H8_H16_H32 8, 12
FILTER_V4_W8_H8_H16_H32 8, 64
+; Filter eight 8-pixel-wide rows with the 4-tap vertical chroma filter.
+; Expects (per the uses below -- NOTE(review): caller contract, confirm at call sites):
+;   r0 = src (one row above the block), r1 = srcStride, r4 = 3 * srcStride,
+;   r5 = coeff table (two ymm entries: taps 0/1 then taps 2/3)
+; Output word rows (one row per xmm lane, unrounded):
+;   m5 = rows 0-1, m2 = rows 2-3, m1 = rows 4-5, m4 = rows 6-7
+; Clobbers m0, m3, m6; advances r0 by 8 rows.
+%macro PROCESS_CHROMA_AVX2_W8_8R 0
+    movq xm1, [r0]                          ; m1 = row 0
+    movq xm2, [r0 + r1]                     ; m2 = row 1
+    punpcklbw xm1, xm2                      ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq xm3, [r0 + r1 * 2]                 ; m3 = row 2
+    punpcklbw xm2, xm3                      ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128 m5, m1, xm2, 1              ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw m5, [r5]
+    movq xm4, [r0 + r4]                     ; m4 = row 3
+    punpcklbw xm3, xm4                      ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea r0, [r0 + r1 * 4]
+    movq xm1, [r0]                          ; m1 = row 4
+    punpcklbw xm4, xm1                      ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128 m2, m3, xm4, 1              ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw m0, m2, [r5 + 1 * mmsize]
+    paddw m5, m0
+    pmaddubsw m2, [r5]
+    movq xm3, [r0 + r1]                     ; m3 = row 5
+    punpcklbw xm1, xm3                      ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq xm4, [r0 + r1 * 2]                 ; m4 = row 6
+    punpcklbw xm3, xm4                      ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128 m1, m1, xm3, 1              ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw m0, m1, [r5 + 1 * mmsize]
+    paddw m2, m0
+    pmaddubsw m1, [r5]
+    movq xm3, [r0 + r4]                     ; m3 = row 7
+    punpcklbw xm4, xm3                      ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    lea r0, [r0 + r1 * 4]
+    movq xm0, [r0]                          ; m0 = row 8
+    punpcklbw xm3, xm0                      ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+    vinserti128 m4, m4, xm3, 1              ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    pmaddubsw m3, m4, [r5 + 1 * mmsize]
+    paddw m1, m3
+    pmaddubsw m4, [r5]
+    movq xm3, [r0 + r1]                     ; m3 = row 9
+    punpcklbw xm0, xm3                      ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    movq xm6, [r0 + r1 * 2]                 ; m6 = row 10
+    punpcklbw xm3, xm6                      ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+    vinserti128 m0, m0, xm3, 1              ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    pmaddubsw m0, [r5 + 1 * mmsize]
+    paddw m4, m0
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 4-tap vertical chroma interpolation, 8x8 block.
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
+    mov r4d, r4m
+    shl r4d, 6                              ; coeffIdx * 64: each filter entry is 2 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_ChromaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea r4, [r1 * 3]
+    sub r0, r1                              ; step back one row; the 4-tap filter reads row -1
+    PROCESS_CHROMA_AVX2_W8_8R               ; word rows in m5/m2/m1/m4
+    lea r4, [r3 * 3]
+    mova m3, [pw_512]
+    pmulhrsw m5, m3                         ; m5 = word: row 0, row 1
+    pmulhrsw m2, m3                         ; m2 = word: row 2, row 3
+    pmulhrsw m1, m3                         ; m1 = word: row 4, row 5
+    pmulhrsw m4, m3                         ; m4 = word: row 6, row 7
+    packuswb m5, m2
+    packuswb m1, m4
+    vextracti128 xm2, m5, 1
+    vextracti128 xm4, m1, 1
+    ; low/high halves of each xmm hold alternating rows; store 8 bytes per row
+    movq [r2], xm5
+    movq [r2 + r3], xm2
+    movhps [r2 + r3 * 2], xm5
+    movhps [r2 + r4], xm2
+    lea r2, [r2 + r3 * 4]
+    movq [r2], xm1
+    movq [r2 + r3], xm4
+    movhps [r2 + r3 * 2], xm1
+    movhps [r2 + r4], xm4
+    RET
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -2593,7 +3517,7 @@ movd m5, [tab_ChromaCoeff + r4 * 4]
pshufb m6, m5, [tab_Vm]
pshufb m5, [tab_Vm + 16]
-mova m4, [tab_c_512]
+mova m4, [pw_512]
mov r4d, %2
lea r5, [3 * r1]
@@ -2716,7 +3640,7 @@ punpckhbw m6, m5, m7
pmaddubsw m6, m0
paddw m2, m6
-mova m6, [tab_c_512]
+mova m6, [pw_512]
pmulhrsw m4, m6
pmulhrsw m2, m6
@@ -2806,7 +3730,7 @@ punpcklbw m7, m5, m6
pmaddubsw m7, m0
paddw m4, m7
-mova m7, [tab_c_512]
+mova m7, [pw_512]
pmulhrsw m4, m7
pmulhrsw m2, m7
@@ -2855,6 +3779,217 @@ FILTER_V4_W16_H2 16, 32
FILTER_V4_W16_H2 16, 24
FILTER_V4_W16_H2 16, 64
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 4-tap vertical chroma interpolation, 16x16 block.
+; x86-64 only: uses xmm8-xmm14 (15 vector registers).
+; Processes rows 0-7 and rows 8-15 as two fully-unrolled halves, keeping the
+; row-pair products of the overlap rows live across the midpoint store.
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
+    mov r4d, r4m
+    shl r4d, 6                              ; coeffIdx * 64: each filter entry is 2 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_ChromaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    mova m12, [r5]                          ; m12 = taps 0/1
+    mova m13, [r5 + mmsize]                 ; m13 = taps 2/3
+    lea r4, [r1 * 3]
+    sub r0, r1                              ; step back one row; the 4-tap filter reads row -1
+    lea r5, [r3 * 3]
+    mova m14, [pw_512]                      ; rounding constant for pmulhrsw (>> 6)
+
+    ; ---- first half: accumulate rows 0..10 ----
+    movu xm0, [r0]                          ; m0 = row 0
+    movu xm1, [r0 + r1]                     ; m1 = row 1
+    punpckhbw xm2, xm0, xm1
+    punpcklbw xm0, xm1
+    vinserti128 m0, m0, xm2, 1
+    pmaddubsw m0, m12
+    movu xm2, [r0 + r1 * 2]                 ; m2 = row 2
+    punpckhbw xm3, xm1, xm2
+    punpcklbw xm1, xm2
+    vinserti128 m1, m1, xm3, 1
+    pmaddubsw m1, m12
+    movu xm3, [r0 + r4]                     ; m3 = row 3
+    punpckhbw xm4, xm2, xm3
+    punpcklbw xm2, xm3
+    vinserti128 m2, m2, xm4, 1
+    pmaddubsw m4, m2, m13
+    paddw m0, m4
+    pmaddubsw m2, m12
+    lea r0, [r0 + r1 * 4]
+    movu xm4, [r0]                          ; m4 = row 4
+    punpckhbw xm5, xm3, xm4
+    punpcklbw xm3, xm4
+    vinserti128 m3, m3, xm5, 1
+    pmaddubsw m5, m3, m13
+    paddw m1, m5
+    pmaddubsw m3, m12
+    movu xm5, [r0 + r1]                     ; m5 = row 5
+    punpckhbw xm6, xm4, xm5
+    punpcklbw xm4, xm5
+    vinserti128 m4, m4, xm6, 1
+    pmaddubsw m6, m4, m13
+    paddw m2, m6
+    pmaddubsw m4, m12
+    movu xm6, [r0 + r1 * 2]                 ; m6 = row 6
+    punpckhbw xm7, xm5, xm6
+    punpcklbw xm5, xm6
+    vinserti128 m5, m5, xm7, 1
+    pmaddubsw m7, m5, m13
+    paddw m3, m7
+    pmaddubsw m5, m12
+    movu xm7, [r0 + r4]                     ; m7 = row 7
+    punpckhbw xm8, xm6, xm7
+    punpcklbw xm6, xm7
+    vinserti128 m6, m6, xm8, 1
+    pmaddubsw m8, m6, m13
+    paddw m4, m8
+    pmaddubsw m6, m12
+    lea r0, [r0 + r1 * 4]
+    movu xm8, [r0]                          ; m8 = row 8
+    punpckhbw xm9, xm7, xm8
+    punpcklbw xm7, xm8
+    vinserti128 m7, m7, xm9, 1
+    pmaddubsw m9, m7, m13
+    paddw m5, m9
+    pmaddubsw m7, m12
+    movu xm9, [r0 + r1]                     ; m9 = row 9
+    punpckhbw xm10, xm8, xm9
+    punpcklbw xm8, xm9
+    vinserti128 m8, m8, xm10, 1
+    pmaddubsw m10, m8, m13
+    paddw m6, m10
+    pmaddubsw m8, m12
+    movu xm10, [r0 + r1 * 2]                ; m10 = row 10
+    punpckhbw xm11, xm9, xm10
+    punpcklbw xm9, xm10
+    vinserti128 m9, m9, xm11, 1
+    pmaddubsw m11, m9, m13
+    paddw m7, m11
+    pmaddubsw m9, m12
+
+    ; ---- round, pack and store output rows 0-7 ----
+    pmulhrsw m0, m14                        ; m0 = word: row 0
+    pmulhrsw m1, m14                        ; m1 = word: row 1
+    pmulhrsw m2, m14                        ; m2 = word: row 2
+    pmulhrsw m3, m14                        ; m3 = word: row 3
+    pmulhrsw m4, m14                        ; m4 = word: row 4
+    pmulhrsw m5, m14                        ; m5 = word: row 5
+    pmulhrsw m6, m14                        ; m6 = word: row 6
+    pmulhrsw m7, m14                        ; m7 = word: row 7
+    packuswb m0, m1
+    packuswb m2, m3
+    packuswb m4, m5
+    packuswb m6, m7
+    vpermq m0, m0, 11011000b                ; undo lane interleave from packuswb
+    vpermq m2, m2, 11011000b
+    vpermq m4, m4, 11011000b
+    vpermq m6, m6, 11011000b
+    vextracti128 xm1, m0, 1
+    vextracti128 xm3, m2, 1
+    vextracti128 xm5, m4, 1
+    vextracti128 xm7, m6, 1
+    movu [r2], xm0
+    movu [r2 + r3], xm1
+    movu [r2 + r3 * 2], xm2
+    movu [r2 + r5], xm3
+    lea r2, [r2 + r3 * 4]
+    movu [r2], xm4
+    movu [r2 + r3], xm5
+    movu [r2 + r3 * 2], xm6
+    movu [r2 + r5], xm7
+    lea r2, [r2 + r3 * 4]
+
+    ; ---- second half: accumulate rows 11..18 ----
+    movu xm11, [r0 + r4]                    ; m11 = row 11
+    punpckhbw xm6, xm10, xm11
+    punpcklbw xm10, xm11
+    vinserti128 m10, m10, xm6, 1
+    pmaddubsw m6, m10, m13
+    paddw m8, m6
+    pmaddubsw m10, m12
+    lea r0, [r0 + r1 * 4]
+    movu xm6, [r0]                          ; m6 = row 12
+    punpckhbw xm7, xm11, xm6
+    punpcklbw xm11, xm6
+    vinserti128 m11, m11, xm7, 1
+    pmaddubsw m7, m11, m13
+    paddw m9, m7
+    pmaddubsw m11, m12
+
+    movu xm7, [r0 + r1]                     ; m7 = row 13
+    punpckhbw xm0, xm6, xm7
+    punpcklbw xm6, xm7
+    vinserti128 m6, m6, xm0, 1
+    pmaddubsw m0, m6, m13
+    paddw m10, m0
+    pmaddubsw m6, m12
+    movu xm0, [r0 + r1 * 2]                 ; m0 = row 14
+    punpckhbw xm1, xm7, xm0
+    punpcklbw xm7, xm0
+    vinserti128 m7, m7, xm1, 1
+    pmaddubsw m1, m7, m13
+    paddw m11, m1
+    pmaddubsw m7, m12
+    movu xm1, [r0 + r4]                     ; m1 = row 15
+    punpckhbw xm2, xm0, xm1
+    punpcklbw xm0, xm1
+    vinserti128 m0, m0, xm2, 1
+    pmaddubsw m2, m0, m13
+    paddw m6, m2
+    pmaddubsw m0, m12
+    lea r0, [r0 + r1 * 4]
+    movu xm2, [r0]                          ; m2 = row 16
+    punpckhbw xm3, xm1, xm2
+    punpcklbw xm1, xm2
+    vinserti128 m1, m1, xm3, 1
+    pmaddubsw m3, m1, m13
+    paddw m7, m3
+    pmaddubsw m1, m12
+    movu xm3, [r0 + r1]                     ; m3 = row 17
+    punpckhbw xm4, xm2, xm3
+    punpcklbw xm2, xm3
+    vinserti128 m2, m2, xm4, 1
+    pmaddubsw m2, m13
+    paddw m0, m2
+    movu xm4, [r0 + r1 * 2]                 ; m4 = row 18
+    punpckhbw xm5, xm3, xm4
+    punpcklbw xm3, xm4
+    vinserti128 m3, m3, xm5, 1
+    pmaddubsw m3, m13
+    paddw m1, m3
+
+    ; ---- round, pack and store output rows 8-15 ----
+    pmulhrsw m8, m14                        ; m8 = word: row 8
+    pmulhrsw m9, m14                        ; m9 = word: row 9
+    pmulhrsw m10, m14                       ; m10 = word: row 10
+    pmulhrsw m11, m14                       ; m11 = word: row 11
+    pmulhrsw m6, m14                        ; m6 = word: row 12
+    pmulhrsw m7, m14                        ; m7 = word: row 13
+    pmulhrsw m0, m14                        ; m0 = word: row 14
+    pmulhrsw m1, m14                        ; m1 = word: row 15
+    packuswb m8, m9
+    packuswb m10, m11
+    packuswb m6, m7
+    packuswb m0, m1
+    vpermq m8, m8, 11011000b
+    vpermq m10, m10, 11011000b
+    vpermq m6, m6, 11011000b
+    vpermq m0, m0, 11011000b
+    vextracti128 xm9, m8, 1
+    vextracti128 xm11, m10, 1
+    vextracti128 xm7, m6, 1
+    vextracti128 xm1, m0, 1
+    movu [r2], xm8
+    movu [r2 + r3], xm9
+    movu [r2 + r3 * 2], xm10
+    movu [r2 + r5], xm11
+    lea r2, [r2 + r3 * 4]
+    movu [r2], xm6
+    movu [r2 + r3], xm7
+    movu [r2 + r3 * 2], xm0
+    movu [r2 + r5], xm1
+    RET
+%endif
+
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -2899,7 +4034,7 @@ punpckhbw m6, m5, m7
pmaddubsw m6, m0
paddw m2, m6
-mova m6, [tab_c_512]
+mova m6, [pw_512]
pmulhrsw m4, m6
pmulhrsw m2, m6
@@ -2998,7 +4133,7 @@ movd m0, [tab_ChromaCoeff + r4 * 4]
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
-mova m7, [tab_c_512]
+mova m7, [pw_512]
mov r4d, %2
@@ -3076,6 +4211,96 @@ FILTER_V4_W32 32, 32
FILTER_V4_W32 32, 48
FILTER_V4_W32 32, 64
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_32x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 4-tap vertical chroma interpolation, 32x32 block.
+; x86-64 only: uses xmm8-xmm12 (13 vector registers).
+; One ymm register holds a full 32-pixel row; the loop emits 4 rows per
+; iteration and runs 8 times (r6d) for all 32 rows, carrying the partial
+; sums of the overlap rows from one iteration into the next.
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
+    mov r4d, r4m
+    shl r4d, 6                              ; coeffIdx * 64: each filter entry is 2 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_ChromaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    mova m10, [r5]                          ; m10 = taps 0/1
+    mova m11, [r5 + mmsize]                 ; m11 = taps 2/3
+    lea r4, [r1 * 3]
+    sub r0, r1                              ; step back one row; the 4-tap filter reads row -1
+    lea r5, [r3 * 3]
+    mova m12, [pw_512]                      ; rounding constant for pmulhrsw (>> 6)
+    mov r6d, 8                              ; 8 iterations x 4 rows = 32 rows
+.loopW:
+    movu m0, [r0]                           ; m0 = row 0
+    movu m1, [r0 + r1]                      ; m1 = row 1
+    punpcklbw m2, m0, m1
+    punpckhbw m3, m0, m1
+    pmaddubsw m2, m10
+    pmaddubsw m3, m10
+    movu m0, [r0 + r1 * 2]                  ; m0 = row 2
+    punpcklbw m4, m1, m0
+    punpckhbw m5, m1, m0
+    pmaddubsw m4, m10
+    pmaddubsw m5, m10
+    movu m1, [r0 + r4]                      ; m1 = row 3
+    punpcklbw m6, m0, m1
+    punpckhbw m7, m0, m1
+    pmaddubsw m8, m6, m11
+    pmaddubsw m9, m7, m11
+    pmaddubsw m6, m10
+    pmaddubsw m7, m10
+    paddw m2, m8
+    paddw m3, m9
+    pmulhrsw m2, m12
+    pmulhrsw m3, m12
+    packuswb m2, m3
+    movu [r2], m2                           ; output row 0
+
+    lea r0, [r0 + r1 * 4]
+    movu m0, [r0]                           ; m0 = row 4
+    punpcklbw m2, m1, m0
+    punpckhbw m3, m1, m0
+    pmaddubsw m8, m2, m11
+    pmaddubsw m9, m3, m11
+    pmaddubsw m2, m10
+    pmaddubsw m3, m10
+    paddw m4, m8
+    paddw m5, m9
+    pmulhrsw m4, m12
+    pmulhrsw m5, m12
+    packuswb m4, m5
+    movu [r2 + r3], m4                      ; output row 1
+
+    movu m1, [r0 + r1]                      ; m1 = row 5
+    punpcklbw m4, m0, m1
+    punpckhbw m5, m0, m1
+    pmaddubsw m4, m11
+    pmaddubsw m5, m11
+    paddw m6, m4
+    paddw m7, m5
+    pmulhrsw m6, m12
+    pmulhrsw m7, m12
+    packuswb m6, m7
+    movu [r2 + r3 * 2], m6                  ; output row 2
+
+    movu m0, [r0 + r1 * 2]                  ; m0 = row 6
+    punpcklbw m6, m1, m0
+    punpckhbw m7, m1, m0
+    pmaddubsw m6, m11
+    pmaddubsw m7, m11
+    paddw m2, m6
+    paddw m3, m7
+    pmulhrsw m2, m12
+    pmulhrsw m3, m12
+    packuswb m2, m3
+    movu [r2 + r5], m2                      ; output row 3
+
+    lea r2, [r2 + r3 * 4]
+    dec r6d
+    jnz .loopW
+    RET
+%endif
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3126,7 +4351,7 @@ punpcklbw m7, m5, m6
pmaddubsw m7, m0
paddw m4, m7
-mova m7, [tab_c_512]
+mova m7, [pw_512]
pmulhrsw m4, m7
pmulhrsw m2, m7
@@ -3190,7 +4415,7 @@ cglobal luma_p2s, 3, 7, 6
mov r4d, r4m
; load constant
- mova m4, [tab_c_128]
+ mova m4, [pb_128]
mova m5, [tab_c_64_n64]
.loopH:
@@ -3379,7 +4604,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
@@ -3421,6 +4646,149 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
RET
%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma interpolation, 4x4 block.
+; Transposes 4-pixel rows into columns (punpcklbw + punpcklwd), runs two
+; pmaddubsw stages (4 taps each), then collapses with pmaddwd against pw_1.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_4x4, 4,6,8
+    mov r4d, r4m
+    lea r5, [r1 * 3]
+    sub r0, r5                              ; step back three rows; the 8-tap filter reads rows -3..-1
+
+    ; TODO: VPGATHERDD
+    movd xm1, [r0]                          ; m1 = row0
+    movd xm2, [r0 + r1]                     ; m2 = row1
+    punpcklbw xm1, xm2                      ; m1 = [13 03 12 02 11 01 10 00]
+
+    movd xm3, [r0 + r1 * 2]                 ; m3 = row2
+    punpcklbw xm2, xm3                      ; m2 = [23 13 22 12 21 11 20 10]
+    movd xm4, [r0 + r5]
+    punpcklbw xm3, xm4                      ; m3 = [33 23 32 22 31 21 30 20]
+    punpcklwd xm1, xm3                      ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+
+    lea r0, [r0 + r1 * 4]
+    movd xm5, [r0]                          ; m5 = row4
+    punpcklbw xm4, xm5                      ; m4 = [43 33 42 32 41 31 40 30]
+    punpcklwd xm2, xm4                      ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
+    vinserti128 m1, m1, xm2, 1              ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+    movd xm2, [r0 + r1]                     ; m2 = row5
+    punpcklbw xm5, xm2                      ; m5 = [53 43 52 42 51 41 50 40]
+    punpcklwd xm3, xm5                      ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
+    movd xm6, [r0 + r1 * 2]                 ; m6 = row6
+    punpcklbw xm2, xm6                      ; m2 = [63 53 62 52 61 51 60 50]
+    punpcklwd xm4, xm2                      ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
+    vinserti128 m3, m3, xm4, 1              ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
+    movd xm4, [r0 + r5]                     ; m4 = row7
+    punpcklbw xm6, xm4                      ; m6 = [73 63 72 62 71 61 70 60]
+    punpcklwd xm5, xm6                      ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+
+    lea r0, [r0 + r1 * 4]
+    movd xm7, [r0]                          ; m7 = row8
+    punpcklbw xm4, xm7                      ; m4 = [83 73 82 72 81 71 80 70]
+    punpcklwd xm2, xm4                      ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
+    vinserti128 m5, m5, xm2, 1              ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+    movd xm2, [r0 + r1]                     ; m2 = row9
+    punpcklbw xm7, xm2                      ; m7 = [93 83 92 82 91 81 90 80]
+    punpcklwd xm6, xm7                      ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+    movd xm7, [r0 + r1 * 2]                 ; m7 = rowA
+    punpcklbw xm2, xm7                      ; m2 = [A3 93 A2 92 A1 91 A0 90]
+    punpcklwd xm4, xm2                      ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
+    vinserti128 m6, m6, xm4, 1              ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+
+    ; load filter coeff: two dwords = taps 0-3 and taps 4-7, broadcast per lane
+%ifdef PIC
+    lea r5, [tab_LumaCoeff]
+    vpbroadcastd m0, [r5 + r4 * 8 + 0]
+    vpbroadcastd m2, [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
+    vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+
+    pmaddubsw m1, m0                        ; first 4 taps, rows 0-1
+    pmaddubsw m3, m0                        ; first 4 taps, rows 2-3
+    pmaddubsw m5, m2                        ; last 4 taps, rows 0-1
+    pmaddubsw m6, m2                        ; last 4 taps, rows 2-3
+    vbroadcasti128 m0, [pw_1]
+    pmaddwd m1, m0                          ; horizontal pair-sum of word products
+    pmaddwd m3, m0
+    pmaddwd m5, m0
+    pmaddwd m6, m0
+    paddd m1, m5                            ; m1 = DQWORD ROW[1 0]
+    paddd m3, m6                            ; m3 = DQWORD ROW[3 2]
+    packssdw m1, m3                         ; m1 = QWORD ROW[3 1 2 0]
+
+    ; TODO: does it overflow?
+    pmulhrsw m1, [pw_512]                   ; round-shift by 6
+    vextracti128 xm2, m1, 1
+    packuswb xm1, xm2                       ; m1 = DWORD ROW[3 1 2 0]
+    movd [r2], xm1
+    pextrd [r2 + r3], xm1, 2                ; rows are interleaved [3 1 2 0]; pick in order
+    pextrd [r2 + r3 * 2], xm1, 1
+    lea r4, [r3 * 3]
+    pextrd [r2 + r4], xm1, 3
+    RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter, 4x4 block, 'ps' variant: emits unrounded
+; int16 intermediates (subtracts pw_2000 instead of rounding/packing).
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
+    mov r4d, r4m
+    shl r4d, 7                              ; coeffIdx * 128: each filter entry is 4 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_LumaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea r4, [r1 * 3]
+    sub r0, r4                              ; step back three rows; the 8-tap filter reads rows -3..-1
+
+    add r3d, r3d                            ; dstStride in bytes (int16 output)
+
+    ; gather the eleven source rows needed to produce four output rows
+    movd xm1, [r0]
+    pinsrd xm1, [r0 + r1], 1
+    pinsrd xm1, [r0 + r1 * 2], 2
+    pinsrd xm1, [r0 + r4], 3                ; m1 = row[3 2 1 0]
+    lea r0, [r0 + r1 * 4]
+    movd xm2, [r0]
+    pinsrd xm2, [r0 + r1], 1
+    pinsrd xm2, [r0 + r1 * 2], 2
+    pinsrd xm2, [r0 + r4], 3                ; m2 = row[7 6 5 4]
+    vinserti128 m1, m1, xm2, 1              ; m1 = row[7 6 5 4 3 2 1 0]
+    lea r0, [r0 + r1 * 4]
+    movd xm3, [r0]
+    pinsrd xm3, [r0 + r1], 1
+    pinsrd xm3, [r0 + r1 * 2], 2            ; m3 = row[x 10 9 8]
+    vinserti128 m2, m2, xm3, 1              ; m2 = row[x 10 9 8 7 6 5 4]
+    ; replicate row pairs so each output row sees all eight rows its taps need
+    mova m3, [interp4_vpp_shuf1]
+    vpermd m0, m3, m1                       ; m0 = row[4 3 3 2 2 1 1 0]
+    vpermd m4, m3, m2                       ; m4 = row[8 7 7 6 6 5 5 4]
+    mova m3, [interp4_vpp_shuf1 + mmsize]
+    vpermd m1, m3, m1                       ; m1 = row[6 5 5 4 4 3 3 2]
+    vpermd m2, m3, m2                       ; m2 = row[10 9 9 8 8 7 7 6]
+
+    mova m3, [interp4_vpp_shuf]
+    pshufb m0, m0, m3
+    pshufb m1, m1, m3
+    pshufb m4, m4, m3
+    pshufb m2, m2, m3
+    pmaddubsw m0, [r5]                      ; taps 0/1
+    pmaddubsw m1, [r5 + mmsize]             ; taps 2/3
+    pmaddubsw m4, [r5 + 2 * mmsize]         ; taps 4/5
+    pmaddubsw m2, [r5 + 3 * mmsize]         ; taps 6/7
+    paddw m0, m1
+    paddw m0, m4
+    paddw m0, m2                            ; m0 = WORD ROW[3 2 1 0]
+
+    vbroadcasti128 m3, [pw_2000]
+    psubw m0, m3                            ; remove DC offset for the intermediate format
+    vextracti128 xm2, m0, 1
+    lea r5, [r3 * 3]
+    movq [r2], xm0
+    movhps [r2 + r3], xm0
+    movq [r2 + r3 * 2], xm2
+    movhps [r2 + r5], xm2
+    RET
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
@@ -3451,6 +4819,122 @@ FILTER_VER_LUMA_4xN 4, 8, ps
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 16, ps
+; Filter eight 8-pixel-wide rows with the 8-tap vertical luma filter.
+; Expects (per the uses below -- NOTE(review): caller contract, confirm at call sites):
+;   r0 = src (three rows above the block), r1 = srcStride, r4 = 3 * srcStride,
+;   r5 = coeff table (four ymm entries: taps 0/1, 2/3, 4/5, 6/7)
+; Output word rows (one row per xmm lane, unrounded):
+;   m5 = rows 0-1, m2 = rows 2-3, m1 = rows 4-5, m4 = rows 6-7
+; Clobbers m0, m3, m6; advances r0 by 12 rows (reads rows 0..14).
+%macro PROCESS_LUMA_AVX2_W8_8R 0
+    movq xm1, [r0]                          ; m1 = row 0
+    movq xm2, [r0 + r1]                     ; m2 = row 1
+    punpcklbw xm1, xm2                      ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq xm3, [r0 + r1 * 2]                 ; m3 = row 2
+    punpcklbw xm2, xm3                      ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128 m5, m1, xm2, 1              ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw m5, [r5]
+    movq xm4, [r0 + r4]                     ; m4 = row 3
+    punpcklbw xm3, xm4                      ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea r0, [r0 + r1 * 4]
+    movq xm1, [r0]                          ; m1 = row 4
+    punpcklbw xm4, xm1                      ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128 m2, m3, xm4, 1              ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw m0, m2, [r5 + 1 * mmsize]
+    paddw m5, m0
+    pmaddubsw m2, [r5]
+    movq xm3, [r0 + r1]                     ; m3 = row 5
+    punpcklbw xm1, xm3                      ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq xm4, [r0 + r1 * 2]                 ; m4 = row 6
+    punpcklbw xm3, xm4                      ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128 m1, m1, xm3, 1              ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw m3, m1, [r5 + 2 * mmsize]
+    paddw m5, m3
+    pmaddubsw m0, m1, [r5 + 1 * mmsize]
+    paddw m2, m0
+    pmaddubsw m1, [r5]
+    movq xm3, [r0 + r4]                     ; m3 = row 7
+    punpcklbw xm4, xm3                      ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    lea r0, [r0 + r1 * 4]
+    movq xm0, [r0]                          ; m0 = row 8
+    punpcklbw xm3, xm0                      ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+    vinserti128 m4, m4, xm3, 1              ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    pmaddubsw m3, m4, [r5 + 3 * mmsize]
+    paddw m5, m3
+    pmaddubsw m3, m4, [r5 + 2 * mmsize]
+    paddw m2, m3
+    pmaddubsw m3, m4, [r5 + 1 * mmsize]
+    paddw m1, m3
+    pmaddubsw m4, [r5]
+    movq xm3, [r0 + r1]                     ; m3 = row 9
+    punpcklbw xm0, xm3                      ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    movq xm6, [r0 + r1 * 2]                 ; m6 = row 10
+    punpcklbw xm3, xm6                      ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+    vinserti128 m0, m0, xm3, 1              ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    pmaddubsw m3, m0, [r5 + 3 * mmsize]
+    paddw m2, m3
+    pmaddubsw m3, m0, [r5 + 2 * mmsize]
+    paddw m1, m3
+    pmaddubsw m0, [r5 + 1 * mmsize]
+    paddw m4, m0
+
+    movq xm3, [r0 + r4]                     ; m3 = row 11
+    punpcklbw xm6, xm3                      ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+    lea r0, [r0 + r1 * 4]
+    movq xm0, [r0]                          ; m0 = row 12
+    punpcklbw xm3, xm0                      ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
+    vinserti128 m6, m6, xm3, 1              ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+    pmaddubsw m3, m6, [r5 + 3 * mmsize]
+    paddw m1, m3
+    pmaddubsw m6, [r5 + 2 * mmsize]
+    paddw m4, m6
+    movq xm3, [r0 + r1]                     ; m3 = row 13
+    punpcklbw xm0, xm3                      ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+    movq xm6, [r0 + r1 * 2]                 ; m6 = row 14
+    punpcklbw xm3, xm6                      ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
+    vinserti128 m0, m0, xm3, 1              ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+    pmaddubsw m0, [r5 + 3 * mmsize]
+    paddw m4, m0
+%endmacro
+
+; Filter four 8-pixel-wide rows with the 8-tap vertical luma filter.
+; Same register contract as PROCESS_LUMA_AVX2_W8_8R (r0/r1/r4/r5), but:
+; Output word rows: m5 = rows 0-1, m2 = rows 2-3 (unrounded).
+; Clobbers m0, m1, m3, m4, m6; advances r0 by 8 rows (reads rows 0..10).
+%macro PROCESS_LUMA_AVX2_W8_4R 0
+    movq xm1, [r0]                          ; m1 = row 0
+    movq xm2, [r0 + r1]                     ; m2 = row 1
+    punpcklbw xm1, xm2                      ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq xm3, [r0 + r1 * 2]                 ; m3 = row 2
+    punpcklbw xm2, xm3                      ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128 m5, m1, xm2, 1              ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw m5, [r5]
+    movq xm4, [r0 + r4]                     ; m4 = row 3
+    punpcklbw xm3, xm4                      ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea r0, [r0 + r1 * 4]
+    movq xm1, [r0]                          ; m1 = row 4
+    punpcklbw xm4, xm1                      ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128 m2, m3, xm4, 1              ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw m0, m2, [r5 + 1 * mmsize]
+    paddw m5, m0
+    pmaddubsw m2, [r5]
+    movq xm3, [r0 + r1]                     ; m3 = row 5
+    punpcklbw xm1, xm3                      ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq xm4, [r0 + r1 * 2]                 ; m4 = row 6
+    punpcklbw xm3, xm4                      ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128 m1, m1, xm3, 1              ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw m3, m1, [r5 + 2 * mmsize]
+    paddw m5, m3
+    pmaddubsw m0, m1, [r5 + 1 * mmsize]
+    paddw m2, m0
+    movq xm3, [r0 + r4]                     ; m3 = row 7
+    punpcklbw xm4, xm3                      ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    lea r0, [r0 + r1 * 4]
+    movq xm0, [r0]                          ; m0 = row 8
+    punpcklbw xm3, xm0                      ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+    vinserti128 m4, m4, xm3, 1              ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+    pmaddubsw m3, m4, [r5 + 3 * mmsize]
+    paddw m5, m3
+    pmaddubsw m3, m4, [r5 + 2 * mmsize]
+    paddw m2, m3
+    movq xm3, [r0 + r1]                     ; m3 = row 9
+    punpcklbw xm0, xm3                      ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    movq xm6, [r0 + r1 * 2]                 ; m6 = row 10
+    punpcklbw xm3, xm6                      ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+    vinserti128 m0, m0, xm3, 1              ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+    pmaddubsw m3, m0, [r5 + 3 * mmsize]
+    paddw m2, m3
+%endmacro
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
@@ -3473,7 +4957,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
@@ -3520,6 +5004,115 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
RET
%endmacro
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter for 8-wide blocks whose height is a
+; multiple of 8 (%1 = 8, %2 = height). Loop count is kept in a stack word
+; because all six general registers are in use.
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
+    mov r4d, r4m
+    shl r4d, 7                              ; coeffIdx * 128: each filter entry is 4 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_LumaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+    lea r4, [r1 * 3]
+    sub r0, r4                              ; step back three rows; the 8-tap filter reads rows -3..-1
+    lea r6, [r1 * 4]
+    mov word [rsp], %2 / 8                  ; iteration count: 8 output rows per pass
+    mova m7, [pw_512]
+
+.loop:
+    PROCESS_LUMA_AVX2_W8_8R                 ; word rows in m5/m2/m1/m4; advances r0 by 12 rows
+    pmulhrsw m5, m7                         ; m5 = word: row 0, row 1
+    pmulhrsw m2, m7                         ; m2 = word: row 2, row 3
+    pmulhrsw m1, m7                         ; m1 = word: row 4, row 5
+    pmulhrsw m4, m7                         ; m4 = word: row 6, row 7
+    packuswb m5, m2
+    packuswb m1, m4
+    vextracti128 xm2, m5, 1
+    vextracti128 xm4, m1, 1
+    movq [r2], xm5
+    movq [r2 + r3], xm2
+    lea r2, [r2 + r3 * 2]
+    movhps [r2], xm5
+    movhps [r2 + r3], xm2
+    lea r2, [r2 + r3 * 2]
+    movq [r2], xm1
+    movq [r2 + r3], xm4
+    lea r2, [r2 + r3 * 2]
+    movhps [r2], xm1
+    movhps [r2 + r3], xm4
+    lea r2, [r2 + r3 * 2]
+    sub r0, r6                              ; PROCESS advanced 12 rows but only 8 were consumed: back up 4
+    dec word [rsp]
+    jnz .loop
+    RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma interpolation, 8x8 block (single unrolled pass).
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
+    mov r4d, r4m
+    shl r4d, 7                              ; coeffIdx * 128: each filter entry is 4 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_LumaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea r4, [r1 * 3]
+    sub r0, r4                              ; step back three rows; the 8-tap filter reads rows -3..-1
+    PROCESS_LUMA_AVX2_W8_8R                 ; word rows in m5/m2/m1/m4
+    lea r4, [r3 * 3]
+    mova m3, [pw_512]
+    pmulhrsw m5, m3                         ; m5 = word: row 0, row 1
+    pmulhrsw m2, m3                         ; m2 = word: row 2, row 3
+    pmulhrsw m1, m3                         ; m1 = word: row 4, row 5
+    pmulhrsw m4, m3                         ; m4 = word: row 6, row 7
+    packuswb m5, m2
+    packuswb m1, m4
+    vextracti128 xm2, m5, 1
+    vextracti128 xm4, m1, 1
+    movq [r2], xm5
+    movq [r2 + r3], xm2
+    movhps [r2 + r3 * 2], xm5
+    movhps [r2 + r4], xm2
+    lea r2, [r2 + r3 * 4]
+    movq [r2], xm1
+    movq [r2 + r3], xm4
+    movhps [r2 + r3 * 2], xm1
+    movhps [r2 + r4], xm4
+    RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma interpolation, 8x4 block (4-row helper macro).
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
+    mov r4d, r4m
+    shl r4d, 7                              ; coeffIdx * 128: each filter entry is 4 * mmsize bytes
+
+%ifdef PIC
+    lea r5, [tab_LumaCoeffVer_32]
+    add r5, r4
+%else
+    lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea r4, [r1 * 3]
+    sub r0, r4                              ; step back three rows; the 8-tap filter reads rows -3..-1
+    PROCESS_LUMA_AVX2_W8_4R                 ; word rows in m5/m2
+    lea r4, [r3 * 3]
+    mova m3, [pw_512]
+    pmulhrsw m5, m3                         ; m5 = word: row 0, row 1
+    pmulhrsw m2, m3                         ; m2 = word: row 2, row 3
+    packuswb m5, m2
+    vextracti128 xm2, m5, 1
+    movq [r2], xm5
+    movq [r2 + r3], xm2
+    movhps [r2 + r3 * 2], xm5
+    movhps [r2 + r4], xm2
+    RET
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
@@ -3534,11 +5127,13 @@ FILTER_VER_LUMA_8xN 8, 8, pp
; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 16, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 16
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 32, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 32
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3581,7 +5176,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
@@ -3674,6 +5269,2260 @@ FILTER_VER_LUMA_12xN 12, 16, pp
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_12xN 12, 16, ps
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
+; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter, pixel in / pixel out, 12x16 block (x86-64 only: uses m0-m14).
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = srcStride*3 (later dst rows use r6 = dstStride*3).
+; Each source row is unpack-interleaved with its successor and pmaddubsw'ed against one of the four
+; 32-byte coefficient rows at [r5 + k*mmsize]; every row's product is accumulated into up to four
+; pending output-row accumulators (fully unrolled, no loop).
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128: 4 ymm coefficient rows of 32 bytes per filter
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the block (8-tap support)
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; rounding constant: pmulhrsw by 512 == (x + 32) >> 6
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8 ; row 0 accumulator complete (all 4 coefficient pairs applied)
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+; Normalize finished rows 0-5 ((x + 32) >> 6 via pmulhrsw), clamp to bytes and store.
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b ; undo the 128-bit lane interleave introduced by packuswb
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+; 12-wide rows: store 8 bytes with movq plus bytes 8-11 with pextrd.
+ movq [r2], xm0
+ pextrd [r2 + 8], xm0, 2
+ movq [r2 + r3], xm1
+ pextrd [r2 + r3 + 8], xm1, 2
+ movq [r2 + r3 * 2], xm2
+ pextrd [r2 + r3 * 2 + 8], xm2, 2
+ movq [r2 + r6], xm3
+ pextrd [r2 + r6 + 8], xm3, 2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm4
+ pextrd [r2 + 8], xm4, 2
+ movq [r2 + r3], xm5
+ pextrd [r2 + r3 + 8], xm5, 2
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movq [r2 + r3 * 2], xm6
+ pextrd [r2 + r3 * 2 + 8], xm6, 2
+ movq [r2 + r6], xm7
+ pextrd [r2 + r6 + 8], xm7, 2
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r0 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r0 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r0 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+; Normalize and store the remaining rows 8-15.
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movq [r2], xm8
+ pextrd [r2 + 8], xm8, 2
+ movq [r2 + r3], xm9
+ pextrd [r2 + r3 + 8], xm9, 2
+ movq [r2 + r3 * 2], xm10
+ pextrd [r2 + r3 * 2 + 8], xm10, 2
+ movq [r2 + r6], xm11
+ pextrd [r2 + r6 + 8], xm11, 2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm12
+ pextrd [r2 + 8], xm12, 2
+ movq [r2 + r3], xm13
+ pextrd [r2 + r3 + 8], xm13, 2
+ movq [r2 + r3 * 2], xm0
+ pextrd [r2 + r3 * 2 + 8], xm0, 2
+ movq [r2 + r6], xm1
+ pextrd [r2 + r6 + 8], xm1, 2
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
+; void interp_8tap_vert_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter, pixel in / pixel out, 16x16 block (x86-64 only: uses m0-m14).
+; Same fully-unrolled scheme as the 12x16 kernel, but rows are a full 16 bytes so results are
+; stored with plain movu instead of movq + pextrd.
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = srcStride*3, r6 = dstStride*3.
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128: 4 ymm coefficient rows of 32 bytes per filter
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the block (8-tap support)
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; rounding constant: pmulhrsw by 512 == (x + 32) >> 6
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8 ; row 0 accumulator complete
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+; Normalize finished rows 0-5 ((x + 32) >> 6 via pmulhrsw), clamp to bytes and store.
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b ; undo the 128-bit lane interleave introduced by packuswb
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r0 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r0 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r0 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+; Normalize and store the remaining rows 8-15.
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r6], xm11
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm12
+ movu [r2 + r3], xm13
+ movu [r2 + r3 * 2], xm0
+ movu [r2 + r6], xm1
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
+; void interp_8tap_vert_pp_16x12(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter, pixel in / pixel out, 16x12 block (x86-64 only: uses m0-m14).
+; Truncated variant of the 16x16 kernel: only rows 0-11 are produced, so the tail rows skip the
+; lowest-coefficient pmaddubsw/accumulate steps that would only feed rows >= 12.
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = srcStride*3, r6 = dstStride*3.
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128: 4 ymm coefficient rows of 32 bytes per filter
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the block (8-tap support)
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; rounding constant: pmulhrsw by 512 == (x + 32) >> 6
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8 ; row 0 accumulator complete
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+; Normalize finished rows 0-5 ((x + 32) >> 6 via pmulhrsw), clamp to bytes and store.
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b ; undo the 128-bit lane interleave introduced by packuswb
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ lea r2, [r2 + r3 * 4]
+
+; Tail rows 15-18 only feed the remaining accumulators for output rows 8-11.
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+
+; Normalize and store the final rows 8-11.
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ packuswb m8, m9
+ packuswb m10, m11
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r6], xm11
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
+; void interp_8tap_vert_pp_16x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter, pixel in / pixel out, 16x8 block (x86-64 only).
+; Same unrolled scheme as the 16x16 kernel, cut down to 8 output rows: source rows that
+; would only feed rows >= 8 skip the corresponding pmaddubsw/accumulate steps.
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = srcStride*3, r6 = dstStride*3.
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128: 4 ymm coefficient rows of 32 bytes per filter
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the block (8-tap support)
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; rounding constant: pmulhrsw by 512 == (x + 32) >> 6
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8 ; row 0 accumulator complete
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+
+; Normalize finished rows 0-5 ((x + 32) >> 6 via pmulhrsw), clamp to bytes and store.
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b ; undo the 128-bit lane interleave introduced by packuswb
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+
+; Normalize and store the final rows 6-7.
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ RET
+%endif
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
+; void interp_8tap_vert_pp_16x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2 8-tap vertical luma filter, pixel in / pixel out, 16x4 block (x86-64 only).
+; Smallest variant of the unrolled 16-wide kernel: 11 source rows feed 4 output accumulators,
+; so only 13 ymm registers are needed (m12 holds the pw_512 rounding constant).
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; r4 = srcStride*3, r6 = dstStride*3.
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128: 4 ymm coefficient rows of 32 bytes per filter
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the block (8-tap support)
+ lea r6, [r3 * 3]
+ mova m12, [pw_512] ; rounding constant: pmulhrsw by 512 == (x + 32) >> 6
+
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8 ; row 0 accumulator complete
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+
+; Normalize rows 0-3 ((x + 32) >> 6 via pmulhrsw), clamp to bytes and store.
+ pmulhrsw m0, m12 ; m0 = word: row 0
+ pmulhrsw m1, m12 ; m1 = word: row 1
+ pmulhrsw m2, m12 ; m2 = word: row 2
+ pmulhrsw m3, m12 ; m3 = word: row 3
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m0, 11011000b ; undo the 128-bit lane interleave introduced by packuswb
+ vpermq m2, m2, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ RET
+%endif
+
+%macro FILTER_VER_LUMA_AVX2_16xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ lea r6, [r3 * 3]
+ lea r7, [r1 * 4]
+ mova m14, [pw_512]
+ mov r8d, %2 / 16
+
+.loop:
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm4, [r0] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r0 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r0 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r0 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm8, [r0] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r0 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r0 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r0 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm12, [r0] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm4
+ movu [r2 + r3], xm5
+
+ movu xm13, [r0 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r0 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r2 + r3 * 2], xm6
+ movu [r2 + r6], xm7
+ lea r2, [r2 + r3 * 4]
+
+ movu xm1, [r0 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r0, [r0 + r1 * 4]
+ movu xm2, [r0] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r0 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r0 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r0, [r0 + r1 * 4]
+ movu xm6, [r0] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r0 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r0 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movu [r2], xm8
+ movu [r2 + r3], xm9
+ movu [r2 + r3 * 2], xm10
+ movu [r2 + r6], xm11
+ lea r2, [r2 + r3 * 4]
+ movu [r2], xm12
+ movu [r2 + r3], xm13
+ movu [r2 + r3 * 2], xm0
+ movu [r2 + r6], xm1
+ lea r2, [r2 + r3 * 4]
+ sub r0, r7
+ dec r8d
+ jnz .loop
+ RET
+%endif
+%endmacro
+
+; Instantiate the AVX2 16-wide vertical luma PP filters for 16x32 and 16x64.
+FILTER_VER_LUMA_AVX2_16xN 16, 32
+FILTER_VER_LUMA_AVX2_16xN 16, 64
+
+;-----------------------------------------------------------------------------
+; PROCESS_LUMA_AVX2_W16_16R
+; 8-tap vertical luma PP filter, one 16-pixel-wide column, 16 output rows.
+; In:  r0 = src (pointing 3 rows above the first output row: callers do
+;      sub r0, 3*srcStride), r1 = srcStride, r2 = dst, r3 = dstStride,
+;      r4 = 3 * srcStride, r5 = coefficient table (4 ymm entries, one per
+;      pair of filter taps), r6 = 3 * dstStride, m14 = pw_512 (pmulhrsw
+;      factor: round-to-nearest >> 6).
+; Out: 16 rows of 16 bytes stored at r2. r7 is left at src row 20 and r8 at
+;      dst row 12; callers derive the next tile's pointers from r7/r8.
+; Clobbers m0-m13, r7, r8. Each ymm holds one row's 16 pixels interleaved
+; with the next row (punpcklbw low lane / punpckhbw high lane) so pmaddubsw
+; applies two taps at once; four such products are summed per output row.
+;-----------------------------------------------------------------------------
+%macro PROCESS_LUMA_AVX2_W16_16R 0
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r7, [r0 + r1 * 4]
+ movu xm4, [r7] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r7 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r7 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r7 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm8, [r7] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r7 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ pmaddubsw m8, [r5]
+ movu xm10, [r7 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ pmaddubsw m9, [r5]
+ movu xm11, [r7 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ pmaddubsw m12, m10, [r5 + 1 * mmsize]
+ paddw m8, m12
+ pmaddubsw m10, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm12, [r7] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+ pmaddubsw m13, m11, [r5 + 1 * mmsize]
+ paddw m9, m13
+ pmaddubsw m11, [r5]
+
+; rows 0-5 are fully accumulated: round via pw_512, clamp to bytes, store
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r8, [r2 + r3 * 4]
+ movu [r8], xm4
+ movu [r8 + r3], xm5
+
+ movu xm13, [r7 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ pmaddubsw m0, m12, [r5 + 2 * mmsize]
+ paddw m8, m0
+ pmaddubsw m0, m12, [r5 + 1 * mmsize]
+ paddw m10, m0
+ pmaddubsw m12, [r5]
+ movu xm0, [r7 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+ pmaddubsw m1, m13, [r5 + 2 * mmsize]
+ paddw m9, m1
+ pmaddubsw m1, m13, [r5 + 1 * mmsize]
+ paddw m11, m1
+ pmaddubsw m13, [r5]
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r8 + r3 * 2], xm6
+ movu [r8 + r6], xm7
+ lea r8, [r8 + r3 * 4]
+
+ movu xm1, [r7 + r4] ; m1 = row 15
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m2, m0, [r5 + 3 * mmsize]
+ paddw m8, m2
+ pmaddubsw m2, m0, [r5 + 2 * mmsize]
+ paddw m10, m2
+ pmaddubsw m2, m0, [r5 + 1 * mmsize]
+ paddw m12, m2
+ pmaddubsw m0, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm2, [r7] ; m2 = row 16
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 3 * mmsize]
+ paddw m9, m3
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m11, m3
+ pmaddubsw m3, m1, [r5 + 1 * mmsize]
+ paddw m13, m3
+ pmaddubsw m1, [r5]
+ movu xm3, [r7 + r1] ; m3 = row 17
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 3 * mmsize]
+ paddw m10, m4
+ pmaddubsw m4, m2, [r5 + 2 * mmsize]
+ paddw m12, m4
+ pmaddubsw m2, [r5 + 1 * mmsize]
+ paddw m0, m2
+ movu xm4, [r7 + r1 * 2] ; m4 = row 18
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 3 * mmsize]
+ paddw m11, m5
+ pmaddubsw m5, m3, [r5 + 2 * mmsize]
+ paddw m13, m5
+ pmaddubsw m3, [r5 + 1 * mmsize]
+ paddw m1, m3
+ movu xm5, [r7 + r4] ; m5 = row 19
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 3 * mmsize]
+ paddw m12, m6
+ pmaddubsw m4, [r5 + 2 * mmsize]
+ paddw m0, m4
+ lea r7, [r7 + r1 * 4]
+ movu xm6, [r7] ; m6 = row 20
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 3 * mmsize]
+ paddw m13, m7
+ pmaddubsw m5, [r5 + 2 * mmsize]
+ paddw m1, m5
+ movu xm7, [r7 + r1] ; m7 = row 21
+ punpckhbw xm2, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm2, 1
+ pmaddubsw m6, [r5 + 3 * mmsize]
+ paddw m0, m6
+ movu xm2, [r7 + r1 * 2] ; m2 = row 22
+ punpckhbw xm3, xm7, xm2
+ punpcklbw xm7, xm2
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m7, [r5 + 3 * mmsize]
+ paddw m1, m7
+
+; rows 8-15 complete: round, clamp to bytes and store the second half-tile
+ pmulhrsw m8, m14 ; m8 = word: row 8
+ pmulhrsw m9, m14 ; m9 = word: row 9
+ pmulhrsw m10, m14 ; m10 = word: row 10
+ pmulhrsw m11, m14 ; m11 = word: row 11
+ pmulhrsw m12, m14 ; m12 = word: row 12
+ pmulhrsw m13, m14 ; m13 = word: row 13
+ pmulhrsw m0, m14 ; m0 = word: row 14
+ pmulhrsw m1, m14 ; m1 = word: row 15
+ packuswb m8, m9
+ packuswb m10, m11
+ packuswb m12, m13
+ packuswb m0, m1
+ vpermq m8, m8, 11011000b
+ vpermq m10, m10, 11011000b
+ vpermq m12, m12, 11011000b
+ vpermq m0, m0, 11011000b
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ vextracti128 xm13, m12, 1
+ vextracti128 xm1, m0, 1
+ movu [r8], xm8
+ movu [r8 + r3], xm9
+ movu [r8 + r3 * 2], xm10
+ movu [r8 + r6], xm11
+ lea r8, [r8 + r3 * 4]
+ movu [r8], xm12
+ movu [r8 + r3], xm13
+ movu [r8 + r3 * 2], xm0
+ movu [r8 + r6], xm1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; PROCESS_LUMA_AVX2_W16_8R
+; 8-tap vertical luma PP filter, one 16-pixel-wide column, 8 output rows.
+; Same register contract as PROCESS_LUMA_AVX2_W16_16R:
+;   r0 = src (3 rows above the first output row), r1 = srcStride, r2 = dst,
+;   r3 = dstStride, r4 = 3 * srcStride, r5 = coefficient table (4 ymm
+;   entries), r6 = 3 * dstStride, m14 = pw_512 rounding factor.
+; Out: 8 rows of 16 bytes stored at r2. r7 is left at src row 12 and r8 at
+;      dst row 4. Clobbers m0-m13, r7, r8.
+;-----------------------------------------------------------------------------
+%macro PROCESS_LUMA_AVX2_W16_8R 0
+ movu xm0, [r0] ; m0 = row 0
+ movu xm1, [r0 + r1] ; m1 = row 1
+ punpckhbw xm2, xm0, xm1
+ punpcklbw xm0, xm1
+ vinserti128 m0, m0, xm2, 1
+ pmaddubsw m0, [r5]
+ movu xm2, [r0 + r1 * 2] ; m2 = row 2
+ punpckhbw xm3, xm1, xm2
+ punpcklbw xm1, xm2
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m1, [r5]
+ movu xm3, [r0 + r4] ; m3 = row 3
+ punpckhbw xm4, xm2, xm3
+ punpcklbw xm2, xm3
+ vinserti128 m2, m2, xm4, 1
+ pmaddubsw m4, m2, [r5 + 1 * mmsize]
+ paddw m0, m4
+ pmaddubsw m2, [r5]
+ lea r7, [r0 + r1 * 4]
+ movu xm4, [r7] ; m4 = row 4
+ punpckhbw xm5, xm3, xm4
+ punpcklbw xm3, xm4
+ vinserti128 m3, m3, xm5, 1
+ pmaddubsw m5, m3, [r5 + 1 * mmsize]
+ paddw m1, m5
+ pmaddubsw m3, [r5]
+ movu xm5, [r7 + r1] ; m5 = row 5
+ punpckhbw xm6, xm4, xm5
+ punpcklbw xm4, xm5
+ vinserti128 m4, m4, xm6, 1
+ pmaddubsw m6, m4, [r5 + 2 * mmsize]
+ paddw m0, m6
+ pmaddubsw m6, m4, [r5 + 1 * mmsize]
+ paddw m2, m6
+ pmaddubsw m4, [r5]
+ movu xm6, [r7 + r1 * 2] ; m6 = row 6
+ punpckhbw xm7, xm5, xm6
+ punpcklbw xm5, xm6
+ vinserti128 m5, m5, xm7, 1
+ pmaddubsw m7, m5, [r5 + 2 * mmsize]
+ paddw m1, m7
+ pmaddubsw m7, m5, [r5 + 1 * mmsize]
+ paddw m3, m7
+ pmaddubsw m5, [r5]
+ movu xm7, [r7 + r4] ; m7 = row 7
+ punpckhbw xm8, xm6, xm7
+ punpcklbw xm6, xm7
+ vinserti128 m6, m6, xm8, 1
+ pmaddubsw m8, m6, [r5 + 3 * mmsize]
+ paddw m0, m8
+ pmaddubsw m8, m6, [r5 + 2 * mmsize]
+ paddw m2, m8
+ pmaddubsw m8, m6, [r5 + 1 * mmsize]
+ paddw m4, m8
+ pmaddubsw m6, [r5]
+ lea r7, [r7 + r1 * 4]
+ movu xm8, [r7] ; m8 = row 8
+ punpckhbw xm9, xm7, xm8
+ punpcklbw xm7, xm8
+ vinserti128 m7, m7, xm9, 1
+ pmaddubsw m9, m7, [r5 + 3 * mmsize]
+ paddw m1, m9
+ pmaddubsw m9, m7, [r5 + 2 * mmsize]
+ paddw m3, m9
+ pmaddubsw m9, m7, [r5 + 1 * mmsize]
+ paddw m5, m9
+ pmaddubsw m7, [r5]
+ movu xm9, [r7 + r1] ; m9 = row 9
+ punpckhbw xm10, xm8, xm9
+ punpcklbw xm8, xm9
+ vinserti128 m8, m8, xm10, 1
+ pmaddubsw m10, m8, [r5 + 3 * mmsize]
+ paddw m2, m10
+ pmaddubsw m10, m8, [r5 + 2 * mmsize]
+ paddw m4, m10
+ pmaddubsw m10, m8, [r5 + 1 * mmsize]
+ paddw m6, m10
+ movu xm10, [r7 + r1 * 2] ; m10 = row 10
+ punpckhbw xm11, xm9, xm10
+ punpcklbw xm9, xm10
+ vinserti128 m9, m9, xm11, 1
+ pmaddubsw m11, m9, [r5 + 3 * mmsize]
+ paddw m3, m11
+ pmaddubsw m11, m9, [r5 + 2 * mmsize]
+ paddw m5, m11
+ pmaddubsw m11, m9, [r5 + 1 * mmsize]
+ paddw m7, m11
+ movu xm11, [r7 + r4] ; m11 = row 11
+ punpckhbw xm12, xm10, xm11
+ punpcklbw xm10, xm11
+ vinserti128 m10, m10, xm12, 1
+ pmaddubsw m12, m10, [r5 + 3 * mmsize]
+ paddw m4, m12
+ pmaddubsw m12, m10, [r5 + 2 * mmsize]
+ paddw m6, m12
+ lea r7, [r7 + r1 * 4]
+ movu xm12, [r7] ; m12 = row 12
+ punpckhbw xm13, xm11, xm12
+ punpcklbw xm11, xm12
+ vinserti128 m11, m11, xm13, 1
+ pmaddubsw m13, m11, [r5 + 3 * mmsize]
+ paddw m5, m13
+ pmaddubsw m13, m11, [r5 + 2 * mmsize]
+ paddw m7, m13
+
+; rows 0-5 are fully accumulated: round via pw_512, clamp to bytes, store
+ pmulhrsw m0, m14 ; m0 = word: row 0
+ pmulhrsw m1, m14 ; m1 = word: row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2
+ pmulhrsw m3, m14 ; m3 = word: row 3
+ pmulhrsw m4, m14 ; m4 = word: row 4
+ pmulhrsw m5, m14 ; m5 = word: row 5
+ packuswb m0, m1
+ packuswb m2, m3
+ packuswb m4, m5
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vpermq m4, m4, 11011000b
+ vextracti128 xm1, m0, 1
+ vextracti128 xm3, m2, 1
+ vextracti128 xm5, m4, 1
+ movu [r2], xm0
+ movu [r2 + r3], xm1
+ movu [r2 + r3 * 2], xm2
+ movu [r2 + r6], xm3
+ lea r8, [r2 + r3 * 4]
+ movu [r8], xm4
+ movu [r8 + r3], xm5
+
+ movu xm13, [r7 + r1] ; m13 = row 13
+ punpckhbw xm0, xm12, xm13
+ punpcklbw xm12, xm13
+ vinserti128 m12, m12, xm0, 1
+ pmaddubsw m0, m12, [r5 + 3 * mmsize]
+ paddw m6, m0
+ movu xm0, [r7 + r1 * 2] ; m0 = row 14
+ punpckhbw xm1, xm13, xm0
+ punpcklbw xm13, xm0
+ vinserti128 m13, m13, xm1, 1
+ pmaddubsw m1, m13, [r5 + 3 * mmsize]
+ paddw m7, m1
+
+ pmulhrsw m6, m14 ; m6 = word: row 6
+ pmulhrsw m7, m14 ; m7 = word: row 7
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ vextracti128 xm7, m6, 1
+ movu [r8 + r3 * 2], xm6
+ movu [r8 + r6], xm7
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2, x86-64 only. Two passes of 16 rows each; per pass the left 16
+; columns go through PROCESS_LUMA_AVX2_W16_16R and the remaining 8 columns
+; through an inline movq-based path that packs two rows per ymm register.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ lea r10, [r1 * 4]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, 2 ; 2 vertical passes of 16 rows
+.loopH:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+
+; right-most 8 columns: 64-bit loads, two interleaved rows per ymm register
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3
+ vinserti128 m5, m1, xm2, 1
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4
+ lea r7, [r0 + r1 * 4]
+ movq xm1, [r7] ; m1 = row 4
+ punpcklbw xm4, xm1
+ vinserti128 m2, m3, xm4, 1
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3
+ movq xm4, [r7 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4
+ vinserti128 m1, m1, xm3, 1
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m5, m3
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ pmaddubsw m1, [r5]
+ movq xm3, [r7 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm0, [r7] ; m0 = row 8
+ punpcklbw xm3, xm0
+ vinserti128 m4, m4, xm3, 1
+ pmaddubsw m3, m4, [r5 + 3 * mmsize]
+ paddw m5, m3
+ pmaddubsw m3, m4, [r5 + 2 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m4, [r5 + 1 * mmsize]
+ paddw m1, m3
+ pmaddubsw m4, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3
+ movq xm6, [r7 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6
+ vinserti128 m0, m0, xm3, 1
+ pmaddubsw m3, m0, [r5 + 3 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m0, [r5 + 2 * mmsize]
+ paddw m1, m3
+ pmaddubsw m3, m0, [r5 + 1 * mmsize]
+ paddw m4, m3
+ pmaddubsw m0, [r5]
+
+ movq xm3, [r7 + r4] ; m3 = row 11
+ punpcklbw xm6, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm7, [r7] ; m7 = row 12
+ punpcklbw xm3, xm7
+ vinserti128 m6, m6, xm3, 1
+ pmaddubsw m3, m6, [r5 + 3 * mmsize]
+ paddw m1, m3
+ pmaddubsw m3, m6, [r5 + 2 * mmsize]
+ paddw m4, m3
+ pmaddubsw m3, m6, [r5 + 1 * mmsize]
+ paddw m0, m3
+ pmaddubsw m6, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 13
+ punpcklbw xm7, xm3
+ movq xm8, [r7 + r1 * 2] ; m8 = row 14
+ punpcklbw xm3, xm8
+ vinserti128 m7, m7, xm3, 1
+ pmaddubsw m3, m7, [r5 + 3 * mmsize]
+ paddw m4, m3
+ pmaddubsw m3, m7, [r5 + 2 * mmsize]
+ paddw m0, m3
+ pmaddubsw m3, m7, [r5 + 1 * mmsize]
+ paddw m6, m3
+ pmaddubsw m7, [r5]
+ movq xm3, [r7 + r4] ; m3 = row 15
+ punpcklbw xm8, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm9, [r7] ; m9 = row 16
+ punpcklbw xm3, xm9
+ vinserti128 m8, m8, xm3, 1
+ pmaddubsw m3, m8, [r5 + 3 * mmsize]
+ paddw m0, m3
+ pmaddubsw m3, m8, [r5 + 2 * mmsize]
+ paddw m6, m3
+ pmaddubsw m3, m8, [r5 + 1 * mmsize]
+ paddw m7, m3
+ pmaddubsw m8, [r5]
+ movq xm3, [r7 + r1] ; m3 = row 17
+ punpcklbw xm9, xm3
+ movq xm10, [r7 + r1 * 2] ; m10 = row 18
+ punpcklbw xm3, xm10
+ vinserti128 m9, m9, xm3, 1
+ pmaddubsw m3, m9, [r5 + 3 * mmsize]
+ paddw m6, m3
+ pmaddubsw m3, m9, [r5 + 2 * mmsize]
+ paddw m7, m3
+ pmaddubsw m3, m9, [r5 + 1 * mmsize]
+ paddw m8, m3
+ movq xm3, [r7 + r4] ; m3 = row 19
+ punpcklbw xm10, xm3
+ lea r7, [r7 + r1 * 4]
+ movq xm9, [r7] ; m9 = row 20
+ punpcklbw xm3, xm9
+ vinserti128 m10, m10, xm3, 1
+ pmaddubsw m3, m10, [r5 + 3 * mmsize]
+ paddw m7, m3
+ pmaddubsw m3, m10, [r5 + 2 * mmsize]
+ paddw m8, m3
+ movq xm3, [r7 + r1] ; m3 = row 21
+ punpcklbw xm9, xm3
+ movq xm10, [r7 + r1 * 2] ; m10 = row 22
+ punpcklbw xm3, xm10
+ vinserti128 m9, m9, xm3, 1
+ pmaddubsw m3, m9, [r5 + 3 * mmsize]
+ paddw m8, m3
+
+; round, clamp and store the 16 rows of the 8-wide edge (8 bytes per row)
+ pmulhrsw m5, m14 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m14 ; m2 = word: row 2, row 3
+ pmulhrsw m1, m14 ; m1 = word: row 4, row 5
+ pmulhrsw m4, m14 ; m4 = word: row 6, row 7
+ pmulhrsw m0, m14 ; m0 = word: row 8, row 9
+ pmulhrsw m6, m14 ; m6 = word: row 10, row 11
+ pmulhrsw m7, m14 ; m7 = word: row 12, row 13
+ pmulhrsw m8, m14 ; m8 = word: row 14, row 15
+ packuswb m5, m2
+ packuswb m1, m4
+ packuswb m0, m6
+ packuswb m7, m8
+ vextracti128 xm2, m5, 1
+ vextracti128 xm4, m1, 1
+ vextracti128 xm6, m0, 1
+ vextracti128 xm8, m7, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ movhps [r2 + r3 * 2], xm5
+ movhps [r2 + r6], xm2
+ lea r8, [r2 + r3 * 4]
+ movq [r8], xm1
+ movq [r8 + r3], xm4
+ movhps [r8 + r3 * 2], xm1
+ movhps [r8 + r6], xm4
+ lea r8, [r8 + r3 * 4]
+ movq [r8], xm0
+ movq [r8 + r3], xm6
+ movhps [r8 + r3 * 2], xm0
+ movhps [r8 + r6], xm6
+ lea r8, [r8 + r3 * 4]
+ movq [r8], xm7
+ movq [r8 + r3], xm8
+ movhps [r8 + r3 * 2], xm7
+ movhps [r8 + r6], xm8
+
+; step src/dst down 16 rows and back to the left edge for the next pass
+ sub r7, r10
+ lea r0, [r7 - 16]
+ lea r2, [r8 + r3 * 4 - 16]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; Generic AVX2 32-wide vertical luma PP filter (x86-64 only): %1/16 column
+; tiles per band, %2/16 bands of 16 rows, tiled via PROCESS_LUMA_AVX2_W16_16R.
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_AVX2_32xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ lea r11, [r1 * 4]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, %2 / 16 ; row bands
+.loopH:
+ mov r10d, %1 / 16 ; 16-wide column tiles per band
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r10d
+ jnz .loopW
+; r7/r8 were left by the macro at src row 20 / dst row 12 of the last tile;
+; rewind to src row 16 and the left edge for the next band of 16 rows
+ sub r7, r11
+ lea r0, [r7 - 16]
+ lea r2, [r8 + r3 * 4 - 16]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32xN 32, 32
+FILTER_VER_LUMA_AVX2_32xN 32, 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_32x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2, x86-64 only: one band of 16 rows, two 16-wide column tiles.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, 2 ; two 16-wide column tiles
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ RET
+%endif
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_32x24(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2, x86-64 only: a 16-row band (two 16-wide tiles via W16_16R) followed
+; by an 8-row band (two tiles via W16_8R).
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, 2
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+; rewind from the macro-left r7/r8 to src row 16 / dst row 16, left edge
+ lea r9, [r1 * 4]
+ sub r7, r9
+ lea r0, [r7 - 16]
+ lea r2, [r8 + r3 * 4 - 16]
+ mov r9d, 2
+.loop:
+ PROCESS_LUMA_AVX2_W16_8R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loop
+ RET
+%endif
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_32x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2, x86-64 only: one band of 8 rows, two 16-wide column tiles.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, 2 ; two 16-wide column tiles
+.loopW:
+ PROCESS_LUMA_AVX2_W16_8R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ RET
+%endif
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_48x64(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2, x86-64 only: 4 bands of 16 rows, 3 column tiles of 16 per band.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ lea r11, [r1 * 4]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, 4 ; row bands
+.loopH:
+ mov r10d, 3 ; column tiles per band
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r10d
+ jnz .loopW
+; rewind r7/r8 (last tile, offset +32) to the band's left edge, next 16 rows
+ sub r7, r11
+ lea r0, [r7 - 32]
+ lea r2, [r8 + r3 * 4 - 32]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; Generic AVX2 64-wide vertical luma PP filter (x86-64 only): %1/16 column
+; tiles per band, %2/16 bands of 16 rows, tiled via PROCESS_LUMA_AVX2_W16_16R.
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_AVX2_64xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ lea r11, [r1 * 4]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, %2 / 16 ; row bands
+.loopH:
+ mov r10d, %1 / 16 ; column tiles per band
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r10d
+ jnz .loopW
+; rewind r7/r8 (last tile, offset +48) to the band's left edge, next 16 rows
+ sub r7, r11
+ lea r0, [r7 - 48]
+ lea r2, [r8 + r3 * 4 - 48]
+ dec r9d
+ jnz .loopH
+ RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_64xN 64, 32
+FILTER_VER_LUMA_AVX2_64xN 64, 48
+FILTER_VER_LUMA_AVX2_64xN 64, 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_64x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; AVX2, x86-64 only: one band of 16 rows, four 16-wide column tiles.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
+ mov r4d, r4m
+ shl r4d, 7 ; coeffIdx * 128 = byte offset of the 4-ymm coefficient set
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4 ; start 3 rows above the first output row
+ lea r6, [r3 * 3]
+ mova m14, [pw_512] ; pmulhrsw factor: round-to-nearest >> 6
+ mov r9d, 4 ; four 16-wide column tiles
+.loopW:
+ PROCESS_LUMA_AVX2_W16_16R
+ add r2, 16
+ add r0, 16
+ dec r9d
+ jnz .loopW
+ RET
+%endif
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
@@ -3695,7 +7544,7 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
%endif
%ifidn %3,pp
- mova m3, [tab_c_512]
+ mova m3, [pw_512]
%else
mova m3, [pw_2000]
%endif
@@ -3959,7 +7808,7 @@ cglobal chroma_p2s, 3, 7, 4
mov r4d, r4m
; load constant
- mova m2, [tab_c_128]
+ mova m2, [pb_128]
mova m3, [tab_c_64_n64]
.loopH:
diff --git a/source/common/x86/ipfilter8.h b/source/common/x86/ipfilter8.h
index 3949409..7d427ae 100644
--- a/source/common/x86/ipfilter8.h
+++ b/source/common/x86/ipfilter8.h
@@ -25,10 +25,10 @@
#define X265_IPFILTER8_H
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
- void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
- void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+ void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
@@ -58,7 +58,7 @@
SETUP_LUMA_FUNC_DEF(16, 64, cpu)
#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
- void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
#define LUMA_SP_FILTERS(cpu) \
SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \
@@ -88,7 +88,7 @@
SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
- void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#define LUMA_SS_FILTERS(cpu) \
SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \
@@ -119,212 +119,212 @@
#if HIGH_BIT_DEPTH
-#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
-
-#define CHROMA_VERT_FILTERS(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu)
-
-#define CHROMA_VERT_FILTERS_SSE4(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
-
-#define CHROMA_VERT_FILTERS_422(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 12, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 24, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(12, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 48, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(24, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 64, cpu);
-
-#define CHROMA_VERT_FILTERS_SSE4_422(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(2, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(6, 16, cpu);
-
-#define CHROMA_VERT_FILTERS_444(cpu) \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(64, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(64, 32, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(64, 48, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(48, 64, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(64, 16, cpu); \
- SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu)
-
-#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-
-#define CHROMA_HORIZ_FILTERS(cpu) \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
-
-#define CHROMA_HORIZ_FILTERS_422(cpu) \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 12, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(6, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(2, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 24, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(12, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 48, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(24, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 64, cpu)
-
-#define CHROMA_HORIZ_FILTERS_444(cpu) \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(64, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(64, 32, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(64, 48, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(48, 64, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \
- SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu)
-
-void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-
-CHROMA_VERT_FILTERS(_sse2);
-CHROMA_HORIZ_FILTERS(_sse4);
-CHROMA_VERT_FILTERS_SSE4(_sse4);
-
-CHROMA_VERT_FILTERS_422(_sse2);
-CHROMA_HORIZ_FILTERS_422(_sse4);
-CHROMA_VERT_FILTERS_SSE4_422(_sse4);
-
-CHROMA_VERT_FILTERS_444(_sse2);
-CHROMA_HORIZ_FILTERS_444(_sse4);
-
-#undef CHROMA_VERT_FILTERS_SSE4
-#undef CHROMA_VERT_FILTERS
-#undef SETUP_CHROMA_VERT_FUNC_DEF
-#undef CHROMA_HORIZ_FILTERS
-#undef SETUP_CHROMA_HORIZ_FUNC_DEF
-
-#undef CHROMA_VERT_FILTERS_422
-#undef CHROMA_VERT_FILTERS_SSE4_422
-#undef CHROMA_HORIZ_FILTERS_422
-
-#undef CHROMA_VERT_FILTERS_444
-#undef CHROMA_HORIZ_FILTERS_444
+#define SETUP_CHROMA_420_VERT_FUNC_DEF(W, H, cpu) \
+ void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+
+#define CHROMA_420_VERT_FILTERS(cpu) \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu)
+
+#define CHROMA_420_VERT_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(6, 8, cpu);
+
+#define CHROMA_422_VERT_FILTERS(cpu) \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_422_VERT_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(6, 16, cpu);
+
+#define CHROMA_444_VERT_FILTERS(cpu) \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(64, 64, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_420_VERT_FUNC_DEF(16, 64, cpu)
+
+#define SETUP_CHROMA_420_HORIZ_FUNC_DEF(W, H, cpu) \
+ void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+#define CHROMA_420_HORIZ_FILTERS(cpu) \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(6, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 32, cpu)
+
+#define CHROMA_422_HORIZ_FILTERS(cpu) \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 64, cpu)
+
+#define CHROMA_444_HORIZ_FILTERS(cpu) \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(64, 64, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 64, cpu)
+
+void x265_chroma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
+void x265_luma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
+
+CHROMA_420_VERT_FILTERS(_sse2);
+CHROMA_420_HORIZ_FILTERS(_sse4);
+CHROMA_420_VERT_FILTERS_SSE4(_sse4);
+
+CHROMA_422_VERT_FILTERS(_sse2);
+CHROMA_422_HORIZ_FILTERS(_sse4);
+CHROMA_422_VERT_FILTERS_SSE4(_sse4);
+
+CHROMA_444_VERT_FILTERS(_sse2);
+CHROMA_444_HORIZ_FILTERS(_sse4);
+
+#undef CHROMA_420_VERT_FILTERS_SSE4
+#undef CHROMA_420_VERT_FILTERS
+#undef SETUP_CHROMA_420_VERT_FUNC_DEF
+#undef CHROMA_420_HORIZ_FILTERS
+#undef SETUP_CHROMA_420_HORIZ_FUNC_DEF
+
+#undef CHROMA_422_VERT_FILTERS
+#undef CHROMA_422_VERT_FILTERS_SSE4
+#undef CHROMA_422_HORIZ_FILTERS
+
+#undef CHROMA_444_VERT_FILTERS
+#undef CHROMA_444_HORIZ_FILTERS
#else // if HIGH_BIT_DEPTH
#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
- void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+ void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
-#define CHROMA_FILTERS(cpu) \
+#define CHROMA_420_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
SETUP_CHROMA_FUNC_DEF(2, 4, cpu); \
@@ -350,7 +350,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_FUNC_DEF(8, 32, cpu)
-#define CHROMA_FILTERS_422(cpu) \
+#define CHROMA_422_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \
@@ -376,7 +376,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
SETUP_CHROMA_FUNC_DEF(8, 64, cpu);
-#define CHROMA_FILTERS_444(cpu) \
+#define CHROMA_444_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
@@ -403,9 +403,9 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_FUNC_DEF(16, 64, cpu);
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
-#define CHROMA_SP_FILTERS(cpu) \
+#define CHROMA_420_SP_FILTERS(cpu) \
SETUP_CHROMA_SP_FUNC_DEF(8, 2, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 6, cpu); \
@@ -413,7 +413,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu);
-#define CHROMA_SP_FILTERS_SSE4(cpu) \
+#define CHROMA_420_SP_FILTERS_SSE4(cpu) \
SETUP_CHROMA_SP_FUNC_DEF(2, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(2, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(4, 2, cpu); \
@@ -433,7 +433,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu);
-#define CHROMA_SP_FILTERS_422(cpu) \
+#define CHROMA_422_SP_FILTERS(cpu) \
SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 12, cpu); \
@@ -441,7 +441,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 64, cpu);
-#define CHROMA_SP_FILTERS_422_SSE4(cpu) \
+#define CHROMA_422_SP_FILTERS_SSE4(cpu) \
SETUP_CHROMA_SP_FUNC_DEF(2, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(2, 16, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(4, 4, cpu); \
@@ -461,7 +461,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SP_FUNC_DEF(24, 64, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu);
-#define CHROMA_SP_FILTERS_444(cpu) \
+#define CHROMA_444_SP_FILTERS(cpu) \
SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \
@@ -488,9 +488,9 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu);
#define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+ void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
-#define CHROMA_SS_FILTERS(cpu) \
+#define CHROMA_420_SS_FILTERS(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(4, 2, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \
@@ -513,12 +513,12 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu);
-#define CHROMA_SS_FILTERS_SSE4(cpu) \
+#define CHROMA_420_SS_FILTERS_SSE4(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(2, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(6, 8, cpu);
-#define CHROMA_SS_FILTERS_422(cpu) \
+#define CHROMA_422_SS_FILTERS(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \
@@ -541,12 +541,12 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(8, 64, cpu);
-#define CHROMA_SS_FILTERS_422_SSE4(cpu) \
+#define CHROMA_422_SS_FILTERS_SSE4(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(2, 16, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(6, 16, cpu);
-#define CHROMA_SS_FILTERS_444(cpu) \
+#define CHROMA_444_SS_FILTERS(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \
@@ -572,42 +572,44 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
SETUP_CHROMA_SS_FUNC_DEF(64, 16, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu);
-CHROMA_FILTERS(_sse4);
-CHROMA_SP_FILTERS(_sse2);
-CHROMA_SP_FILTERS_SSE4(_sse4);
-CHROMA_SS_FILTERS(_sse2);
-CHROMA_SS_FILTERS_SSE4(_sse4);
+CHROMA_420_FILTERS(_sse4);
+CHROMA_420_FILTERS(_avx2);
+CHROMA_420_SP_FILTERS(_sse2);
+CHROMA_420_SP_FILTERS_SSE4(_sse4);
+CHROMA_420_SS_FILTERS(_sse2);
+CHROMA_420_SS_FILTERS_SSE4(_sse4);
-CHROMA_FILTERS_422(_sse4);
-CHROMA_SP_FILTERS_422(_sse2);
-CHROMA_SP_FILTERS_422_SSE4(_sse4);
-CHROMA_SS_FILTERS_422(_sse2);
-CHROMA_SS_FILTERS_422_SSE4(_sse4);
+CHROMA_422_FILTERS(_sse4);
+CHROMA_422_FILTERS(_avx2);
+CHROMA_422_SP_FILTERS(_sse2);
+CHROMA_422_SP_FILTERS_SSE4(_sse4);
+CHROMA_422_SS_FILTERS(_sse2);
+CHROMA_422_SS_FILTERS_SSE4(_sse4);
-CHROMA_FILTERS_444(_sse4);
-CHROMA_SP_FILTERS_444(_sse4);
-CHROMA_SS_FILTERS_444(_sse2);
+CHROMA_444_FILTERS(_sse4);
+CHROMA_444_SP_FILTERS(_sse4);
+CHROMA_444_SS_FILTERS(_sse2);
-void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
#undef SETUP_CHROMA_FUNC_DEF
#undef SETUP_CHROMA_SP_FUNC_DEF
#undef SETUP_CHROMA_SS_FUNC_DEF
-#undef CHROMA_FILTERS
-#undef CHROMA_SP_FILTERS
-#undef CHROMA_SS_FILTERS
-#undef CHROMA_SS_FILTERS_SSE4
-#undef CHROMA_SP_FILTERS_SSE4
-
-#undef CHROMA_FILTERS_422
-#undef CHROMA_SP_FILTERS_422
-#undef CHROMA_SS_FILTERS_422
-#undef CHROMA_SS_FILTERS_422_SSE4
-#undef CHROMA_SP_FILTERS_422_SSE4
-
-#undef CHROMA_FILTERS_444
-#undef CHROMA_SP_FILTERS_444
-#undef CHROMA_SS_FILTERS_444
+#undef CHROMA_420_FILTERS
+#undef CHROMA_420_SP_FILTERS
+#undef CHROMA_420_SS_FILTERS
+#undef CHROMA_420_SS_FILTERS_SSE4
+#undef CHROMA_420_SP_FILTERS_SSE4
+
+#undef CHROMA_422_FILTERS
+#undef CHROMA_422_SP_FILTERS
+#undef CHROMA_422_SS_FILTERS
+#undef CHROMA_422_SS_FILTERS_SSE4
+#undef CHROMA_422_SP_FILTERS_SSE4
+
+#undef CHROMA_444_FILTERS
+#undef CHROMA_444_SP_FILTERS
+#undef CHROMA_444_SS_FILTERS
#endif // if HIGH_BIT_DEPTH
@@ -616,8 +618,8 @@ LUMA_SP_FILTERS(_sse4);
LUMA_SS_FILTERS(_sse2);
LUMA_FILTERS(_avx2);
-void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
-void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_interp_8tap_hv_pp_8x8_sse4(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
+void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
diff --git a/source/common/x86/loopfilter.asm b/source/common/x86/loopfilter.asm
index 5068167..fca18cd 100644
--- a/source/common/x86/loopfilter.asm
+++ b/source/common/x86/loopfilter.asm
@@ -28,10 +28,15 @@
%include "x86inc.asm"
SECTION_RODATA 32
-
-pw_2: times 16 db 2
+pb_31: times 16 db 31
+pb_15: times 16 db 15
SECTION .text
+cextern pb_1
+cextern pb_128
+cextern pb_2
+cextern pw_2
+
;============================================================================================================
; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
@@ -39,47 +44,308 @@ SECTION .text
INIT_XMM sse4
cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
- neg r3 ; r3 = -iSignLeft
- movd m0, r3d
- pslldq m0, 15 ; m0 = [iSignLeft x .. x]
- pcmpeqb m4, m4 ; m4 = [pb -1]
- pxor m5, m5 ; m5 = 0
- movh m6, [r1] ; m6 = m_offsetEo
+ neg r3 ; r3 = -signLeft
+ movzx r3d, r3b
+ movd m0, r3d
+ mova m4, [pb_128] ; m4 = [80]
+ pxor m5, m5 ; m5 = 0
+ movu m6, [r1] ; m6 = offsetEo
.loop:
- movu m7, [r0] ; m1 = pRec[x]
- mova m1, m7
- movu m2, [r0+1] ; m2 = pRec[x+1]
-
- psubusb m3, m2, m7
- psubusb m1, m2
- pcmpeqb m3, m5
- pcmpeqb m1, m5
- pcmpeqb m2, m7
-
- pabsb m3, m3 ; m1 = (pRec[x] - pRec[x+1]) > 0) ? 1 : 0
- por m1, m3 ; m1 = iSignRight
- pandn m2, m1
-
- palignr m3, m2, m0, 15 ; m3 = -iSignLeft
- psignb m3, m4 ; m3 = iSignLeft
- mova m0, m4
- pslldq m0, 15
- pand m0, m2 ; [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1]
- paddb m2, m3
- paddb m2, [pw_2] ; m1 = uiEdgeType
- pshufb m3, m6, m2
- pmovzxbw m2, m7 ; rec
- punpckhbw m7, m5
- pmovsxbw m1, m3 ; iOffsetEo
- punpckhbw m3, m3
- psraw m3, 8
- paddw m2, m1
- paddw m7, m3
- packuswb m2, m7
- movu [r0], m2
-
- add r0q, 16
- sub r2d, 16
+ movu m7, [r0] ; m7 = rec[x]
+ movu m2, [r0 + 1] ; m2 = rec[x+1]
+
+ pxor m1, m7, m4
+ pxor m3, m2, m4
+ pcmpgtb m2, m1, m3
+ pcmpgtb m3, m1
+ pand m2, [pb_1]
+ por m2, m3
+
+ pslldq m3, m2, 1
+ por m3, m0
+
+ psignb m3, m4 ; m3 = signLeft
+ pxor m0, m0
+ palignr m0, m2, 15
+ paddb m2, m3
+ paddb m2, [pb_2] ; m2 = uiEdgeType
+ pshufb m3, m6, m2
+ pmovzxbw m2, m7 ; rec
+ punpckhbw m7, m5
+ pmovsxbw m1, m3 ; offsetEo
+ punpckhbw m3, m3
+ psraw m3, 8
+ paddw m2, m1
+ paddw m7, m3
+ packuswb m2, m7
+ movu [r0], m2
+
+ add r0q, 16
+ sub r2d, 16
jnz .loop
RET
+
+;==================================================================================================
+; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, int iStride, int iLcuWidth)
+;==================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE1, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
+ mov r3d, r3m
+ mov r4d, r4m
+ pxor m0, m0 ; m0 = 0
+ movu m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ mova m7, [pb_128]
+ shr r4d, 4
+ .loop
+ movu m1, [r0] ; m1 = pRec[x]
+ movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
+
+ pxor m3, m1, m7
+ pxor m4, m2, m7
+ pcmpgtb m2, m3, m4
+ pcmpgtb m4, m3
+ pand m2, [pb_1]
+ por m2, m4
+
+ movu m3, [r1] ; m3 = m_iUpBuff1
+
+ paddb m3, m2
+ paddb m3, m6
+
+ movu m4, [r2] ; m4 = m_iOffsetEo
+ pshufb m5, m4, m3
+
+ psubb m3, m0, m2
+ movu [r1], m3
+
+ pmovzxbw m2, m1
+ punpckhbw m1, m0
+ pmovsxbw m3, m5
+ punpckhbw m5, m5
+ psraw m5, 8
+
+ paddw m2, m3
+ paddw m1, m5
+ packuswb m2, m1
+ movu [r0], m2
+
+ add r0, 16
+ add r1, 16
+ dec r4d
+ jnz .loop
+ RET
+
+;======================================================================================================================================================
+; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
+;======================================================================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
+
+ mov r6, 16
+ mov r5d, r5m
+ pxor m0, m0 ; m0 = 0
+ mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ mova m7, [pb_128]
+ shr r4d, 4
+ inc r1q
+
+ .loop
+ movu m1, [r0] ; m1 = rec[x]
+ movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
+ pxor m3, m1, m7
+ pxor m4, m2, m7
+ pcmpgtb m2, m3, m4
+ pcmpgtb m4, m3
+ pand m2, [pb_1]
+ por m2, m4
+ movu m3, [r2] ; m3 = buff1
+
+ paddb m3, m2
+ paddb m3, m6 ; m3 = edgeType
+
+ movu m4, [r3] ; m4 = offsetEo
+ pshufb m4, m3
+
+ psubb m3, m0, m2
+ movu [r1], m3
+
+ pmovzxbw m2, m1
+ punpckhbw m1, m0
+ pmovsxbw m3, m4
+ punpckhbw m4, m4
+ psraw m4, 8
+
+ paddw m2, m3
+ paddw m1, m4
+ packuswb m2, m1
+ movu [r0], m2
+
+ add r0, r6
+ add r1, r6
+ add r2, r6
+ dec r4d
+ jnz .loop
+ RET
+
+;=======================================================================================================
+;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
+;=======================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE3, 3, 7, 8
+ mov r3d, r3m
+ mov r4d, r4m
+ mov r5d, r5m
+
+ mov r6d, r5d
+ sub r6d, r4d
+
+ inc r4d
+ add r0, r4
+ add r1, r4
+ movh m7, [r0 + r6 - 1]
+ mov r6, [r1 + r6 - 2]
+ pxor m0, m0 ; m0 = 0
+ movu m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+
+.loop:
+ movu m1, [r0] ; m1 = pRec[x]
+ movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
+
+ psubusb m3, m2, m1
+ psubusb m4, m1, m2
+ pcmpeqb m3, m0
+ pcmpeqb m4, m0
+ pcmpeqb m2, m1
+
+ pabsb m3, m3
+ por m4, m3
+ pandn m2, m4 ; m2 = iSignDown
+
+ movu m3, [r1] ; m3 = m_iUpBuff1
+
+ paddb m3, m2
+ paddb m3, m6 ; m3 = uiEdgeType
+
+ movu m4, [r2] ; m4 = m_iOffsetEo
+ pshufb m5, m4, m3
+
+ psubb m3, m0, m2
+ movu [r1 - 1], m3
+
+ pmovzxbw m2, m1
+ punpckhbw m1, m0
+ pmovsxbw m3, m5
+ punpckhbw m5, m5
+ psraw m5, 8
+
+ paddw m2, m3
+ paddw m1, m5
+ packuswb m2, m1
+ movu [r0], m2
+
+ sub r5d, 16
+ jle .end
+
+ lea r0, [r0 + 16]
+ lea r1, [r1 + 16]
+
+ jnz .loop
+
+.end:
+ js .skip
+ sub r0, r4
+ sub r1, r4
+ movh [r0 + 16], m7
+ mov [r1 + 15], r6
+ jmp .quit
+
+.skip:
+ sub r0, r4
+ sub r1, r4
+ movh [r0 + 15], m7
+ mov [r1 + 14], r6
+
+.quit:
+
+ RET
+
+;=====================================================================================
+; void saoCuOrgB0(pixel* rec, const int8_t* offset, int lcuWidth, int lcuHeight, int stride)
+;=====================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgB0, 4, 7, 8
+
+ mov r3d, r3m
+ mov r4d, r4m
+
+ shr r2d, 4
+ movu m3, [r1 + 0] ; offset[0-15]
+ movu m4, [r1 + 16] ; offset[16-31]
+ pxor m7, m7 ; m7 =[0]
+.loopH
+ mov r5d, r2d
+ xor r6, r6
+
+.loopW
+ movu m2, [r0 + r6] ; m2 = [rec]
+ psrlw m1, m2, 3
+ pand m1, [pb_31] ; m1 = [index]
+ pcmpgtb m0, m1, [pb_15] ; m2 = [mask]
+
+ pshufb m6, m3, m1
+ pshufb m5, m4, m1
+
+ pblendvb m6, m6, m5, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug!
+
+ pmovzxbw m1, m2 ; rec
+ punpckhbw m2, m7
+
+ pmovsxbw m0, m6 ; offset
+ punpckhbw m6, m6
+ psraw m6, 8
+
+ paddw m1, m0
+ paddw m2, m6
+ packuswb m1, m2
+
+ movu [r0 + r6], m1
+ add r6d, 16
+ dec r5d
+ jnz .loopW
+
+ lea r0, [r0 + r4]
+
+ dec r3d
+ jnz .loopH
+ RET
+
+;============================================================================================================
+; void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
+;============================================================================================================
+INIT_XMM sse4
+cglobal calSign, 4, 5, 7
+
+ mov r4, 16
+ mova m1, [pb_128]
+ mova m0, [pb_1]
+ shr r3d, 4
+.loop
+ movu m2, [r1] ; m2 = pRec[x]
+ movu m3, [r2] ; m3 = pTmpU[x]
+
+ pxor m4, m2, m1
+ pxor m5, m3, m1
+ pcmpgtb m6, m4, m5
+ pcmpgtb m5, m4
+ pand m6, m0
+ por m6, m5
+
+ movu [r0], m6
+
+ add r0, r4
+ add r1, r4
+ add r2, r4
+ dec r3d
+ jnz .loop
+ RET
diff --git a/source/common/x86/loopfilter.h b/source/common/x86/loopfilter.h
index 7f0f409..1cea034 100644
--- a/source/common/x86/loopfilter.h
+++ b/source/common/x86/loopfilter.h
@@ -2,6 +2,7 @@
* Copyright (C) 2013 x265 project
*
* Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -25,5 +26,10 @@
#define X265_LOOPFILTER_H
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
#endif // ifndef X265_LOOPFILTER_H
diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h
index 95cb609..9bf4611 100644
--- a/source/common/x86/mc.h
+++ b/source/common/x86/mc.h
@@ -25,7 +25,7 @@
#define X265_MC_H
#define LOWRES(cpu) \
- void x265_frame_init_lowres_core_ ## cpu(pixel * src0, pixel * dst0, pixel * dsth, pixel * dstv, pixel * dstc, \
+ void x265_frame_init_lowres_core_ ## cpu(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
intptr_t src_stride, intptr_t dst_stride, int width, int height);
LOWRES(mmx2)
LOWRES(sse2)
@@ -37,31 +37,31 @@ LOWRES(xop)
void func ## _mmx2 args; \
void func ## _sse2 args; \
void func ## _ssse3 args;
-DECL_SUF(x265_pixel_avg_64x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x48, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_48x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x24, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_24x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x12, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_12x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x48, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_64x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_48x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x24, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_32x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_24x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x12, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_16x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_12x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_8x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x8, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
+DECL_SUF(x265_pixel_avg_4x4, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
#undef LOWRES
#undef DECL_SUF
diff --git a/source/common/x86/pixel-a.asm b/source/common/x86/pixel-a.asm
index 1e4180b..fc26d81 100644
--- a/source/common/x86/pixel-a.asm
+++ b/source/common/x86/pixel-a.asm
@@ -41,7 +41,10 @@ hmul_8p: times 8 db 1
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
-
+hmul_8w: times 4 dw 1
+ times 2 dw 1, -1
+ALIGN 32
+hmul_w: dw 1, -1, 1, -1, 1, -1, 1, -1
ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
@@ -66,6 +69,7 @@ cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern popcnt_table
+cextern pd_2
;=============================================================================
; SATD
@@ -447,7 +451,19 @@ cglobal pixel_satd_4x8, 4,6
cglobal pixel_satd_4x4, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
- SATD_END_MMX
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+ movd eax, m0
+%else ; !HIGH_BIT_DEPTH
+ pshufw m1, m0, q1032
+ paddw m0, m1
+ pshufw m1, m0, q2301
+ paddw m0, m1
+ movd eax, m0
+ and eax, 0xffff
+%endif ; HIGH_BIT_DEPTH
+ EMMS
+ RET
%macro SATD_START_SSE2 2-3 0
FIX_STRIDES r1, r3
@@ -6579,3 +6595,2367 @@ cglobal upShift_8, 7,7,3
mov [r2], r3w
.end:
RET
+
+%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
+%if cpuflag(ssse3)
+ pabsd %1, %3
+ pabsd %2, %4
+%elifidn %1, %3
+ pxor %5, %5
+ pxor %6, %6
+ psubd %5, %1
+ psubd %6, %2
+ pmaxsd %1, %5
+ pmaxsd %2, %6
+%else
+ pxor %1, %1
+ pxor %2, %2
+ psubd %1, %3
+ psubd %2, %4
+ pmaxsd %1, %3
+ pmaxsd %2, %4
+%endif
+%endmacro
+
+;---------------------------------------------------------------------------------------------------------------------
+;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+;---------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal psyCost_pp_4x4, 4, 5, 8
+
+%if HIGH_BIT_DEPTH
+ FIX_STRIDES r1, r3
+ lea r4, [3 * r1]
+ movddup m0, [r0]
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+ mova m4, [hmul_8w]
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+
+ paddd m5, m0, m1
+ paddd m5, m2
+ paddd m5, m3
+ psrldq m4, m5, 4
+ paddd m5, m4
+ psrld m5, 2
+
+ SUMSUB_BA d, 0, 1, 4
+ SUMSUB_BA d, 2, 3, 4
+ SUMSUB_BA d, 0, 2, 4
+ SUMSUB_BA d, 1, 3, 4
+ %define ORDER unord
+ TRANS q, ORDER, 0, 2, 4, 6
+ TRANS q, ORDER, 1, 3, 4, 6
+ ABSD2 m0, m2, m0, m2, m4, m6
+ pmaxsd m0, m2
+ ABSD2 m1, m3, m1, m3, m4, m6
+ pmaxsd m1, m3
+ paddd m0, m1
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+
+ psubd m7, m0, m5
+
+ lea r4, [3 * r3]
+ movddup m0, [r2]
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r4]
+ mova m4, [hmul_8w]
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+
+ paddd m5, m0, m1
+ paddd m5, m2
+ paddd m5, m3
+ psrldq m4, m5, 4
+ paddd m5, m4
+ psrld m5, 2
+
+ SUMSUB_BA d, 0, 1, 4
+ SUMSUB_BA d, 2, 3, 4
+ SUMSUB_BA d, 0, 2, 4
+ SUMSUB_BA d, 1, 3, 4
+ %define ORDER unord
+ TRANS q, ORDER, 0, 2, 4, 6
+ TRANS q, ORDER, 1, 3, 4, 6
+ ABSD2 m0, m2, m0, m2, m4, m6
+ pmaxsd m0, m2
+ ABSD2 m1, m3, m1, m3, m4, m6
+ pmaxsd m1, m3
+ paddd m0, m1
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+
+ psubd m0, m5
+
+ psubd m7, m0
+ pabsd m0, m7
+ movd eax, m0
+
+%else ; !HIGH_BIT_DEPTH
+ lea r4, [3 * r1]
+ movd m0, [r0]
+ movd m1, [r0 + r1]
+ movd m2, [r0 + r1 * 2]
+ movd m3, [r0 + r4]
+ shufps m0, m1, 0
+ shufps m2, m3, 0
+ mova m4, [hmul_4p]
+ pmaddubsw m0, m4
+ pmaddubsw m2, m4
+
+ paddw m5, m0, m2
+ movhlps m4, m5
+ paddw m5, m4
+ pmaddwd m5, [pw_1]
+ psrld m5, 2
+
+ HADAMARD 0, sumsub, 0, 2, 1, 3
+ HADAMARD 4, sumsub, 0, 2, 1, 3
+ HADAMARD 1, amax, 0, 2, 1, 3
+ HADDW m0, m2
+
+ psubd m6, m0, m5
+
+ lea r4, [3 * r3]
+ movd m0, [r2]
+ movd m1, [r2 + r3]
+ movd m2, [r2 + r3 * 2]
+ movd m3, [r2 + r4]
+ shufps m0, m1, 0
+ shufps m2, m3, 0
+ mova m4, [hmul_4p]
+ pmaddubsw m0, m4
+ pmaddubsw m2, m4
+
+ paddw m5, m0, m2
+ movhlps m4, m5
+ paddw m5, m4
+ pmaddwd m5, [pw_1]
+ psrld m5, 2
+
+ HADAMARD 0, sumsub, 0, 2, 1, 3
+ HADAMARD 4, sumsub, 0, 2, 1, 3
+ HADAMARD 1, amax, 0, 2, 1, 3
+ HADDW m0, m2
+
+ psubd m0, m5
+
+ psubd m6, m0
+ pabsd m0, m6
+ movd eax, m0
+%endif ; HIGH_BIT_DEPTH
+ RET
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal psyCost_pp_8x8, 4, 6, 13
+
+%if HIGH_BIT_DEPTH
+ FIX_STRIDES r1, r3
+ lea r4, [3 * r1]
+ pxor m10, m10
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, [pw_1]
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m10, m0, m8
+
+ lea r4, [3 * r3]
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r4]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, [pw_1]
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m0, m8
+ psubd m10, m0
+ pabsd m0, m10
+ movd eax, m0
+%else ; !HIGH_BIT_DEPTH
+ lea r4, [3 * r1]
+ mova m8, [hmul_8p]
+
+ movddup m0, [r0]
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r1]
+ movddup m6, [r5 + r1 * 2]
+ movddup m7, [r5 + r4]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, [pw_1]
+ psrldq m10, m11, 4
+ paddd m11, m10
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m12, m0, m11
+
+ lea r4, [3 * r3]
+
+ movddup m0, [r2]
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r4]
+ lea r5, [r2 + r3 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r3]
+ movddup m6, [r5 + r3 * 2]
+ movddup m7, [r5 + r4]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, [pw_1]
+ psrldq m10, m11, 4
+ paddd m11, m10
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m0, m11
+ psubd m12, m0
+ pabsd m0, m12
+ movd eax, m0
+%endif ; HIGH_BIT_DEPTH
+ RET
+%endif
+
+%if ARCH_X86_64
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal psyCost_pp_16x16, 4, 9, 14
+
+ FIX_STRIDES r1, r3
+ lea r4, [3 * r1]
+ lea r8, [3 * r3]
+ mova m12, [pw_1]
+ mova m13, [pd_1]
+ pxor m11, m11
+ mov r7d, 2
+.loopH:
+ mov r6d, 2
+.loopW:
+ pxor m10, m10
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m12
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, m13
+ psrld m0, 1
+ psubd m10, m0, m8
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r8]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r8]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m12
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, m13
+ psrld m0, 1
+ psubd m0, m8
+ psubd m10, m0
+ pabsd m0, m10
+ paddd m11, m0
+ add r0, 16
+ add r2, 16
+ dec r6d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32]
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r7d
+ jnz .loopH
+ movd eax, m11
+ RET
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal psyCost_pp_16x16, 4, 9, 15
+ lea r4, [3 * r1]
+ lea r8, [3 * r3]
+ mova m8, [hmul_8p]
+ mova m10, [pw_1]
+ mova m14, [pd_1]
+ pxor m13, m13
+ mov r7d, 2
+.loopH:
+ mov r6d, 2
+.loopW:
+ pxor m12, m12
+ movddup m0, [r0]
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r1]
+ movddup m6, [r5 + r1 * 2]
+ movddup m7, [r5 + r4]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m10
+ psrldq m9, m11, 4
+ paddd m11, m9
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, m14
+ psrld m0, 1
+ psubd m12, m0, m11
+
+ movddup m0, [r2]
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r8]
+ lea r5, [r2 + r3 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r3]
+ movddup m6, [r5 + r3 * 2]
+ movddup m7, [r5 + r8]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m10
+ psrldq m9, m11, 4
+ paddd m11, m9
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, m14
+ psrld m0, 1
+ psubd m0, m11
+ psubd m12, m0
+ pabsd m0, m12
+ paddd m13, m0
+ add r0, 8
+ add r2, 8
+ dec r6d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 16]
+ lea r2, [r2 + r3 * 8 - 16]
+ dec r7d
+ jnz .loopH
+ movd eax, m13
+ RET
+%endif ; HIGH_BIT_DEPTH
+%endif
+
+%if ARCH_X86_64
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal psyCost_pp_32x32, 4, 9, 14
+
+ FIX_STRIDES r1, r3
+ lea r4, [3 * r1]
+ lea r8, [3 * r3]
+ mova m12, [pw_1]
+ mova m13, [pd_1]
+ pxor m11, m11
+ mov r7d, 4
+.loopH:
+ mov r6d, 4
+.loopW:
+ pxor m10, m10
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m12
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, m13
+ psrld m0, 1
+ psubd m10, m0, m8
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r8]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r8]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m12
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, m13
+ psrld m0, 1
+ psubd m0, m8
+ psubd m10, m0
+ pabsd m0, m10
+ paddd m11, m0
+ add r0, 16
+ add r2, 16
+ dec r6d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r7d
+ jnz .loopH
+ movd eax, m11
+ RET
+
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal psyCost_pp_32x32, 4, 9, 15
+
+ lea r4, [3 * r1]
+ lea r8, [3 * r3]
+ mova m8, [hmul_8p]
+ mova m10, [pw_1]
+ mova m14, [pd_1]
+ pxor m13, m13
+ mov r7d, 4
+.loopH:
+ mov r6d, 4
+.loopW:
+ pxor m12, m12
+ movddup m0, [r0]
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r1]
+ movddup m6, [r5 + r1 * 2]
+ movddup m7, [r5 + r4]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m10
+ psrldq m9, m11, 4
+ paddd m11, m9
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, m14
+ psrld m0, 1
+ psubd m12, m0, m11
+
+ movddup m0, [r2]
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r8]
+ lea r5, [r2 + r3 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r3]
+ movddup m6, [r5 + r3 * 2]
+ movddup m7, [r5 + r8]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m10
+ psrldq m9, m11, 4
+ paddd m11, m9
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, m14
+ psrld m0, 1
+ psubd m0, m11
+ psubd m12, m0
+ pabsd m0, m12
+ paddd m13, m0
+ add r0, 8
+ add r2, 8
+ dec r6d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32]
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r7d
+ jnz .loopH
+ movd eax, m13
+ RET
+%endif ; HIGH_BIT_DEPTH
+%endif
+
+%if ARCH_X86_64
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal psyCost_pp_64x64, 4, 9, 14
+
+ FIX_STRIDES r1, r3
+ lea r4, [3 * r1]
+ lea r8, [3 * r3]
+ mova m12, [pw_1]
+ mova m13, [pd_1]
+ pxor m11, m11
+ mov r7d, 8
+.loopH:
+ mov r6d, 8
+.loopW:
+ pxor m10, m10
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m12
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, m13
+ psrld m0, 1
+ psubd m10, m0, m8
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r8]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r8]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m12
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, m13
+ psrld m0, 1
+ psubd m0, m8
+ psubd m10, m0
+ pabsd m0, m10
+ paddd m11, m0
+ add r0, 16
+ add r2, 16
+ dec r6d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 128]
+ lea r2, [r2 + r3 * 8 - 128]
+ dec r7d
+ jnz .loopH
+ movd eax, m11
+ RET
+
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal psyCost_pp_64x64, 4, 9, 15
+
+ lea r4, [3 * r1]
+ lea r8, [3 * r3]
+ mova m8, [hmul_8p]
+ mova m10, [pw_1]
+ mova m14, [pd_1]
+ pxor m13, m13
+ mov r7d, 8
+.loopH:
+ mov r6d, 8
+.loopW:
+ pxor m12, m12
+ movddup m0, [r0]
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r1]
+ movddup m6, [r5 + r1 * 2]
+ movddup m7, [r5 + r4]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m10
+ psrldq m9, m11, 4
+ paddd m11, m9
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, m14
+ psrld m0, 1
+ psubd m12, m0, m11
+
+ movddup m0, [r2]
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r8]
+ lea r5, [r2 + r3 * 4]
+ movddup m4, [r5]
+ movddup m5, [r5 + r3]
+ movddup m6, [r5 + r3 * 2]
+ movddup m7, [r5 + r8]
+
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m10
+ psrldq m9, m11, 4
+ paddd m11, m9
+ psrld m11, 2
+
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
+
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ HADDW m0, m1
+
+ paddd m0, m14
+ psrld m0, 1
+ psubd m0, m11
+ psubd m12, m0
+ pabsd m0, m12
+ paddd m13, m0
+ add r0, 8
+ add r2, 8
+ dec r6d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r7d
+ jnz .loopH
+ movd eax, m13
+ RET
+%endif ; HIGH_BIT_DEPTH
+%endif
+
+;---------------------------------------------------------------------------------------------------------------------
+;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
+;---------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal psyCost_ss_4x4, 4, 5, 8
+
+ add r1, r1
+ lea r4, [3 * r1]
+ movddup m0, [r0]
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+
+ pabsw m4, m0
+ pabsw m5, m1
+ paddw m5, m4
+ pabsw m4, m2
+ paddw m5, m4
+ pabsw m4, m3
+ paddw m5, m4
+ pmaddwd m5, [pw_1]
+ psrldq m4, m5, 4
+ paddd m5, m4
+ psrld m6, m5, 2
+
+ mova m4, [hmul_8w]
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+
+ psrldq m4, m0, 4
+ psubd m5, m0, m4
+ paddd m0, m4
+ shufps m0, m5, 10001000b
+
+ psrldq m4, m1, 4
+ psubd m5, m1, m4
+ paddd m1, m4
+ shufps m1, m5, 10001000b
+
+ psrldq m4, m2, 4
+ psubd m5, m2, m4
+ paddd m2, m4
+ shufps m2, m5, 10001000b
+
+ psrldq m4, m3, 4
+ psubd m5, m3, m4
+ paddd m3, m4
+ shufps m3, m5, 10001000b
+
+ mova m4, m0
+ paddd m0, m1
+ psubd m1, m4
+ mova m4, m2
+ paddd m2, m3
+ psubd m3, m4
+ mova m4, m0
+ paddd m0, m2
+ psubd m2, m4
+ mova m4, m1
+ paddd m1, m3
+ psubd m3, m4
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ psrld m0, 1
+ psubd m7, m0, m6
+
+ add r3, r3
+ lea r4, [3 * r3]
+ movddup m0, [r2]
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r4]
+
+ pabsw m4, m0
+ pabsw m5, m1
+ paddw m5, m4
+ pabsw m4, m2
+ paddw m5, m4
+ pabsw m4, m3
+ paddw m5, m4
+ pmaddwd m5, [pw_1]
+ psrldq m4, m5, 4
+ paddd m5, m4
+ psrld m6, m5, 2
+
+ mova m4, [hmul_8w]
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+
+ psrldq m4, m0, 4
+ psubd m5, m0, m4
+ paddd m0, m4
+ shufps m0, m5, 10001000b
+
+ psrldq m4, m1, 4
+ psubd m5, m1, m4
+ paddd m1, m4
+ shufps m1, m5, 10001000b
+
+ psrldq m4, m2, 4
+ psubd m5, m2, m4
+ paddd m2, m4
+ shufps m2, m5, 10001000b
+
+ psrldq m4, m3, 4
+ psubd m5, m3, m4
+ paddd m3, m4
+ shufps m3, m5, 10001000b
+
+ mova m4, m0
+ paddd m0, m1
+ psubd m1, m4
+ mova m4, m2
+ paddd m2, m3
+ psubd m3, m4
+ mova m4, m0
+ paddd m0, m2
+ psubd m2, m4
+ mova m4, m1
+ paddd m1, m3
+ psubd m3, m4
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ psrld m0, 1
+ psubd m0, m6
+ psubd m7, m0
+ pabsd m0, m7
+ movd eax, m0
+ RET
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal psyCost_ss_8x8, 4, 6, 15
+
+ mova m13, [hmul_w]
+ mova m14, [pw_1]
+ add r1, r1
+ add r3, r3
+ lea r4, [3 * r1]
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ pabsw m8, m0
+ pabsw m9, m1
+ paddw m8, m9
+ pabsw m10, m2
+ pabsw m11, m3
+ paddw m10, m11
+ paddw m8, m10
+ pabsw m9, m4
+ pabsw m10, m5
+ paddw m9, m10
+ pabsw m11, m6
+ pabsw m12, m7
+ paddw m11, m12
+ paddw m9, m11
+ paddw m8, m9
+ movhlps m9, m8
+ pmovzxwd m8, m8
+ pmovzxwd m9, m9
+ paddd m8, m9
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ pmaddwd m0, m13
+ pmaddwd m1, m13
+ pmaddwd m2, m13
+ pmaddwd m3, m13
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ pmaddwd m4, m13
+ pmaddwd m5, m13
+ pmaddwd m6, m13
+ pmaddwd m7, m13
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m11, m0, m7
+
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pmaddwd m3, m14
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ pmaddwd m4, m14
+ pmaddwd m5, m14
+ pmaddwd m6, m14
+ pmaddwd m7, m14
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m0, m7
+ paddd m0, m11
+
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ paddd m0, [pd_2]
+ psrld m0, 2
+ psubd m12, m0, m8
+
+ lea r4, [3 * r3]
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r4]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r4]
+
+ pabsw m8, m0
+ pabsw m9, m1
+ paddw m8, m9
+ pabsw m10, m2
+ pabsw m11, m3
+ paddw m10, m11
+ paddw m8, m10
+ pabsw m9, m4
+ pabsw m10, m5
+ paddw m9, m10
+ pabsw m11, m6
+ pabsw m10, m7
+ paddw m11, m10
+ paddw m9, m11
+ paddw m8, m9
+ movhlps m9, m8
+ pmovzxwd m8, m8
+ pmovzxwd m9, m9
+ paddd m8, m9
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ pmaddwd m0, m13
+ pmaddwd m1, m13
+ pmaddwd m2, m13
+ pmaddwd m3, m13
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ pmaddwd m4, m13
+ pmaddwd m5, m13
+ pmaddwd m6, m13
+ pmaddwd m7, m13
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m11, m0, m7
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r4]
+
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pmaddwd m3, m14
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r4]
+
+ pmaddwd m4, m14
+ pmaddwd m5, m14
+ pmaddwd m6, m14
+ pmaddwd m7, m14
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m0, m7
+ paddd m0, m11
+
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ paddd m0, [pd_2]
+ psrld m0, 2
+ psubd m0, m8
+
+ psubd m12, m0
+ pabsd m0, m12
+ movd eax, m0
+ RET
+%endif
+
+%macro psy_cost_ss 0
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ pabsw m8, m0
+ pabsw m9, m1
+ paddw m8, m9
+ pabsw m10, m2
+ pabsw m11, m3
+ paddw m10, m11
+ paddw m8, m10
+ pabsw m9, m4
+ pabsw m10, m5
+ paddw m9, m10
+ pabsw m11, m6
+ pabsw m12, m7
+ paddw m11, m12
+ paddw m9, m11
+ paddw m8, m9
+ movhlps m9, m8
+ pmovzxwd m8, m8
+ pmovzxwd m9, m9
+ paddd m8, m9
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ pmaddwd m0, m13
+ pmaddwd m1, m13
+ pmaddwd m2, m13
+ pmaddwd m3, m13
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ pmaddwd m4, m13
+ pmaddwd m5, m13
+ pmaddwd m6, m13
+ pmaddwd m7, m13
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m11, m0, m7
+
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pmaddwd m3, m14
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ pmaddwd m4, m14
+ pmaddwd m5, m14
+ pmaddwd m6, m14
+ pmaddwd m7, m14
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m0, m7
+ paddd m0, m11
+
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ paddd m0, [pd_2]
+ psrld m0, 2
+ psubd m12, m0, m8
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r6]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r6]
+
+ pabsw m8, m0
+ pabsw m9, m1
+ paddw m8, m9
+ pabsw m10, m2
+ pabsw m11, m3
+ paddw m10, m11
+ paddw m8, m10
+ pabsw m9, m4
+ pabsw m10, m5
+ paddw m9, m10
+ pabsw m11, m6
+ pabsw m10, m7
+ paddw m11, m10
+ paddw m9, m11
+ paddw m8, m9
+ movhlps m9, m8
+ pmovzxwd m8, m8
+ pmovzxwd m9, m9
+ paddd m8, m9
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ pmaddwd m0, m13
+ pmaddwd m1, m13
+ pmaddwd m2, m13
+ pmaddwd m3, m13
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ pmaddwd m4, m13
+ pmaddwd m5, m13
+ pmaddwd m6, m13
+ pmaddwd m7, m13
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m11, m0, m7
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r6]
+
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pmaddwd m3, m14
+
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r6]
+
+ pmaddwd m4, m14
+ pmaddwd m5, m14
+ pmaddwd m6, m14
+ pmaddwd m7, m14
+
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m0, m7
+ paddd m0, m11
+
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ paddd m0, [pd_2]
+ psrld m0, 2
+ psubd m0, m8
+
+ psubd m12, m0
+ pabsd m0, m12
+ paddd m15, m0
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal psyCost_ss_16x16, 4, 9, 16
+
+ mova m13, [hmul_w]
+ mova m14, [pw_1]
+ add r1, r1
+ add r3, r3
+ lea r4, [3 * r1]
+ lea r6, [3 * r3]
+ pxor m15, m15
+ mov r7d, 2
+.loopH:
+ mov r8d, 2
+.loopW:
+ psy_cost_ss
+ add r0, 16
+ add r2, 16
+ dec r8d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32]
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r7d
+ jnz .loopH
+ movd eax, m15
+ RET
+%endif
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal psyCost_ss_32x32, 4, 9, 16
+
+ mova m13, [hmul_w]
+ mova m14, [pw_1]
+ add r1, r1
+ add r3, r3
+ lea r4, [3 * r1]
+ lea r6, [3 * r3]
+ pxor m15, m15
+ mov r7d, 4
+.loopH:
+ mov r8d, 4
+.loopW:
+ psy_cost_ss
+ add r0, 16
+ add r2, 16
+ dec r8d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r7d
+ jnz .loopH
+ movd eax, m15
+ RET
+%endif
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal psyCost_ss_64x64, 4, 9, 16
+
+ mova m13, [hmul_w]
+ mova m14, [pw_1]
+ add r1, r1
+ add r3, r3
+ lea r4, [3 * r1]
+ lea r6, [3 * r3]
+ pxor m15, m15
+ mov r7d, 8
+.loopH:
+ mov r8d, 8
+.loopW:
+ psy_cost_ss
+ add r0, 16
+ add r2, 16
+ dec r8d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 128]
+ lea r2, [r2 + r3 * 8 - 128]
+ dec r7d
+ jnz .loopH
+ movd eax, m15
+ RET
+%endif
diff --git a/source/common/x86/pixel-util.h b/source/common/x86/pixel-util.h
index 90bb4fc..546426d 100644
--- a/source/common/x86/pixel-util.h
+++ b/source/common/x86/pixel-util.h
@@ -24,75 +24,68 @@
#ifndef X265_PIXEL_UTIL_H
#define X265_PIXEL_UTIL_H
-void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-
-void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual16_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual32_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-
-void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
-
-void x265_transpose8_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
-void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride);
-
-uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
-
-void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-
-void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
- const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_sse2(const pixel * pix1, intptr_t stride1,
- const pixel * pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_avx(const pixel * pix1, intptr_t stride1,
- const pixel * pix2, intptr_t stride2, int sums[2][4]);
+void x265_getResidual4_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual8_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+
+void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose16_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose32_sse2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose64_sse2(pixel* dest, const pixel* src, intptr_t stride);
+
+void x265_transpose8_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose16_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose32_avx2(pixel* dest, const pixel* src, intptr_t stride);
+void x265_transpose64_avx2(pixel* dest, const pixel* src, intptr_t stride);
+
+uint32_t x265_quant_sse4(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_quant_avx2(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_sse4(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+int x265_count_nonzero_ssse3(const int16_t* quantCoeff, int numCoeff);
+
+void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_sp_sse4(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+
+void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t* pix1, intptr_t stride1,
+ const uint8_t* pix2, intptr_t stride2, int sums[2][4]);
+void x265_pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1,
+ const pixel* pix2, intptr_t stride2, int sums[2][4]);
+void x265_pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1,
+ const pixel* pix2, intptr_t stride2, int sums[2][4]);
float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width);
float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width);
-void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
-void x265_scale1D_128to64_avx2(pixel *, pixel *, intptr_t);
-void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
+void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
+void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
- void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
- void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
+ void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
+ void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1);
-#define CHROMA_PIXELSUB_DEF(cpu) \
+#define CHROMA_420_PIXELSUB_DEF(cpu) \
SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu);
-#define CHROMA_PIXELSUB_DEF_422(cpu) \
+#define CHROMA_422_PIXELSUB_DEF(cpu) \
SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
- void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
- void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
+ void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
+ void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1);
#define LUMA_PIXELSUB_DEF(cpu) \
SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
@@ -100,16 +93,16 @@ void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu);
-CHROMA_PIXELSUB_DEF(_sse4);
-LUMA_PIXELSUB_DEF(_sse4);
-CHROMA_PIXELSUB_DEF(_sse2);
LUMA_PIXELSUB_DEF(_sse2);
+CHROMA_420_PIXELSUB_DEF(_sse2);
+CHROMA_422_PIXELSUB_DEF(_sse2);
-CHROMA_PIXELSUB_DEF_422(_sse4);
-CHROMA_PIXELSUB_DEF_422(_sse2);
+LUMA_PIXELSUB_DEF(_sse4);
+CHROMA_420_PIXELSUB_DEF(_sse4);
+CHROMA_422_PIXELSUB_DEF(_sse4);
#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
- uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
+ uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(const pixel* pix, intptr_t pixstride);
#define LUMA_PIXELVAR_DEF(cpu) \
SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \
@@ -118,9 +111,11 @@ CHROMA_PIXELSUB_DEF_422(_sse2);
SETUP_LUMA_PIXELVAR_FUNC(64, 64, cpu);
LUMA_PIXELVAR_DEF(_sse2);
+LUMA_PIXELVAR_DEF(_xop);
+LUMA_PIXELVAR_DEF(_avx);
-#undef CHROMA_PIXELSUB_DEF
-#undef CHROMA_PIXELSUB_DEF_422
+#undef CHROMA_420_PIXELSUB_DEF
+#undef CHROMA_422_PIXELSUB_DEF
#undef LUMA_PIXELSUB_DEF
#undef LUMA_PIXELVAR_DEF
#undef SETUP_CHROMA_PIXELSUB_PS_FUNC
diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm
index 38fb52e..bf92072 100644
--- a/source/common/x86/pixel-util8.asm
+++ b/source/common/x86/pixel-util8.asm
@@ -53,8 +53,11 @@ trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
SECTION .text
cextern pw_1
+cextern pw_0_15
cextern pb_1
cextern pw_00ff
+cextern pw_1023
+cextern pw_3fff
cextern pw_2000
cextern pw_pixel_max
cextern pd_1
@@ -62,448 +65,6 @@ cextern pd_32767
cextern pd_n32768
;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 4/2
-.loop:
- movh m0, [r0]
- movh m1, [r0 + r4]
- punpcklqdq m0, m1
- movh m2, [r1]
- movh m3, [r1 + r4]
- punpcklqdq m2, m3
- paddw m0, m2
- CLIPW m0, m4, m5
-
- ; store recipred[]
- movh [r3], m0
- movhps [r3 + r6], m0
-
- ; store recqt[]
- movh [r2], m0
- movhps [r2 + r5], m0
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 4/2
-.loop:
- movd m1, [r0]
- movd m2, [r0 + r4]
- punpckldq m1, m2
- punpcklbw m1, m0
- movh m2, [r1]
- movh m3, [r1 + r4 * 2]
- punpcklqdq m2, m3
- paddw m1, m2
- packuswb m1, m1
-
- ; store recon[] and recipred[]
- movd [r3], m1
- pshufd m2, m1, 1
- movd [r3 + r6], m2
-
- ; store recqt[]
- punpcklbw m1, m0
- movh [r2], m1
- movhps [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons8, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons8, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
-%if HIGH_BIT_DEPTH
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 8/2
-.loop:
- movu m0, [r0]
- movu m1, [r0 + r4]
- movu m2, [r1]
- movu m3, [r1 + r4]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[]
- movu [r3], m0
- movu [r3 + r6], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 8/2
-.loop:
- movh m1, [r0]
- movh m2, [r0 + r4]
- punpcklbw m1, m0
- punpcklbw m2, m0
- movu m3, [r1]
- movu m4, [r1 + r4 * 2]
- paddw m1, m3
- paddw m2, m4
- packuswb m1, m2
-
- ; store recon[] and recipred[]
- movh [r3], m1
- movhps [r3 + r6], m1
-
- ; store recqt[]
- punpcklbw m2, m1, m0
- punpckhbw m1, m0
- movu [r2], m2
- movu [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 16/2
-.loop:
- movu m0, [r0]
- movu m1, [r0 + 16]
- movu m2, [r1]
- movu m3, [r1 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[]
- movu [r3], m0
- movu [r3 + 16], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + r4]
- movu m1, [r0 + r4 + 16]
- movu m2, [r1 + r4]
- movu m3, [r1 + r4 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + r6], m0
- movu [r3 + r6 + 16], m1
-
- ; store recqt[]
- movu [r2 + r5], m0
- movu [r2 + r5 + 16], m1
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 16
-.loop:
- movu m2, [r0]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- paddw m1, [r1]
- paddw m2, [r1 + 16]
- packuswb m1, m2
-
- ; store recon[] and recipred[]
- movu [r3], m1
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [r2], m2
- movu [r2 + 16], m1
-
- add r2, r5
- add r3, r6
- add r0, r4
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r4d, r4d
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4
- mova m5, [pw_pixel_max]
- mov t7b, 32/2
-.loop:
-
- movu m0, [r0]
- movu m1, [r0 + 16]
- movu m2, [r1]
- movu m3, [r1 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[]
- movu [r3], m0
- movu [r3 + 16], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + 32]
- movu m1, [r0 + 48]
- movu m2, [r1 + 32]
- movu m3, [r1 + 48]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + 32], m0
- movu [r3 + 48], m1
-
- ; store recqt[]
- movu [r2 + 32], m0
- movu [r2 + 48], m1
- add r2, r5
-
- movu m0, [r0 + r4]
- movu m1, [r0 + r4 + 16]
- movu m2, [r1 + r4]
- movu m3, [r1 + r4 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + r6], m0
- movu [r3 + r6 + 16], m1
-
- ; store recqt[]
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + r4 + 32]
- movu m1, [r0 + r4 + 48]
- movu m2, [r1 + r4 + 32]
- movu m3, [r1 + r4 + 48]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recon[] and recipred[]
- movu [r3 + r6 + 32], m0
- movu [r3 + r6 + 48], m1
- lea r3, [r3 + r6 * 2]
-
- ; store recqt[]
- movu [r2 + 32], m0
- movu [r2 + 48], m1
- add r2, r5
-
- lea r0, [r0 + r4 * 2]
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-
- mov r4d, r4m
- mov r5d, r5m
- mov r6d, r6m
- add r5d, r5d
-
- pxor m0, m0
- mov t7b, 32
-.loop:
- movu m2, [r0]
- movu m4, [r0 + 16]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- pmovzxbw m3, m4
- punpckhbw m4, m0
-
- paddw m1, [r1 + 0 * 16]
- paddw m2, [r1 + 1 * 16]
- packuswb m1, m2
-
- paddw m3, [r1 + 2 * 16]
- paddw m4, [r1 + 3 * 16]
- packuswb m3, m4
-
- ; store recon[] and recipred[]
- movu [r3], m1
- movu [r3 + 16], m3
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [r2 + 0 * 16], m2
- movu [r2 + 1 * 16], m1
- pmovzxbw m4, m3
- punpckhbw m3, m0
- movu [r2 + 2 * 16], m4
- movu [r2 + 3 * 16], m3
-
- add r2, r5
- add r3, r6
- add r0, r4
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -861,7 +422,7 @@ cglobal getResidual32, 4,5,7
;-----------------------------------------------------------------------------
-; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal quant, 5,6,8
@@ -883,7 +444,7 @@ cglobal quant, 5,6,8
pxor m7, m7 ; m7 = numZero
.loop:
; 4 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
@@ -901,7 +462,7 @@ cglobal quant, 5,6,8
movh [r3], m3
; 4 coeff
- movu m0, [r0 + 16] ; m0 = level
+ pmovsxwd m0, [r0 + 8] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + 16] ; m0 = tmpLevel1
paddd m2, m1, m5
@@ -916,7 +477,7 @@ cglobal quant, 5,6,8
packssdw m3, m3
movh [r3 + 8], m3
- add r0, 32
+ add r0, 16
add r1, 32
add r2, 32
add r3, 16
@@ -953,7 +514,7 @@ cglobal quant, 5,5,10
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
@@ -966,7 +527,7 @@ cglobal quant, 5,5,10
psignd m2, m0
; 8 coeff
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
paddd m3, m1, m5
@@ -987,7 +548,7 @@ cglobal quant, 5,5,10
pminuw m2, m9
paddw m7, m2
- add r0, mmsize*2
+ add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
@@ -1025,7 +586,7 @@ cglobal quant, 5,6,8
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
@@ -1044,7 +605,7 @@ cglobal quant, 5,6,8
movu [r3], xm3
; 8 coeff
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
paddd m2, m1, m5
@@ -1062,7 +623,7 @@ cglobal quant, 5,6,8
vpermq m3, m3, q0020
movu [r3 + mmsize/2], xm3
- add r0, mmsize*2
+ add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
@@ -1083,7 +644,7 @@ IACA_END
;-----------------------------------------------------------------------------
-; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal nquant, 3,5,8
@@ -1096,8 +657,8 @@ cglobal nquant, 3,5,8
shr r4d, 3
.loop:
- movu m0, [r0] ; m0 = level
- movu m1, [r0 + 16] ; m1 = level
+ pmovsxwd m0, [r0] ; m0 = level
+ pmovsxwd m1, [r0 + 8] ; m1 = level
pabsd m2, m0
pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff
@@ -1114,7 +675,7 @@ cglobal nquant, 3,5,8
packssdw m2, m3
movu [r2], m2
- add r0, 32
+ add r0, 16
add r1, 32
add r2, 16
@@ -1144,14 +705,14 @@ cglobal nquant, 3,5,7
shr r4d, 4
.loop:
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff
paddd m1, m4
psrad m1, xm3 ; m0 = level1
psignd m1, m0
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m2, m0
pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff
paddd m2, m4
@@ -1162,7 +723,7 @@ cglobal nquant, 3,5,7
vpermq m2, m1, q3120
movu [r2], m2
- add r0, mmsize * 2
+ add r0, mmsize
add r1, mmsize * 2
add r2, mmsize
@@ -1211,15 +772,11 @@ cglobal dequant_normal, 5,5,5
pmaddwd m4, m1
psrad m3, m0
psrad m4, m0
- packssdw m3, m3 ; OPT_ME: store must be 32 bits
- pmovsxwd m3, m3
- packssdw m4, m4
- pmovsxwd m4, m4
+ packssdw m3, m4
mova [r1], m3
- mova [r1 + 16], m4
add r0, 16
- add r1, 32
+ add r1, 16
sub r2d, 8
jnz .loop
@@ -1259,13 +816,12 @@ cglobal dequant_normal, 5,5,7
pmaxsd m3, m6
pminsd m4, m5
pmaxsd m4, m6
+ packssdw m3, m4
mova [r1 + 0 * mmsize/2], xm3
- mova [r1 + 1 * mmsize/2], xm4
- vextracti128 [r1 + 2 * mmsize/2], m3, 1
- vextracti128 [r1 + 3 * mmsize/2], m4, 1
+ vextracti128 [r1 + 1 * mmsize/2], m3, 1
add r0, mmsize
- add r1, mmsize * 2
+ add r1, mmsize
dec r2d
jnz .loop
@@ -1301,9 +857,85 @@ cglobal count_nonzero, 2,2,3
;-----------------------------------------------------------------------------------------------------------------------------------------------
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
INIT_XMM sse4
-cglobal weight_pp, 6, 7, 6
+cglobal weight_pp, 4,7,7
+%define correction (14 - BIT_DEPTH)
+%if BIT_DEPTH == 10
+ mova m6, [pw_1023]
+%elif BIT_DEPTH == 12
+ mova m6, [pw_3fff]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ mov r6d, r6m
+ mov r4d, r4m
+ mov r5d, r5m
+ shl r6d, 16 - correction
+ or r6d, r5d ; assuming both (w0) and round are using maximum of 16 bits each.
+ movd m0, r6d
+ pshufd m0, m0, 0 ; m0 = [w0, round]
+ mov r5d, r7m
+ sub r5d, correction
+ movd m1, r5d
+ movd m2, r8m
+ pshufd m2, m2, 0
+ mova m5, [pw_1]
+ sub r2d, r3d
+ add r2d, r2d
+ shr r3d, 4
+
+.loopH:
+ mov r5d, r3d
+
+.loopW:
+ movu m4, [r0]
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m0
+ psrad m3, m1
+ paddd m3, m2 ; TODO: we can put Offset into Round, but we have to analyze Dynamic Range before that.
+ punpckhwd m4, m5
+ pmaddwd m4, m0
+ psrad m4, m1
+ paddd m4, m2
+
+ packusdw m3, m4
+ pminuw m3, m6
+ movu [r1], m3
+
+ movu m4, [r0 + mmsize]
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m0
+ psrad m3, m1
+ paddd m3, m2
+
+ punpckhwd m4, m5
+ pmaddwd m4, m0
+ psrad m4, m1
+ paddd m4, m2
+
+ packusdw m3, m4
+ pminuw m3, m6
+ movu [r1 + mmsize], m3
+
+ add r0, 2 * mmsize
+ add r1, 2 * mmsize
+
+ dec r5d
+ jnz .loopW
+
+ add r0, r2
+ add r1, r2
+
+ dec r4d
+ jnz .loopH
+ RET
+
+%else ; end of (HIGH_BIT_DEPTH == 1)
+
+INIT_XMM sse4
+cglobal weight_pp, 6,7,6
shl r5d, 6 ; m0 = [w0<<6]
mov r6d, r6m
shl r6d, 16
@@ -1363,6 +995,8 @@ cglobal weight_pp, 6, 7, 6
dec r4d
jnz .loopH
RET
+%endif ; end of (HIGH_BIT_DEPTH == 0)
+
INIT_YMM avx2
@@ -1418,6 +1052,88 @@ cglobal weight_pp, 6, 7, 6
;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal weight_sp, 6,7,8
+%if BIT_DEPTH == 10
+ mova m1, [pw_1023]
+%elif BIT_DEPTH == 12
+ mova m1, [pw_3fff]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ mova m2, [pw_1]
+ mov r6d, r7m
+ shl r6d, 16
+ or r6d, r6m ; assuming both (w0) and round are using maximum of 16 bits each.
+ movd m3, r6d
+ pshufd m3, m3, 0 ; m3 = [round w0]
+
+ movd m4, r8m ; m4 = [shift]
+ movd m5, r9m
+ pshufd m5, m5, 0 ; m5 = [offset]
+
+ ; correct row stride
+ add r3d, r3d
+ add r2d, r2d
+ mov r6d, r4d
+ and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+ sub r3d, r6d
+ sub r3d, r6d
+ sub r2d, r6d
+ sub r2d, r6d
+
+ ; generate partial width mask (MUST BE IN XMM0)
+ mov r6d, r4d
+ and r6d, (mmsize / SIZEOF_PIXEL - 1)
+ movd m0, r6d
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ pcmpgtw m0, [pw_0_15]
+
+.loopH:
+ mov r6d, r4d
+
+.loopW:
+ movu m6, [r0]
+ paddw m6, [pw_2000]
+
+ punpcklwd m7, m6, m2
+ pmaddwd m7, m3
+ psrad m7, m4
+ paddd m7, m5
+
+ punpckhwd m6, m2
+ pmaddwd m6, m3
+ psrad m6, m4
+ paddd m6, m5
+
+ packusdw m7, m6
+ pminuw m7, m1
+
+ sub r6d, (mmsize / SIZEOF_PIXEL)
+ jl .widthLess8
+ movu [r1], m7
+ lea r0, [r0 + mmsize]
+ lea r1, [r1 + mmsize]
+ je .nextH
+ jmp .loopW
+
+.widthLess8:
+ movu m6, [r1]
+ pblendvb m6, m6, m7, m0
+ movu [r1], m6
+
+.nextH:
+ add r0, r2
+ add r1, r3
+
+ dec r5d
+ jnz .loopH
+ RET
+
+%else ; end of (HIGH_BIT_DEPTH == 1)
+
INIT_XMM sse4
%if ARCH_X86_64
cglobal weight_sp, 6, 7+2, 7
@@ -1496,8 +1212,9 @@ cglobal weight_sp, 6, 7, 7, 0-(2*4)
dec r5d
jnz .loopH
-
RET
+%endif ; end of (HIGH_BIT_DEPTH == 0)
+
;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
@@ -3338,14 +3055,7 @@ SSIM
INIT_XMM avx
SSIM
-;-----------------------------------------------------------------
-; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
-;-----------------------------------------------------------------
-INIT_XMM ssse3
-cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
-%if HIGH_BIT_DEPTH
- mova m7, [deinterleave_word_shuf]
-
+%macro SCALE1D_128to64_HBD 0
movu m0, [r1]
palignr m1, m0, 2
movu m2, [r1 + 16]
@@ -3367,8 +3077,6 @@ cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
punpcklqdq m4, m6
movu [r0 + 16], m4
-
-
movu m0, [r1 + 64]
palignr m1, m0, 2
movu m2, [r1 + 80]
@@ -3433,10 +3141,28 @@ cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
movu [r0 + 96], m0
punpcklqdq m4, m6
movu [r0 + 112], m4
+%endmacro
+
+;-----------------------------------------------------------------
+; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
+%if HIGH_BIT_DEPTH
+ mova m7, [deinterleave_word_shuf]
+
+ ;Top pixel
+ SCALE1D_128to64_HBD
+
+ ;Left pixel
+ add r1, 256
+ add r0, 128
+ SCALE1D_128to64_HBD
%else
mova m7, [deinterleave_shuf]
+ ;Top pixel
movu m0, [r1]
palignr m1, m0, 1
movu m2, [r1 + 16]
@@ -3488,6 +3214,59 @@ cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
movu [r0 + 32], m0
punpcklqdq m4, m6
movu [r0 + 48], m4
+
+ ;Left pixel
+ movu m0, [r1 + 128]
+ palignr m1, m0, 1
+ movu m2, [r1 + 144]
+ palignr m3, m2, 1
+ movu m4, [r1 + 160]
+ palignr m5, m4, 1
+ movu m6, [r1 + 176]
+
+ pavgb m0, m1
+
+ palignr m1, m6, 1
+
+ pavgb m2, m3
+ pavgb m4, m5
+ pavgb m6, m1
+
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 64], m0
+ punpcklqdq m4, m6
+ movu [r0 + 80], m4
+
+ movu m0, [r1 + 192]
+ palignr m1, m0, 1
+ movu m2, [r1 + 208]
+ palignr m3, m2, 1
+ movu m4, [r1 + 224]
+ palignr m5, m4, 1
+ movu m6, [r1 + 240]
+
+ pavgb m0, m1
+
+ palignr m1, m6, 1
+
+ pavgb m2, m3
+ pavgb m4, m5
+ pavgb m6, m1
+
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 96], m0
+ punpcklqdq m4, m6
+ movu [r0 + 112], m4
%endif
RET
@@ -3496,6 +3275,7 @@ INIT_YMM avx2
cglobal scale1D_128to64, 2, 2, 3
pxor m2, m2
+ ;Top pixel
movu m0, [r1]
movu m1, [r1 + 32]
phaddw m0, m1
@@ -3523,6 +3303,36 @@ cglobal scale1D_128to64, 2, 2, 3
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 96], m0
+
+ ;Left pixel
+ movu m0, [r1 + 256]
+ movu m1, [r1 + 288]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 128], m0
+
+ movu m0, [r1 + 320]
+ movu m1, [r1 + 352]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 160], m0
+
+ movu m0, [r1 + 384]
+ movu m1, [r1 + 416]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 192], m0
+
+ movu m0, [r1 + 448]
+ movu m1, [r1 + 480]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 224], m0
+
RET
%else ; HIGH_BIT_DEPTH == 0
INIT_YMM avx2
@@ -3530,6 +3340,7 @@ cglobal scale1D_128to64, 2, 2, 4
pxor m2, m2
mova m3, [pb_1]
+ ;Top pixel
movu m0, [r1]
pmaddubsw m0, m0, m3
pavgw m0, m2
@@ -3549,6 +3360,27 @@ cglobal scale1D_128to64, 2, 2, 4
packuswb m0, m1
vpermq m0, m0, 0xD8
movu [r0 + 32], m0
+
+ ;Left pixel
+ movu m0, [r1 + 128]
+ pmaddubsw m0, m0, m3
+ pavgw m0, m2
+ movu m1, [r1 + 160]
+ pmaddubsw m1, m1, m3
+ pavgw m1, m2
+ packuswb m0, m1
+ vpermq m0, m0, 0xD8
+ movu [r0 + 64], m0
+
+ movu m0, [r1 + 192]
+ pmaddubsw m0, m0, m3
+ pavgw m0, m2
+ movu m1, [r1 + 224]
+ pmaddubsw m1, m1, m3
+ pavgw m1, m2
+ packuswb m0, m1
+ vpermq m0, m0, 0xD8
+ movu [r0 + 96], m0
RET
%endif
diff --git a/source/common/x86/pixel.h b/source/common/x86/pixel.h
index e99b1ee..6adab39 100644
--- a/source/common/x86/pixel.h
+++ b/source/common/x86/pixel.h
@@ -57,17 +57,17 @@
ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
#define DECL_X1(name, suffix) \
- DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
+ DECL_PIXELS(int, name, suffix, (const pixel*, intptr_t, const pixel*, intptr_t))
#define DECL_X1_SS(name, suffix) \
- DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
+ DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const int16_t*, intptr_t))
#define DECL_X1_SP(name, suffix) \
- DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, pixel *, intptr_t))
+ DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const pixel*, intptr_t))
#define DECL_X4(name, suffix) \
- DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
- DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
+ DECL_PIXELS(void, name ## _x3, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) \
+ DECL_PIXELS(void, name ## _x4, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*))
/* sad-a.asm */
DECL_X1(sad, mmx2)
@@ -103,11 +103,11 @@ DECL_X1(satd, sse4)
DECL_X1(satd, avx)
DECL_X1(satd, xop)
DECL_X1(satd, avx2)
-int x265_pixel_satd_8x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x4_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x12_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_satd_16x64_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
DECL_X1(sa8d, mmx2)
DECL_X1(sa8d, sse2)
@@ -138,42 +138,42 @@ DECL_X1_SS(ssd_ss, xop)
DECL_X1_SS(ssd_ss, avx2)
DECL_X1_SP(ssd_sp, sse4)
#define DECL_HEVC_SSD(suffix) \
- int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x24_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_24x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_32x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x12_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_16x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
- int x265_pixel_ssd_8x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t);
+ int x265_pixel_ssd_32x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x24_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_24x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_32x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x12_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_16x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
+ int x265_pixel_ssd_8x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t);
DECL_HEVC_SSD(sse2)
DECL_HEVC_SSD(ssse3)
DECL_HEVC_SSD(avx)
-int x265_pixel_ssd_12x16_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_24x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_48x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x16_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
-int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_ssd_12x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_24x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_48x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x48_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
+int x265_pixel_ssd_64x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t);
-int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_4_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t);
+int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
#define ADDAVG(func) \
- void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+ void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
ADDAVG(addAvg_2x4)
ADDAVG(addAvg_2x8)
ADDAVG(addAvg_4x2);
@@ -216,8 +216,18 @@ ADDAVG(addAvg_16x24)
ADDAVG(addAvg_24x64)
ADDAVG(addAvg_32x48)
-void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
-void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
+void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+int x265_psyCost_pp_4x4_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+int x265_psyCost_pp_8x8_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+int x265_psyCost_pp_16x16_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+int x265_psyCost_pp_32x32_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+int x265_psyCost_ss_8x8_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+int x265_psyCost_ss_16x16_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+int x265_psyCost_ss_32x32_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+int x265_psyCost_ss_64x64_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
diff --git a/source/common/yuv.cpp b/source/common/yuv.cpp
index fffc215..67ab18e 100644
--- a/source/common/yuv.cpp
+++ b/source/common/yuv.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -43,21 +43,31 @@ bool Yuv::create(uint32_t size, int csp)
m_hChromaShift = CHROMA_H_SHIFT(csp);
m_vChromaShift = CHROMA_V_SHIFT(csp);
- // set width and height
m_size = size;
- m_csize = size >> m_hChromaShift;
m_part = partitionFromSizes(size, size);
- size_t sizeL = size * size;
- size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift);
+ if (csp == X265_CSP_I400)
+ {
+ CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
+ m_buf[1] = m_buf[2] = 0;
+ m_csize = MAX_INT;
+ return true;
+ }
+ else
+ {
+ m_csize = size >> m_hChromaShift;
- X265_CHECK((sizeC & 15) == 0, "invalid size");
+ size_t sizeL = size * size;
+ size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift);
- // memory allocation (padded for SIMD reads)
- CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8);
- m_buf[1] = m_buf[0] + sizeL;
- m_buf[2] = m_buf[0] + sizeL + sizeC;
- return true;
+ X265_CHECK((sizeC & 15) == 0, "invalid size");
+
+ // memory allocation (padded for SIMD reads)
+ CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8);
+ m_buf[1] = m_buf[0] + sizeL;
+ m_buf[2] = m_buf[0] + sizeL + sizeC;
+ return true;
+ }
fail:
return false;
@@ -71,71 +81,81 @@ void Yuv::destroy()
void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) const
{
pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
-
- primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size);
+ primitives.cu[m_part].copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
- primitives.chroma[m_csp].copy_pp[m_part](dstU, dstPic.m_strideC, m_buf[1], m_csize);
- primitives.chroma[m_csp].copy_pp[m_part](dstV, dstPic.m_strideC, m_buf[2], m_csize);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
}
void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
{
- /* We cheat with const_cast internally because the get methods are not capable of
- * returning const buffers and the primitives are not const aware, but we know
- * this function does not modify srcPic */
- PicYuv& srcPicSafe = const_cast<PicYuv&>(srcPic);
- pixel* srcY = srcPicSafe.getLumaAddr(cuAddr, absPartIdx);
-
- primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride);
-
- pixel* srcU = srcPicSafe.getCbAddr(cuAddr, absPartIdx);
- pixel* srcV = srcPicSafe.getCrAddr(cuAddr, absPartIdx);
- primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPicSafe.m_strideC);
- primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPicSafe.m_strideC);
+ const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
+ primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
+
+ const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
+ const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
}
void Yuv::copyFromYuv(const Yuv& srcYuv)
{
- X265_CHECK(m_size <= srcYuv.m_size, "invalid size\n");
+ X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
- primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
- primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
- primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+ primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+}
+
+/* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
+void Yuv::copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma)
+{
+ X265_CHECK(m_size == FENC_STRIDE && m_size >= srcYuv.m_size, "PU buffer size mismatch\n");
+
+ const pixel* srcY = srcYuv.m_buf[0] + getAddrOffset(absPartIdx, srcYuv.m_size);
+ primitives.pu[partEnum].copy_pp(m_buf[0], m_size, srcY, srcYuv.m_size);
+
+ if (bChroma)
+ {
+ const pixel* srcU = srcYuv.m_buf[1] + srcYuv.getChromaAddrOffset(absPartIdx);
+ const pixel* srcV = srcYuv.m_buf[2] + srcYuv.getChromaAddrOffset(absPartIdx);
+ primitives.chroma[m_csp].pu[partEnum].copy_pp(m_buf[1], m_csize, srcU, srcYuv.m_csize);
+ primitives.chroma[m_csp].pu[partEnum].copy_pp(m_buf[2], m_csize, srcV, srcYuv.m_csize);
+ }
}
void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const
{
pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
- primitives.luma_copy_pp[m_part](dstY, dstYuv.m_size, m_buf[0], m_size);
+ primitives.cu[m_part].copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
pixel* dstU = dstYuv.getCbAddr(absPartIdx);
pixel* dstV = dstYuv.getCrAddr(absPartIdx);
- primitives.chroma[m_csp].copy_pp[m_part](dstU, dstYuv.m_csize, m_buf[1], m_csize);
- primitives.chroma[m_csp].copy_pp[m_part](dstV, dstYuv.m_csize, m_buf[2], m_csize);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
+ primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
}
void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
{
pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
pixel* dstY = dstYuv.m_buf[0];
-
- primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size);
+ primitives.cu[dstYuv.m_part].copy_pp(dstY, dstYuv.m_size, srcY, m_size);
pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
pixel* dstU = dstYuv.m_buf[1];
pixel* dstV = dstYuv.m_buf[2];
- primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstU, dstYuv.m_csize, srcU, m_csize);
- primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstV, dstYuv.m_csize, srcV, m_csize);
+ primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
}
void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
{
- primitives.luma_add_ps[log2SizeL - 2](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
- primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
- primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+ primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
+ primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+ primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
}
void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
@@ -144,23 +164,21 @@ void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absP
if (bLuma)
{
- int16_t* srcY0 = const_cast<ShortYuv&>(srcYuv0).getLumaAddr(absPartIdx);
- int16_t* srcY1 = const_cast<ShortYuv&>(srcYuv1).getLumaAddr(absPartIdx);
+ const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
+ const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
pixel* dstY = getLumaAddr(absPartIdx);
-
- primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
+ primitives.pu[part].addAvg(srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
}
if (bChroma)
{
- int16_t* srcU0 = const_cast<ShortYuv&>(srcYuv0).getCbAddr(absPartIdx);
- int16_t* srcV0 = const_cast<ShortYuv&>(srcYuv0).getCrAddr(absPartIdx);
- int16_t* srcU1 = const_cast<ShortYuv&>(srcYuv1).getCbAddr(absPartIdx);
- int16_t* srcV1 = const_cast<ShortYuv&>(srcYuv1).getCrAddr(absPartIdx);
+ const int16_t* srcU0 = srcYuv0.getCbAddr(absPartIdx);
+ const int16_t* srcV0 = srcYuv0.getCrAddr(absPartIdx);
+ const int16_t* srcU1 = srcYuv1.getCbAddr(absPartIdx);
+ const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
pixel* dstU = getCbAddr(absPartIdx);
pixel* dstV = getCrAddr(absPartIdx);
-
- primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
- primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
+ primitives.chroma[m_csp].pu[part].addAvg(srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
+ primitives.chroma[m_csp].pu[part].addAvg(srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
}
}
@@ -168,17 +186,15 @@ void Yuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size
{
const pixel* src = getLumaAddr(absPartIdx);
pixel* dst = dstYuv.getLumaAddr(absPartIdx);
- primitives.square_copy_pp[log2Size - 2](dst, dstYuv.m_size, const_cast<pixel*>(src), m_size);
+ primitives.cu[log2Size - 2].copy_pp(dst, dstYuv.m_size, src, m_size);
}
void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
{
- int part = partitionFromLog2Size(log2SizeL);
const pixel* srcU = getCbAddr(absPartIdx);
const pixel* srcV = getCrAddr(absPartIdx);
pixel* dstU = dstYuv.getCbAddr(absPartIdx);
pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-
- primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, const_cast<pixel*>(srcU), m_csize);
- primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, const_cast<pixel*>(srcV), m_csize);
+ primitives.chroma[m_csp].cu[log2SizeL - 2].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
+ primitives.chroma[m_csp].cu[log2SizeL - 2].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
}
diff --git a/source/common/yuv.h b/source/common/yuv.h
index a02987c..97cce0e 100644
--- a/source/common/yuv.h
+++ b/source/common/yuv.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2014 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -63,6 +63,9 @@ public:
// Copy from same size YUV buffer
void copyFromYuv(const Yuv& srcYuv);
+ // Copy portion of srcYuv into ME prediction buffer
+ void copyPUFromYuv(const Yuv& srcYuv, uint32_t absPartIdx, int partEnum, bool bChroma);
+
// Copy Small YUV buffer to the part of other Big YUV buffer
void copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const;
diff --git a/source/encoder/CMakeLists.txt b/source/encoder/CMakeLists.txt
index 020364f..0e995ed 100644
--- a/source/encoder/CMakeLists.txt
+++ b/source/encoder/CMakeLists.txt
@@ -3,6 +3,9 @@
if(GCC)
add_definitions(-Wno-uninitialized)
endif()
+if(MSVC)
+ add_definitions(/wd4701) # potentially uninitialized local variable 'foo' used
+endif()
add_library(encoder OBJECT ../x265.h
analysis.cpp analysis.h
diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp
index c62f5f0..40d502e 100644
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -33,8 +33,6 @@
#include "rdcost.h"
#include "encoder.h"
-#include "PPA/ppa.h"
-
using namespace x265;
/* An explanation of rate distortion levels (--rd-level)
@@ -61,9 +59,12 @@ using namespace x265;
*
* RDO selection between merge and skip
* sa8d selection of best inter mode
+ * sa8d decisions include chroma residual cost
* RDO selection between (merge/skip) / best inter mode / intra / split
*
* rd-level 4 enables RDOQuant
+ * chroma residual cost included in satd decisions, including subpel refine
+ * (as a result of --subme 3 being used by preset slow)
*
* rd-level 5,6 does RDO for each inter mode
*/
@@ -71,12 +72,15 @@ using namespace x265;
Analysis::Analysis()
{
m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
+ m_reuseIntraDataCTU = NULL;
+ m_reuseInterDataCTU = NULL;
}
bool Analysis::create(ThreadLocalData *tld)
{
m_tld = tld;
m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
+ m_bChromaSa8d = m_param->rdLevel >= 3;
int csp = m_param->internalCsp;
uint32_t cuSize = g_maxCUSize;
@@ -116,7 +120,7 @@ void Analysis::destroy()
}
}
-Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
+Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
m_slice = ctu.m_slice;
m_frame = &frame;
@@ -124,27 +128,31 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
invalidateContexts(0);
m_quant.setQPforQuant(ctu);
m_rqt[0].cur.load(initialContext);
- m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0);
+ m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
uint32_t numPartition = ctu.m_numPartitions;
- if (m_slice->m_sliceType == I_SLICE)
+ if (m_param->analysisMode)
{
- uint32_t zOrder = 0;
- if (m_param->analysisMode == X265_ANALYSIS_LOAD)
- compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder);
+ if (m_slice->m_sliceType == I_SLICE)
+ m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
else
{
- compressIntraCU(ctu, cuGeom, NULL, zOrder);
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
+ m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData;
+ reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
+ }
+ }
- if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData)
- {
- CUData *bestCU = &m_modeDepth[0].bestMode->cu;
- memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
- memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
- memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
- m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr;
- m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc;
- }
+ uint32_t zOrder = 0;
+ if (m_slice->m_sliceType == I_SLICE)
+ {
+ compressIntraCU(ctu, cuGeom, zOrder);
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
+ {
+ CUData *bestCU = &m_modeDepth[0].bestMode->cu;
+ memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+ memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+ memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
}
}
else
@@ -152,10 +160,10 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
if (!m_param->rdLevel)
{
/* In RD Level 0/1, copy source pixels into the reconstructed block so
- * they are available for intra predictions */
- m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0);
-
- compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1
+ * they are available for intra predictions */
+ m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
+
+ compressInterCU_rd0_4(ctu, cuGeom);
/* generate residual for entire CTU at once and copy to reconPic */
encodeResidue(ctu, cuGeom);
@@ -165,7 +173,15 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
else if (m_param->rdLevel <= 4)
compressInterCU_rd0_4(ctu, cuGeom);
else
- compressInterCU_rd5_6(ctu, cuGeom);
+ {
+ compressInterCU_rd5_6(ctu, cuGeom, zOrder);
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
+ {
+ CUData *bestCU = &m_modeDepth[0].bestMode->cu;
+ memcpy(&m_reuseInterDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+ memcpy(&m_reuseInterDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_predMode, sizeof(uint8_t) * numPartition);
+ }
+ }
}
return *m_modeDepth[0].bestMode;
@@ -178,7 +194,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom)
if (!md.bestMode->distortion)
/* already lossless */
return;
- else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA)
+ else if (md.bestMode->cu.isIntra(0))
{
md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
@@ -195,7 +211,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom)
}
}
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
{
uint32_t depth = cuGeom.depth;
ModeDepth& md = m_modeDepth[depth];
@@ -204,20 +220,20 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
- if (shared)
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
{
- uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
- char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
- uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
- if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
+ if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
{
m_quant.setQPforQuant(parentCTU);
- PartSize size = (PartSize)sharedPartSizes[zOrder];
+ PartSize size = (PartSize)reusePartSizes[zOrder];
Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
mode.cu.initSubCU(parentCTU, cuGeom);
- checkIntra(mode, cuGeom, size, sharedModes);
+ checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
checkBestMode(mode, depth);
if (m_bTryLossless)
@@ -227,7 +243,7 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
addSplitFlagCost(*md.bestMode, cuGeom.depth);
// increment zOrder offset to point to next best depth in sharedDepth buffer
- zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];
+ zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
mightSplit = false;
}
}
@@ -267,23 +283,23 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressIntraCU(parentCTU, childCuData, shared, zOrder);
+ compressIntraCU(parentCTU, childGeom, zOrder);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
nextContext = &nd.bestMode->contexts;
}
else
{
/* record the depth of this non-present sub-CU */
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
}
}
@@ -300,38 +316,47 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
bool Analysis::findJob(int threadId)
{
/* try to acquire a CU mode to analyze */
+ m_pmodeLock.acquire();
if (m_totalNumJobs > m_numAcquiredJobs)
{
- /* ATOMIC_INC returns the incremented value */
- int id = ATOMIC_INC(&m_numAcquiredJobs);
- if (m_totalNumJobs >= id)
- {
- parallelModeAnalysis(threadId, id - 1);
+ int id = m_numAcquiredJobs++;
+ m_pmodeLock.release();
- if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs)
- m_modeCompletionEvent.trigger();
- return true;
- }
+ ProfileScopeEvent(pmode);
+ parallelModeAnalysis(threadId, id);
+
+ m_pmodeLock.acquire();
+ if (++m_numCompletedJobs == m_totalNumJobs)
+ m_modeCompletionEvent.trigger();
+ m_pmodeLock.release();
+ return true;
}
+ else
+ m_pmodeLock.release();
+ m_meLock.acquire();
if (m_totalNumME > m_numAcquiredME)
{
- int id = ATOMIC_INC(&m_numAcquiredME);
- if (m_totalNumME >= id)
- {
- parallelME(threadId, id - 1);
+ int id = m_numAcquiredME++;
+ m_meLock.release();
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
- m_meCompletionEvent.trigger();
- return true;
- }
+ ProfileScopeEvent(pme);
+ parallelME(threadId, id);
+
+ m_meLock.acquire();
+ if (++m_numCompletedME == m_totalNumME)
+ m_meCompletionEvent.trigger();
+ m_meLock.release();
+ return true;
}
+ else
+ m_meLock.release();
return false;
}
@@ -349,18 +374,14 @@ void Analysis::parallelME(int threadId, int meId)
slave->m_slice = m_slice;
slave->m_frame = m_frame;
- PicYuv* fencPic = m_frame->m_origPicYuv;
- pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
- slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
- slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
-
- slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart);
+ slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+ slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
}
if (meId < m_slice->m_numRefIdx[0])
- slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId);
+ slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
else
- slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
+ slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}
void Analysis::parallelModeAnalysis(int threadId, int jobId)
@@ -376,8 +397,6 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
slave->m_frame = m_frame;
slave->setQP(*m_slice, m_rdCost.m_qp);
slave->invalidateContexts(0);
- if (jobId)
- slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride);
}
ModeDepth& md = m_modeDepth[m_curGeom->depth];
@@ -389,13 +408,15 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
case 0:
if (slave != this)
slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
- slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom);
+ slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
if (m_param->rdLevel > 2)
slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
break;
case 1:
slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
+ if (m_slice->m_sliceType == B_SLICE)
+ slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
break;
case 2:
@@ -446,6 +467,13 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
case 1:
slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
+ md.pred[PRED_BIDIR].rdCost = MAX_INT64;
+ if (m_slice->m_sliceType == B_SLICE)
+ {
+ slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
+ if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+ slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
+ }
break;
case 2:
@@ -499,6 +527,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
/* Initialize all prediction CUs based on parentCTU */
md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+ md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
if (m_param->bEnableRectInter)
@@ -520,12 +549,14 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
}
+ m_pmodeLock.acquire();
m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
m_numAcquiredJobs = !bTryIntra;
m_numCompletedJobs = m_numAcquiredJobs;
m_curGeom = &cuGeom;
m_bJobsQueued = true;
JobProvider::enqueue();
+ m_pmodeLock.release();
for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
m_pool->pokeIdleThread();
@@ -572,17 +603,26 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
if (m_param->rdLevel > 2)
{
- /* encode best inter */
- for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ /* RD selection between merge, inter, bidir and intra */
+ if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
{
- prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
- motionCompensation(bestInter->predYuv, false, true);
+ for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ {
+ prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+ motionCompensation(bestInter->predYuv, false, true);
+ }
}
encodeResAndCalcRdInterCU(*bestInter, cuGeom);
-
- /* RD selection between merge, inter and intra */
checkBestMode(*bestInter, depth);
+ /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+ md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+ {
+ encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+ checkBestMode(md.pred[PRED_BIDIR], depth);
+ }
+
if (bTryIntra)
checkBestMode(md.pred[PRED_INTRA], depth);
}
@@ -591,6 +631,9 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
md.bestMode = bestInter;
+ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+ md.bestMode = &md.pred[PRED_BIDIR];
+
if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
{
md.bestMode = &md.pred[PRED_INTRA];
@@ -614,6 +657,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
m_modeCompletionEvent.wait();
checkBestMode(md.pred[PRED_2Nx2N], depth);
+ checkBestMode(md.pred[PRED_BIDIR], depth);
if (m_param->bEnableRectInter)
{
@@ -640,7 +684,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
- checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+ checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
checkBestMode(md.pred[PRED_INTRA], depth);
}
@@ -655,7 +699,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
bool bNoSplit = false;
if (md.bestMode)
{
- bNoSplit = !!md.bestMode->cu.isSkipped(0);
+ bNoSplit = md.bestMode->cu.isSkipped(0);
if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
}
@@ -674,22 +718,22 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressInterCU_dist(parentCTU, childCuData);
+ compressInterCU_dist(parentCTU, childGeom);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
nextContext = &nd.bestMode->contexts;
}
else
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
}
nextContext->store(splitPred->contexts);
@@ -701,10 +745,10 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
checkBestMode(*splitPred, depth);
}
- if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+ if (mightNotSplit)
{
/* early-out statistics */
- FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+ FrameData& curEncData = *m_frame->m_encData;
FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
cuStat.count[depth] += 1;
@@ -716,7 +760,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -734,24 +778,9 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
{
bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
- /* Initialize all prediction CUs based on parentCTU */
- md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+ /* Compute Merge Cost */
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
- if (m_param->bEnableRectInter)
- {
- md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
- }
- if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
- {
- md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
- md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
- }
-
- /* Compute Merge Cost */
checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
bool earlyskip = false;
@@ -760,14 +789,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
if (!earlyskip)
{
+ md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);
- Mode *bestInter = &md.pred[PRED_2Nx2N];
+ if (m_slice->m_sliceType == B_SLICE)
+ {
+ md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+ checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+ }
+
+ Mode *bestInter = &md.pred[PRED_2Nx2N];
if (m_param->bEnableRectInter)
{
+ md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_Nx2N];
+
+ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_2NxN];
@@ -789,18 +828,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
if (bHor)
{
+ md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_2NxnU];
+
+ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_2NxnD];
}
if (bVer)
{
+ md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_nLx2N];
+
+ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
bestInter = &md.pred[PRED_nRx2N];
@@ -810,37 +855,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
if (m_param->rdLevel >= 3)
{
/* Calculate RD cost of best inter option */
- for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
{
- prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
- motionCompensation(bestInter->predYuv, false, true);
+ for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+ {
+ prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+ motionCompensation(bestInter->predYuv, false, true);
+ }
}
-
encodeResAndCalcRdInterCU(*bestInter, cuGeom);
+ checkBestMode(*bestInter, depth);
- if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost)
- md.bestMode = bestInter;
+ /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+ md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+ {
+ encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+ checkBestMode(md.pred[PRED_BIDIR], depth);
+ }
if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
md.bestMode->sa8dCost == MAX_INT64)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
- checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+ checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
- if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost)
- md.bestMode = &md.pred[PRED_INTRA];
+ checkBestMode(md.pred[PRED_INTRA], depth);
}
}
else
{
- /* SA8D choice between merge/skip, inter, and intra */
+ /* SA8D choice between merge/skip, inter, bidir, and intra */
if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
md.bestMode = bestInter;
+ if (m_slice->m_sliceType == B_SLICE &&
+ md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+ md.bestMode = &md.pred[PRED_BIDIR];
+
if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
- checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+ checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
md.bestMode = &md.pred[PRED_INTRA];
}
@@ -854,7 +910,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
/* prediction already generated for this CU, and if rd level
* is not 0, it is already fully encoded */
}
- else if (md.bestMode->cu.m_predMode[0] == MODE_INTER)
+ else if (md.bestMode->cu.isInter(0))
{
for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
{
@@ -865,8 +921,23 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
else if (m_param->rdLevel == 1)
{
- m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
- generateCoeffRecon(*md.bestMode, cuGeom);
+ /* generate recon pixels with no rate distortion considerations */
+ CUData& cu = md.bestMode->cu;
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getInterTUQtDepthRange(tuDepthRange, 0);
+
+ m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
+ residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
+ if (cu.getQtRootCbf(0))
+ md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
+ else
+ {
+ md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
+ if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
+ cu.setPredModeSubParts(MODE_SKIP);
+ }
}
}
else
@@ -874,7 +945,19 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
if (m_param->rdLevel == 2)
encodeIntraInInter(*md.bestMode, cuGeom);
else if (m_param->rdLevel == 1)
- generateCoeffRecon(*md.bestMode, cuGeom);
+ {
+ /* generate recon pixels with no rate distortion considerations */
+ CUData& cu = md.bestMode->cu;
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+ residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
+ getBestIntraModeChroma(*md.bestMode, cuGeom);
+ residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+ md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
+ }
}
}
} // !earlyskip
@@ -889,7 +972,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
bool bNoSplit = false;
if (md.bestMode)
{
- bNoSplit = !!md.bestMode->cu.isSkipped(0);
+ bNoSplit = md.bestMode->cu.isSkipped(0);
if (mightSplit && depth && depth >= minDepth && !bNoSplit)
bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
}
@@ -908,54 +991,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressInterCU_rd0_4(parentCTU, childCuData);
+ compressInterCU_rd0_4(parentCTU, childGeom);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
if (m_param->rdLevel)
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
else
- nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
if (m_param->rdLevel > 1)
nextContext = &nd.bestMode->contexts;
}
else
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ splitCU->setEmptyPart(childGeom, subPartIdx);
}
nextContext->store(splitPred->contexts);
if (mightNotSplit)
addSplitFlagCost(*splitPred, cuGeom.depth);
- else if (m_param->rdLevel <= 1)
- splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
- else
+ else if (m_param->rdLevel > 1)
updateModeCost(*splitPred);
+ else
+ splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
if (!md.bestMode)
md.bestMode = splitPred;
- else if (m_param->rdLevel >= 1)
- {
- if (splitPred->rdCost < md.bestMode->rdCost)
- md.bestMode = splitPred;
- }
- else
- {
- if (splitPred->sa8dCost < md.bestMode->sa8dCost)
- md.bestMode = splitPred;
- }
+ else if (m_param->rdLevel > 1)
+ checkBestMode(*splitPred, cuGeom.depth);
+ else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
+ md.bestMode = splitPred;
}
- if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+ if (mightNotSplit)
{
/* early-out statistics */
- FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+ FrameData& curEncData = *m_frame->m_encData;
FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
cuStat.count[depth] += 1;
@@ -967,10 +1044,10 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
-void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
+void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder)
{
uint32_t depth = cuGeom.depth;
ModeDepth& md = m_modeDepth[depth];
@@ -979,29 +1056,80 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
- if (mightNotSplit)
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
{
- for (int i = 0; i < MAX_PRED_TYPES; i++)
- md.pred[i].cu.initSubCU(parentCTU, cuGeom);
+ uint8_t* reuseDepth = &m_reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ uint8_t* reuseModes = &m_reuseInterDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+ if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx && reuseModes[zOrder] == MODE_SKIP)
+ {
+ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
+ md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
+ checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+
+ if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
+ (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
+ {
+ md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
+ checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
+ checkBestMode(md.pred[PRED_INTRA], depth);
+ if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
+ {
+ md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
+ checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, &reuseModes[zOrder]);
+ checkBestMode(md.pred[PRED_INTRA_NxN], depth);
+ }
+ }
+
+ if (m_bTryLossless)
+ tryLossless(cuGeom);
+
+ if (mightSplit)
+ addSplitFlagCost(*md.bestMode, cuGeom.depth);
+
+ // increment zOrder offset to point to next best depth in sharedDepth buffer
+ zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
+
+ mightSplit = false;
+ mightNotSplit = false;
+ }
+ }
+
+ if (mightNotSplit)
+ {
+ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
+ md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
if (!earlySkip)
{
+ md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+ if (m_slice->m_sliceType == B_SLICE)
+ {
+ md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+ checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+ if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+ {
+ encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+ checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
+ }
+ }
+
if (m_param->bEnableRectInter)
{
- // Nx2N rect
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
}
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
}
@@ -1027,11 +1155,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
{
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
}
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
}
@@ -1040,11 +1170,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
{
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
}
if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
{
+ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
}
@@ -1054,11 +1186,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
(!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
{
+ md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
checkBestMode(md.pred[PRED_INTRA], depth);
if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
{
+ md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
checkBestMode(md.pred[PRED_INTRA_NxN], depth);
}
@@ -1087,21 +1221,24 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
{
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+ m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressInterCU_rd5_6(parentCTU, childCuData);
+ compressInterCU_rd5_6(parentCTU, childGeom, zOrder);
// Save best CU and pred data for this sub CU
- splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+ splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+ nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
nextContext = &nd.bestMode->contexts;
}
else
- splitCU->setEmptyPart(childCuData, subPartIdx);
+ {
+ splitCU->setEmptyPart(childGeom, subPartIdx);
+ zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
+ }
}
nextContext->store(splitPred->contexts);
if (mightNotSplit)
@@ -1117,7 +1254,7 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
/* Copy best data to encData CTU and recon */
md.bestMode->cu.copyToPic(depth);
if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1149,6 +1286,7 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
bestPred->sa8dCost = MAX_INT64;
int bestSadCand = -1;
int sizeIdx = cuGeom.log2CUSize - 2;
+
for (uint32_t i = 0; i < maxNumMergeCand; ++i)
{
if (m_bFrameParallel &&
@@ -1159,16 +1297,20 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
tempPred->cu.m_interDir[0] = interDirNeighbours[i];
tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
- tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+ tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
- tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
+ tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
- // do MC only for Luma part
prepMotionCompensation(tempPred->cu, cuGeom, 0);
- motionCompensation(tempPred->predYuv, true, false);
+ motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
- tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
+ tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
+ if (m_bChromaSa8d)
+ {
+ tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
+ tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
+ }
tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
if (tempPred->sa8dCost < bestPred->sa8dCost)
@@ -1183,8 +1325,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
return;
/* calculate the motion compensation for chroma for the best mode selected */
- prepMotionCompensation(bestPred->cu, cuGeom, 0);
- motionCompensation(bestPred->predYuv, false, true);
+ if (!m_bChromaSa8d) /* Chroma MC was done above */
+ {
+ prepMotionCompensation(bestPred->cu, cuGeom, 0);
+ motionCompensation(bestPred->predYuv, false, true);
+ }
if (m_param->rdLevel)
{
@@ -1197,9 +1342,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
- tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+ tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
- tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+ tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
tempPred->sa8dCost = bestPred->sa8dCost;
tempPred->predYuv.copyFromYuv(bestPred->predYuv);
@@ -1213,9 +1358,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
/* broadcast sets of MV field data */
bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
- bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
- bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1269,10 +1414,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
tempPred->cu.m_interDir[0] = interDirNeighbours[i];
tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
- tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+ tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
- tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
- tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */
+ tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+ tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
prepMotionCompensation(tempPred->cu, cuGeom, 0);
motionCompensation(tempPred->predYuv, true, true);
@@ -1302,10 +1447,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
tempPred->cu.m_interDir[0] = interDirNeighbours[i];
tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
- tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+ tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
- tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
- tempPred->cu.setSkipFlagSubParts(false);
+ tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+ tempPred->cu.setPredModeSubParts(MODE_INTER);
tempPred->predYuv.copyFromYuv(bestPred->predYuv);
}
@@ -1324,9 +1469,9 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
- bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
- bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
+ bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
}
}
@@ -1335,14 +1480,46 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
interMode.initCosts();
interMode.cu.setPartSizeSubParts(partSize);
interMode.cu.setPredModeSubParts(MODE_INTER);
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
- if (predInterSearch(interMode, cuGeom, false, false))
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+ {
+ for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
+ {
+ MotionData* bestME = interMode.bestME[part];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ bestME[i].ref = *reuseRef;
+ reuseRef++;
+ }
+ }
+ }
+ if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
{
/* predInterSearch sets interMode.sa8dBits */
const Yuv& fencYuv = *interMode.fencYuv;
Yuv& predYuv = interMode.predYuv;
- interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+ int part = partitionFromLog2Size(cuGeom.log2CUSize);
+ interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+ if (m_bChromaSa8d)
+ {
+ interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
+ interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
+ }
interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
+
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+ {
+ for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+ {
+ MotionData* bestME = interMode.bestME[puIdx];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ *reuseRef = bestME[i].ref;
+ reuseRef++;
+ }
+ }
+ }
}
else
{
@@ -1356,11 +1533,37 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
interMode.initCosts();
interMode.cu.setPartSizeSubParts(partSize);
interMode.cu.setPredModeSubParts(MODE_INTER);
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+ {
+ for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+ {
+ MotionData* bestME = interMode.bestME[puIdx];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ bestME[i].ref = *reuseRef;
+ reuseRef++;
+ }
+ }
+ }
if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
{
/* predInterSearch sets interMode.sa8dBits, but this is ignored */
encodeResAndCalcRdInterCU(interMode, cuGeom);
+
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+ {
+ for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+ {
+ MotionData* bestME = interMode.bestME[puIdx];
+ for (int32_t i = 0; i < numPredDir; i++)
+ {
+ *reuseRef = bestME[i].ref;
+ reuseRef++;
+ }
+ }
+ }
}
else
{
@@ -1369,221 +1572,145 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
}
}
-/* Note that this function does not save the best intra prediction, it must
- * be generated later. It records the best mode in the cu */
-void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom)
+void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
- CUData& cu = intraMode.cu;
- uint32_t depth = cu.m_cuDepth[0];
+ CUData& cu = bidir2Nx2N.cu;
- cu.setPartSizeSubParts(SIZE_2Nx2N);
- cu.setPredModeSubParts(MODE_INTRA);
-
- uint32_t initTrDepth = 0;
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
- uint32_t tuSize = 1 << log2TrSize;
- const uint32_t absPartIdx = 0;
-
- // Reference sample smoothing
- initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
-
- pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0];
- uint32_t stride = m_modeDepth[depth].fencYuv.m_size;
-
- pixel *above = m_refAbove + tuSize - 1;
- pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel *left = m_refLeft + tuSize - 1;
- pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
- int sad, bsad;
- uint32_t bits, bbits, mode, bmode;
- uint64_t cost, bcost;
-
- // 33 Angle modes once
- ALIGN_VAR_32(pixel, bufScale[32 * 32]);
- ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
- ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
- int scaleTuSize = tuSize;
- int scaleStride = stride;
- int costShift = 0;
- int sizeIdx = log2TrSize - 2;
-
- if (tuSize > 32)
+ if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
{
- // origin is 64x64, we scale to 32x32 and setup required parameters
- primitives.scale2D_64to32(bufScale, fenc, stride);
- fenc = bufScale;
-
- // reserve space in case primitives need to store data in above
- // or left buffers
- pixel _above[4 * 32 + 1];
- pixel _left[4 * 32 + 1];
- pixel *aboveScale = _above + 2 * 32;
- pixel *leftScale = _left + 2 * 32;
- aboveScale[0] = leftScale[0] = above[0];
- primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
- primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
-
- scaleTuSize = 32;
- scaleStride = 32;
- costShift = 2;
- sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
- // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
- above = aboveScale;
- left = leftScale;
- aboveFiltered = aboveScale;
- leftFiltered = leftScale;
+ bidir2Nx2N.sa8dCost = MAX_INT64;
+ bidir2Nx2N.rdCost = MAX_INT64;
+ return;
}
- pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
- int predsize = scaleTuSize * scaleTuSize;
-
- m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
-
- /* there are three cost tiers for intra modes:
- * pred[0] - mode probable, least cost
- * pred[1], pred[2] - less probable, slightly more cost
- * non-mpm modes - all cost the same (rbits) */
- uint64_t mpms;
- uint32_t preds[3];
- uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
-
- // DC
- primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
- bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
- bmode = mode = DC_IDX;
- bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
- bcost = m_rdCost.calcRdSADCost(bsad, bbits);
-
- pixel *abovePlanar = above;
- pixel *leftPlanar = left;
-
- if (tuSize & (8 | 16 | 32))
+ const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
+ MV mvzero(0, 0);
+ int partEnum = cuGeom.log2CUSize - 2;
+
+ bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
+ bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
+ MotionData* bestME = bidir2Nx2N.bestME[0];
+ int ref0 = bestME[0].ref;
+ MV mvp0 = bestME[0].mvp;
+ int mvpIdx0 = bestME[0].mvpIdx;
+ int ref1 = bestME[1].ref;
+ MV mvp1 = bestME[1].mvp;
+ int mvpIdx1 = bestME[1].mvpIdx;
+
+ bidir2Nx2N.initCosts();
+ cu.setPartSizeSubParts(SIZE_2Nx2N);
+ cu.setPredModeSubParts(MODE_INTER);
+ cu.setPUInterDir(3, 0, 0);
+ cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
+ cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
+ cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
+ cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
+ cu.m_mergeFlag[0] = 0;
+
+ /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
+ cu.setPUMv(0, bestME[0].mv, 0, 0);
+ cu.m_mvd[0][0] = bestME[0].mv - mvp0;
+
+ cu.setPUMv(1, bestME[1].mv, 0, 0);
+ cu.m_mvd[1][0] = bestME[1].mv - mvp1;
+
+ prepMotionCompensation(cu, cuGeom, 0);
+ motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+
+ int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
+ if (m_bChromaSa8d)
{
- abovePlanar = aboveFiltered;
- leftPlanar = leftFiltered;
+ /* Add in chroma distortion */
+ sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
+ sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
}
+ bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
- // PLANAR
- primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
- sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
- mode = PLANAR_IDX;
- bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
- cost = m_rdCost.calcRdSADCost(sad, bits);
- COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
-
- // Transpose NxN
- primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
-
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
- bool modeHor;
- pixel *cmp;
- intptr_t srcStride;
-
-#define TRY_ANGLE(angle) \
- modeHor = angle < 18; \
- cmp = modeHor ? bufTrans : fenc; \
- srcStride = modeHor ? scaleTuSize : scaleStride; \
- sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
- bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
- cost = m_rdCost.calcRdSADCost(sad, bits)
-
- if (m_param->bEnableFastIntra)
+ bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
+ if (bTryZero)
+ {
+ /* Do not try zero MV if unidir motion predictors are beyond
+ * valid search area */
+ MV mvmin, mvmax;
+ int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
+ setSearchRange(cu, mvzero, merange, mvmin, mvmax);
+ mvmax.y += 2; // there is some pad for subpel refine
+ mvmin <<= 2;
+ mvmax <<= 2;
+
+ bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
+ }
+ if (bTryZero)
{
- int asad = 0;
- uint32_t lowmode, highmode, amode = 5, abits = 0;
- uint64_t acost = MAX_INT64;
+ /* Estimate cost of BIDIR using coincident blocks */
+ Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
- /* pick the best angle, sampling at distance of 5 */
- for (mode = 5; mode < 35; mode += 5)
- {
- TRY_ANGLE(mode);
- COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
- }
+ int zsa8d;
- /* refine best angle at distance 2, then distance 1 */
- for (uint32_t dist = 2; dist >= 1; dist--)
+ if (m_bChromaSa8d)
{
- lowmode = amode - dist;
- highmode = amode + dist;
+ cu.m_mv[0][0] = mvzero;
+ cu.m_mv[1][0] = mvzero;
- X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
- TRY_ANGLE(lowmode);
- COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+ prepMotionCompensation(cu, cuGeom, 0);
+ motionCompensation(tmpPredYuv, true, true);
- X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
- TRY_ANGLE(highmode);
- COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
+ zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
}
-
- if (amode == 33)
+ else
{
- TRY_ANGLE(34);
- COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
- }
+ pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+ pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+ intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
- COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
- }
- else // calculate and search all intra prediction angles for lowest cost
- {
- for (mode = 2; mode < 35; mode++)
- {
- TRY_ANGLE(mode);
- COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
+ zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
- }
-
- cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth);
- intraMode.initCosts();
- intraMode.totalBits = bbits;
- intraMode.distortion = bsad;
- intraMode.sa8dCost = bcost;
- intraMode.sa8dBits = bbits;
-}
-
-void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
-{
- CUData& cu = intraMode.cu;
- Yuv* reconYuv = &intraMode.reconYuv;
- Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv;
-
- X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
- X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
-
- m_quant.setQPforQuant(cu);
-
- uint32_t tuDepthRange[2];
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
- m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
+ uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+ uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+ uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
- Cost icosts;
- codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
- extractIntraResultQT(cu, *reconYuv, 0, 0);
+ /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
+ checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
+ checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);
- intraMode.distortion = icosts.distortion;
- intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
+ uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ zcost = zsa8d + m_rdCost.getCost(zbits);
- m_entropyCoder.resetBits();
- if (m_slice->m_pps->bTransquantBypassEnabled)
- m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
- m_entropyCoder.codeSkipFlag(cu, 0);
- m_entropyCoder.codePredMode(cu.m_predMode[0]);
- m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
- m_entropyCoder.codePredInfo(cu, 0);
- intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+ if (zcost < bidir2Nx2N.sa8dCost)
+ {
+ bidir2Nx2N.sa8dBits = zbits;
+ bidir2Nx2N.sa8dCost = zcost;
- bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange);
+ cu.setPUMv(0, mvzero, 0, 0);
+ cu.m_mvd[0][0] = mvzero - mvp0;
+ cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
- intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
- intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
- if (m_rdCost.m_psyRd)
- intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+ cu.setPUMv(1, mvzero, 0, 0);
+ cu.m_mvd[1][0] = mvzero - mvp1;
+ cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
- m_entropyCoder.store(intraMode.contexts);
- updateModeCost(intraMode);
+ if (m_bChromaSa8d)
+ /* real MC was already performed */
+ bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
+ else
+ {
+ prepMotionCompensation(cu, cuGeom, 0);
+ motionCompensation(bidir2Nx2N.predYuv, true, true);
+ }
+ }
+ else if (m_bChromaSa8d)
+ {
+ /* recover overwritten motion vectors */
+ cu.m_mv[0][0] = bestME[0].mv;
+ cu.m_mv[1][0] = bestME[1].mv;
+ }
+ }
}
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
@@ -1592,9 +1719,9 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
- encodeResidue(ctu, childCuData);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
+ encodeResidue(ctu, childGeom);
}
return;
}
@@ -1602,29 +1729,30 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
uint32_t absPartIdx = cuGeom.encodeIdx;
int sizeIdx = cuGeom.log2CUSize - 2;
- Yuv& fencYuv = m_modeDepth[0].fencYuv;
-
/* reuse the bestMode data structures at the current depth */
Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
- Yuv& reconYuv = bestMode->reconYuv;
CUData& cu = bestMode->cu;
cu.copyFromPic(ctu, cuGeom);
m_quant.setQPforQuant(cu);
- if (cu.m_predMode[0] == MODE_INTRA)
+ Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
+ if (cuGeom.depth)
+ m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
+ X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
+
+ if (cu.isIntra(0))
{
uint32_t tuDepthRange[2];
cu.getIntraTUQtDepthRange(tuDepthRange, 0);
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
- residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange);
+ residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
getBestIntraModeChroma(*bestMode, cuGeom);
residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
}
- else if (cu.m_predMode[0] == MODE_INTER)
+ else // if (cu.isInter(0))
{
- X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n");
+ X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
/* Calculate residual for current CU part into depth sized resiYuv */
@@ -1636,75 +1764,56 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
pixel* predU = predYuv.getCbAddr(absPartIdx);
pixel* predV = predYuv.getCrAddr(absPartIdx);
- primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
- fencYuv.getLumaAddr(absPartIdx), predY,
- fencYuv.m_size, predYuv.m_size);
+ primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
+ fencYuv.m_buf[0], predY,
+ fencYuv.m_size, predYuv.m_size);
- primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
- fencYuv.getCbAddr(absPartIdx), predU,
- fencYuv.m_csize, predYuv.m_csize);
+ primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
+ fencYuv.m_buf[1], predU,
+ fencYuv.m_csize, predYuv.m_csize);
- primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
- fencYuv.getCrAddr(absPartIdx), predV,
- fencYuv.m_csize, predYuv.m_csize);
+ primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
+ fencYuv.m_buf[2], predV,
+ fencYuv.m_csize, predYuv.m_csize);
uint32_t tuDepthRange[2];
cu.getInterTUQtDepthRange(tuDepthRange, 0);
- residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+ residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
- PicYuv& reconPicYuv = *m_frame->m_reconPicYuv;
- if (cu.getQtRootCbf(0)) // TODO: split to each component
- {
- /* residualTransformQuantInter() wrote transformed residual back into
- * resiYuv. Generate the recon pixels by adding it to the prediction */
-
- primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size,
- predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
- primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize,
- predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
- primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize,
- predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
-
- /* copy the reconstructed part to the recon pic for later intra
- * predictions */
- reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx);
- }
+ /* residualTransformQuantInter() wrote transformed residual back into
+ * resiYuv. Generate the recon pixels by adding it to the prediction */
+
+ PicYuv& reconPic = *m_frame->m_reconPic;
+ if (cu.m_cbf[0][0])
+ primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
+ predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
else
- {
- /* copy the prediction pixels to the recon pic for later intra
- * predictions */
-
- primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride,
- predY, predYuv.m_size);
- primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
- predU, predYuv.m_csize);
- primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
- predV, predYuv.m_csize);
- }
+ primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
+ predY, predYuv.m_size);
+
+ if (cu.m_cbf[1][0])
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+ else
+ primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predU, predYuv.m_csize);
+
+ if (cu.m_cbf[2][0])
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+ else
+ primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predV, predYuv.m_csize);
}
- /* else if (cu.m_predMode[0] == MODE_NONE) {} */
checkDQP(cu, cuGeom);
cu.updatePic(cuGeom.depth);
}
-/* check whether current try is the best with identifying the depth of current try */
-void Analysis::checkBestMode(Mode& mode, uint32_t depth)
-{
- ModeDepth& md = m_modeDepth[depth];
- if (md.bestMode)
- {
- if (mode.rdCost < md.bestMode->rdCost)
- md.bestMode = &mode;
- }
- else
- md.bestMode = &mode;
-}
-
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
if (m_param->rdLevel >= 3)
@@ -1817,7 +1926,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom
* each quantity */
uint32_t depth = cuGeom.depth;
- FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+ FrameData& curEncData = *m_frame->m_encData;
FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
uint64_t cuCount = cuStat.count[depth];
@@ -1855,7 +1964,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom
}
// give 60% weight to all CU's and 40% weight to neighbour CU's
- if (neighCost + cuCount)
+ if (neighCount + cuCount)
{
uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
diff --git a/source/encoder/analysis.h b/source/encoder/analysis.h
index 404cc90..bb7fc87 100644
--- a/source/encoder/analysis.h
+++ b/source/encoder/analysis.h
@@ -49,6 +49,7 @@ public:
PRED_SKIP,
PRED_INTRA,
PRED_2Nx2N,
+ PRED_BIDIR,
PRED_Nx2N,
PRED_2NxN,
PRED_SPLIT,
@@ -71,11 +72,16 @@ public:
ModeDepth m_modeDepth[NUM_CU_DEPTH];
bool m_bTryLossless;
+ bool m_bChromaSa8d;
+ /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
+ analysis_intra_data* m_reuseIntraDataCTU;
+ analysis_inter_data* m_reuseInterDataCTU;
+ int32_t* reuseRef;
Analysis();
bool create(ThreadLocalData* tld);
void destroy();
- Search::Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
+ Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
protected:
@@ -83,18 +89,19 @@ protected:
int m_totalNumJobs;
volatile int m_numAcquiredJobs;
volatile int m_numCompletedJobs;
+ Lock m_pmodeLock;
Event m_modeCompletionEvent;
bool findJob(int threadId);
void parallelModeAnalysis(int threadId, int jobId);
void parallelME(int threadId, int meId);
/* full analysis for an I-slice CU */
- void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* sdata, uint32_t &zOrder);
+ void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder);
/* full analysis for a P or B slice CU */
void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom);
void compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom);
- void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom);
+ void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder);
/* measure merge and skip */
void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
@@ -104,20 +111,36 @@ protected:
void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly);
- /* measure intra options */
- void checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom);
- void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+ void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
/* encode current bestMode losslessly, pick best RD cost */
void tryLossless(const CUGeom& cuGeom);
- void checkDQP(CUData& cu, const CUGeom& cuGeom);
+ /* add the RD cost of coding a split flag (0 or 1) to the given mode */
void addSplitFlagCost(Mode& mode, uint32_t depth);
- void checkBestMode(Mode& mode, uint32_t depth);
+
+ /* update CBF flags and QP values to be internally consistent */
+ void checkDQP(CUData& cu, const CUGeom& cuGeom);
+
+ /* work-avoidance heuristics for RD levels < 5 */
uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
+ /* generate residual and recon pixels for an entire CTU recursively (RD0) */
void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
+
+ /* check whether current mode is the new best */
+ inline void checkBestMode(Mode& mode, uint32_t depth)
+ {
+ ModeDepth& md = m_modeDepth[depth];
+ if (md.bestMode)
+ {
+ if (mode.rdCost < md.bestMode->rdCost)
+ md.bestMode = &mode;
+ }
+ else
+ md.bestMode = &mode;
+ }
};
struct ThreadLocalData
diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp
index 66f8e28..74cee73 100644
--- a/source/encoder/api.cpp
+++ b/source/encoder/api.cpp
@@ -73,7 +73,11 @@ x265_encoder *x265_encoder_open(x265_param *p)
determineLevel(*param, encoder->m_vps);
encoder->create();
- encoder->init();
+ if (encoder->m_aborted)
+ {
+ delete encoder;
+ return NULL;
+ }
x265_print_params(param);
@@ -178,7 +182,6 @@ void x265_encoder_close(x265_encoder *enc)
extern "C"
void x265_cleanup(void)
{
- destroyROM();
BitCost::destroy();
}
@@ -198,13 +201,12 @@ void x265_picture_init(x265_param *param, x265_picture *pic)
pic->forceqp = X265_QP_AUTO;
if (param->analysisMode)
{
- uint32_t numPartitions = 1 << (g_maxFullDepth * 2);
uint32_t widthInCU = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
uint32_t heightInCU = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
uint32_t numCUsInFrame = widthInCU * heightInCU;
pic->analysisData.numCUsInFrame = numCUsInFrame;
- pic->analysisData.numPartitions = numPartitions;
+ pic->analysisData.numPartitions = NUM_CU_PARTITIONS;
}
}
@@ -213,37 +215,3 @@ void x265_picture_free(x265_picture *p)
{
return x265_free(p);
}
-
-int x265_alloc_analysis_data(x265_picture* pic)
-{
- CHECKED_MALLOC(pic->analysisData.interData, x265_inter_data, pic->analysisData.numCUsInFrame * 85);
- CHECKED_MALLOC(pic->analysisData.intraData, x265_intra_data, 1);
- pic->analysisData.intraData->cuAddr = NULL;
- pic->analysisData.intraData->depth = NULL;
- pic->analysisData.intraData->modes = NULL;
- pic->analysisData.intraData->partSizes = NULL;
- pic->analysisData.intraData->poc = NULL;
- CHECKED_MALLOC(pic->analysisData.intraData->depth, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->modes, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->partSizes, char, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->cuAddr, uint32_t, pic->analysisData.numCUsInFrame);
- CHECKED_MALLOC(pic->analysisData.intraData->poc, int, pic->analysisData.numCUsInFrame);
- return 0;
-
-fail:
- x265_free_analysis_data(pic);
- return -1;
-}
-
-void x265_free_analysis_data(x265_picture* pic)
-{
- X265_FREE(pic->analysisData.interData);
- pic->analysisData.interData = NULL;
- X265_FREE(pic->analysisData.intraData->depth);
- X265_FREE(pic->analysisData.intraData->modes);
- X265_FREE(pic->analysisData.intraData->partSizes);
- X265_FREE(pic->analysisData.intraData->cuAddr);
- X265_FREE(pic->analysisData.intraData->poc);
- X265_FREE(pic->analysisData.intraData);
- pic->analysisData.intraData = NULL;
-}
diff --git a/source/encoder/bitcost.h b/source/encoder/bitcost.h
index d28486b..674dffa 100644
--- a/source/encoder/bitcost.h
+++ b/source/encoder/bitcost.h
@@ -35,7 +35,7 @@ class BitCost
{
public:
- BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0) {}
+ BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0), m_mvp(0) {}
void setQP(unsigned int qp);
diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp
index 1c82a76..9ca1d04 100644
--- a/source/encoder/dpb.cpp
+++ b/source/encoder/dpb.cpp
@@ -52,8 +52,8 @@ DPB::~DPB()
FrameData* next = m_picSymFreeList->m_freeListNext;
m_picSymFreeList->destroy();
- m_picSymFreeList->m_reconPicYuv->destroy();
- delete m_picSymFreeList->m_reconPicYuv;
+ m_picSymFreeList->m_reconPic->destroy();
+ delete m_picSymFreeList->m_reconPic;
delete m_picSymFreeList;
m_picSymFreeList = next;
@@ -82,7 +82,7 @@ void DPB::recycleUnreferenced()
curFrame->m_encData->m_freeListNext = m_picSymFreeList;
m_picSymFreeList = curFrame->m_encData;
curFrame->m_encData = NULL;
- curFrame->m_reconPicYuv = NULL;
+ curFrame->m_reconPic = NULL;
}
}
}
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
index 44e82af..45ad5b1 100644
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -51,6 +51,8 @@ static const char *summaryCSVHeader =
"B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
"Version\n";
+const char* defaultAnalysisFileName = "x265_analysis.dat";
+
using namespace x265;
Encoder::Encoder()
@@ -78,11 +80,12 @@ Encoder::Encoder()
m_buOffsetC = NULL;
m_threadPool = 0;
m_numThreadLocalData = 0;
+ m_analysisFile = NULL;
}
void Encoder::create()
{
- if (!primitives.sad[0])
+ if (!primitives.pu[0].sad)
{
// this should be an impossible condition when using our public API, and indicates a serious bug.
x265_log(m_param, X265_LOG_ERROR, "Primitives must be initialized before encoder is created\n");
@@ -92,9 +95,10 @@ void Encoder::create()
x265_param* p = m_param;
int rows = (p->sourceHeight + p->maxCUSize - 1) >> g_log2Size[p->maxCUSize];
+ int cols = (p->sourceWidth + p->maxCUSize - 1) >> g_log2Size[p->maxCUSize];
- // Do not allow WPP if only one row, it is pointless and unstable
- if (rows == 1)
+ // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
+ if (rows == 1 || cols < 3)
p->bEnableWavefront = 0;
int poolThreadCount = p->poolNumThreads ? p->poolNumThreads : getCpuCount();
@@ -131,8 +135,8 @@ void Encoder::create()
int cpuCount = getCpuCount();
if (!p->bEnableWavefront)
p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2);
- else if (cpuCount > 32)
- p->frameNumThreads = 6; // dual-socket 10-core IvyBridge or higher
+ else if (cpuCount >= 32)
+ p->frameNumThreads = (p->sourceHeight > 2000) ? 8 : 6; // dual-socket 10-core IvyBridge or higher
else if (cpuCount >= 16)
p->frameNumThreads = 5; // 8 HT cores, or dual socket
else if (cpuCount >= 8)
@@ -194,31 +198,77 @@ void Encoder::create()
m_csvfpt = fopen(m_param->csvfn, "r");
if (m_csvfpt)
{
- // file already exists, re-open for append
+ /* file already exists, re-open for append */
fclose(m_csvfpt);
m_csvfpt = fopen(m_param->csvfn, "ab");
}
else
{
- // new CSV file, write header
+ /* new CSV file, write header */
m_csvfpt = fopen(m_param->csvfn, "wb");
if (m_csvfpt)
{
- if (m_param->logLevel >= X265_LOG_DEBUG)
+ if (m_param->logLevel >= X265_LOG_FRAME)
{
fprintf(m_csvfpt, "Encode Order, Type, POC, QP, Bits, ");
if (m_param->rc.rateControlMode == X265_RC_CRF)
fprintf(m_csvfpt, "RateFactor, ");
- fprintf(m_csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB), "
- "Encoding time, Elapsed time, List 0, List 1\n");
+ fprintf(m_csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB), List 0, List 1");
+ /* detailed performance statistics */
+ fprintf(m_csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks\n");
}
else
fputs(summaryCSVHeader, m_csvfpt);
}
}
+
+ if (!m_csvfpt)
+ {
+ x265_log(m_param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", m_param->csvfn);
+ m_aborted = true;
+ }
+ }
+
+ if (m_frameEncoder)
+ {
+ int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
+ int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
+ for (int i = 0; i < m_param->frameNumThreads; i++)
+ {
+ if (!m_frameEncoder[i].init(this, numRows, numCols, i))
+ {
+ x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
+ m_aborted = true;
+ }
+ }
}
+ if (m_param->bEmitHRDSEI)
+ m_rateControl->initHRD(&m_sps);
+ if (!m_rateControl->init(&m_sps))
+ m_aborted = true;
+
+ m_lookahead->init();
+
+ if (m_param->analysisMode)
+ {
+ const char* name = m_param->analysisFileName;
+ if (!name)
+ name = defaultAnalysisFileName;
+ const char* mode = m_param->analysisMode == X265_ANALYSIS_LOAD ? "rb" : "wb";
+ m_analysisFile = fopen(name, mode);
+ if (!m_analysisFile)
+ {
+ x265_log(NULL, X265_LOG_ERROR, "Analysis load/save: failed to open file %s\n", name);
+ m_aborted = true;
+ }
+ }
+
+ m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1;
+
m_aborted |= parseLambdaFile(m_param);
+
+ m_encodeStartTime = x265_mdate();
}
void Encoder::destroy()
@@ -250,10 +300,7 @@ void Encoder::destroy()
delete [] m_threadLocalData;
if (m_lookahead)
- {
- m_lookahead->destroy();
- delete m_lookahead;
- }
+ m_lookahead->stop();
delete m_dpb;
if (m_rateControl)
@@ -261,15 +308,26 @@ void Encoder::destroy()
m_rateControl->destroy();
delete m_rateControl;
}
+
// thread pool release should always happen last
if (m_threadPool)
m_threadPool->release();
+ if (m_lookahead)
+ {
+ m_lookahead->destroy();
+ delete m_lookahead;
+ }
+
X265_FREE(m_cuOffsetY);
X265_FREE(m_cuOffsetC);
X265_FREE(m_buOffsetY);
X265_FREE(m_buOffsetC);
+ if (m_analysisFile)
+ fclose(m_analysisFile);
+ free(m_param->analysisFileName);
+ free(m_param->csvfn);
if (m_csvfpt)
fclose(m_csvfpt);
free(m_param->rc.statFileName); // alloc'd by strdup
@@ -277,29 +335,6 @@ void Encoder::destroy()
X265_FREE(m_param);
}
-void Encoder::init()
-{
- if (m_frameEncoder)
- {
- int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
- int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
- for (int i = 0; i < m_param->frameNumThreads; i++)
- {
- if (!m_frameEncoder[i].init(this, numRows, numCols, i))
- {
- x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
- m_aborted = true;
- }
- }
- }
- if (m_param->bEmitHRDSEI)
- m_rateControl->initHRD(&m_sps);
- if (!m_rateControl->init(&m_sps))
- m_aborted = true;
- m_lookahead->init();
- m_encodeStartTime = x265_mdate();
-}
-
void Encoder::updateVbvPlan(RateControl* rc)
{
for (int i = 0; i < m_param->frameNumThreads; i++)
@@ -367,14 +402,14 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
* allocated by this top level encoder */
if (m_cuOffsetY)
{
- inFrame->m_origPicYuv->m_cuOffsetC = m_cuOffsetC;
- inFrame->m_origPicYuv->m_cuOffsetY = m_cuOffsetY;
- inFrame->m_origPicYuv->m_buOffsetC = m_buOffsetC;
- inFrame->m_origPicYuv->m_buOffsetY = m_buOffsetY;
+ inFrame->m_fencPic->m_cuOffsetC = m_cuOffsetC;
+ inFrame->m_fencPic->m_cuOffsetY = m_cuOffsetY;
+ inFrame->m_fencPic->m_buOffsetC = m_buOffsetC;
+ inFrame->m_fencPic->m_buOffsetY = m_buOffsetY;
}
else
{
- if (!inFrame->m_origPicYuv->createOffsets(m_sps))
+ if (!inFrame->m_fencPic->createOffsets(m_sps))
{
m_aborted = true;
x265_log(m_param, X265_LOG_ERROR, "memory allocation failure, aborting encode\n");
@@ -384,10 +419,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
}
else
{
- m_cuOffsetC = inFrame->m_origPicYuv->m_cuOffsetC;
- m_cuOffsetY = inFrame->m_origPicYuv->m_cuOffsetY;
- m_buOffsetC = inFrame->m_origPicYuv->m_buOffsetC;
- m_buOffsetY = inFrame->m_origPicYuv->m_buOffsetY;
+ m_cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
+ m_cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
+ m_buOffsetC = inFrame->m_fencPic->m_buOffsetC;
+ m_buOffsetY = inFrame->m_fencPic->m_buOffsetY;
}
}
}
@@ -405,9 +440,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
/* Copy input picture into a Frame and PicYuv, send to lookahead */
inFrame->m_poc = ++m_pocLast;
- inFrame->m_origPicYuv->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
- inFrame->m_intraData = pic_in->analysisData.intraData;
- inFrame->m_interData = pic_in->analysisData.interData;
+ inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
+
inFrame->m_userData = pic_in->userData;
inFrame->m_pts = pic_in->pts;
inFrame->m_forceqp = pic_in->forceqp;
@@ -431,11 +465,31 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
}
}
else
+ {
+ ProfileScopeEvent(prelookahead);
m_rateControl->calcAdaptiveQuantFrame(inFrame);
+ }
}
/* Use the frame types from the first pass, if available */
int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : pic_in->sliceType;
+
+ /* In analysisSave mode, x265_analysis_data is allocated in pic_in and inFrame points to this */
+ /* Load analysis data before lookahead->addPicture, since sliceType has been decided */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+ {
+ x265_picture* inputPic = const_cast<x265_picture*>(pic_in);
+ /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */
+ readAnalysisFile(&inputPic->analysisData, inFrame->m_poc);
+ inFrame->m_analysisData.poc = inFrame->m_poc;
+ inFrame->m_analysisData.sliceType = inputPic->analysisData.sliceType;
+ inFrame->m_analysisData.numCUsInFrame = inputPic->analysisData.numCUsInFrame;
+ inFrame->m_analysisData.numPartitions = inputPic->analysisData.numPartitions;
+ inFrame->m_analysisData.interData = inputPic->analysisData.interData;
+ inFrame->m_analysisData.intraData = inputPic->analysisData.intraData;
+ sliceType = inputPic->analysisData.sliceType;
+ }
+
m_lookahead->addPicture(inFrame, sliceType);
m_numDelayedPic++;
}
@@ -446,146 +500,185 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
int ret = 0;
- // getEncodedPicture() should block until the FrameEncoder has completed
- // encoding the frame. This is how back-pressure through the API is
- // accomplished when the encoder is full.
- Frame *outFrame = curEncoder->getEncodedPicture(m_nalList);
-
- if (outFrame)
+ /* Normal operation is to wait for the current frame encoder to complete its current frame
+ * and then to give it a new frame to work on. In zero-latency mode, we must encode this
+ * input picture before returning so the order must be reversed. This do/while() loop allows
+ * us to alternate the order of the calls without ugly code replication */
+ Frame* outFrame = NULL;
+ Frame* frameEnc = NULL;
+ int pass = 0;
+ do
{
- Slice *slice = outFrame->m_encData->m_slice;
- if (pic_out)
+ /* getEncodedPicture() should block until the FrameEncoder has completed
+ * encoding the frame. This is how back-pressure through the API is
+ * accomplished when the encoder is full */
+ if (!m_bZeroLatency || pass)
+ outFrame = curEncoder->getEncodedPicture(m_nalList);
+ if (outFrame)
{
- PicYuv *recpic = outFrame->m_reconPicYuv;
- pic_out->poc = slice->m_poc;
- pic_out->bitDepth = X265_DEPTH;
- pic_out->userData = outFrame->m_userData;
- pic_out->colorSpace = m_param->internalCsp;
+ Slice *slice = outFrame->m_encData->m_slice;
- pic_out->pts = outFrame->m_pts;
- pic_out->dts = outFrame->m_dts;
+ /* Free up pic_in->analysisData since it has already been used */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+ freeAnalysis(&outFrame->m_analysisData);
- switch (slice->m_sliceType)
+ if (pic_out)
{
- case I_SLICE:
- pic_out->sliceType = outFrame->m_lowres.bKeyframe ? X265_TYPE_IDR : X265_TYPE_I;
- break;
- case P_SLICE:
- pic_out->sliceType = X265_TYPE_P;
- break;
- case B_SLICE:
- pic_out->sliceType = X265_TYPE_B;
- break;
- }
+ PicYuv *recpic = outFrame->m_reconPic;
+ pic_out->poc = slice->m_poc;
+ pic_out->bitDepth = X265_DEPTH;
+ pic_out->userData = outFrame->m_userData;
+ pic_out->colorSpace = m_param->internalCsp;
- pic_out->planes[0] = recpic->m_picOrg[0];
- pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel));
- pic_out->planes[1] = recpic->m_picOrg[1];
- pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel));
- pic_out->planes[2] = recpic->m_picOrg[2];
- pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel));
- }
+ pic_out->pts = outFrame->m_pts;
+ pic_out->dts = outFrame->m_dts;
- if (m_param->analysisMode)
- {
- pic_out->analysisData.interData = outFrame->m_interData;
- pic_out->analysisData.intraData = outFrame->m_intraData;
- pic_out->analysisData.numCUsInFrame = slice->m_sps->numCUsInFrame;
- pic_out->analysisData.numPartitions = slice->m_sps->numPartitions;
- }
+ switch (slice->m_sliceType)
+ {
+ case I_SLICE:
+ pic_out->sliceType = outFrame->m_lowres.bKeyframe ? X265_TYPE_IDR : X265_TYPE_I;
+ break;
+ case P_SLICE:
+ pic_out->sliceType = X265_TYPE_P;
+ break;
+ case B_SLICE:
+ pic_out->sliceType = X265_TYPE_B;
+ break;
+ }
- if (slice->m_sliceType == P_SLICE)
- {
- if (slice->m_weightPredTable[0][0][0].bPresentFlag)
- m_numLumaWPFrames++;
- if (slice->m_weightPredTable[0][0][1].bPresentFlag ||
- slice->m_weightPredTable[0][0][2].bPresentFlag)
- m_numChromaWPFrames++;
- }
- else if (slice->m_sliceType == B_SLICE)
- {
- bool bLuma = false, bChroma = false;
- for (int l = 0; l < 2; l++)
+ pic_out->planes[0] = recpic->m_picOrg[0];
+ pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel));
+ pic_out->planes[1] = recpic->m_picOrg[1];
+ pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel));
+ pic_out->planes[2] = recpic->m_picOrg[2];
+ pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel));
+
+ /* Dump analysis data from pic_out to file in save mode and free */
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+ {
+ pic_out->analysisData.poc = pic_out->poc;
+ pic_out->analysisData.sliceType = pic_out->sliceType;
+ pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
+ pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
+ pic_out->analysisData.interData = outFrame->m_analysisData.interData;
+ pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
+ writeAnalysisFile(&pic_out->analysisData);
+ freeAnalysis(&pic_out->analysisData);
+ }
+ }
+ if (slice->m_sliceType == P_SLICE)
{
- if (slice->m_weightPredTable[l][0][0].bPresentFlag)
- bLuma = true;
- if (slice->m_weightPredTable[l][0][1].bPresentFlag ||
- slice->m_weightPredTable[l][0][2].bPresentFlag)
- bChroma = true;
+ if (slice->m_weightPredTable[0][0][0].bPresentFlag)
+ m_numLumaWPFrames++;
+ if (slice->m_weightPredTable[0][0][1].bPresentFlag ||
+ slice->m_weightPredTable[0][0][2].bPresentFlag)
+ m_numChromaWPFrames++;
}
+ else if (slice->m_sliceType == B_SLICE)
+ {
+ bool bLuma = false, bChroma = false;
+ for (int l = 0; l < 2; l++)
+ {
+ if (slice->m_weightPredTable[l][0][0].bPresentFlag)
+ bLuma = true;
+ if (slice->m_weightPredTable[l][0][1].bPresentFlag ||
+ slice->m_weightPredTable[l][0][2].bPresentFlag)
+ bChroma = true;
+ }
- if (bLuma)
- m_numLumaWPBiFrames++;
- if (bChroma)
- m_numChromaWPBiFrames++;
- }
- if (m_aborted)
- return -1;
+ if (bLuma)
+ m_numLumaWPBiFrames++;
+ if (bChroma)
+ m_numChromaWPBiFrames++;
+ }
- finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits);
- // Allow this frame to be recycled if no frame encoders are using it for reference
- if (!pic_out)
- {
- ATOMIC_DEC(&outFrame->m_countRefEncoders);
- m_dpb->recycleUnreferenced();
- }
- else
- m_exportedPic = outFrame;
+ if (m_aborted)
+ return -1;
- m_numDelayedPic--;
+ finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits);
- ret = 1;
- }
+ /* Allow this frame to be recycled if no frame encoders are using it for reference */
+ if (!pic_out)
+ {
+ ATOMIC_DEC(&outFrame->m_countRefEncoders);
+ m_dpb->recycleUnreferenced();
+ }
+ else
+ m_exportedPic = outFrame;
- // pop a single frame from decided list, then provide to frame encoder
- // curEncoder is guaranteed to be idle at this point
- Frame* frameEnc = m_lookahead->getDecidedPicture();
- if (frameEnc)
- {
- // give this picture a FrameData instance before encoding
- if (m_dpb->m_picSymFreeList)
- {
- frameEnc->m_encData = m_dpb->m_picSymFreeList;
- m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
- frameEnc->reinit(m_sps);
- }
- else
- {
- frameEnc->allocEncodeData(m_param, m_sps);
- Slice* slice = frameEnc->m_encData->m_slice;
- slice->m_sps = &m_sps;
- slice->m_pps = &m_pps;
- slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
- slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS);
- frameEnc->m_reconPicYuv->m_cuOffsetC = m_cuOffsetC;
- frameEnc->m_reconPicYuv->m_cuOffsetY = m_cuOffsetY;
- frameEnc->m_reconPicYuv->m_buOffsetC = m_buOffsetC;
- frameEnc->m_reconPicYuv->m_buOffsetY = m_buOffsetY;
+ m_numDelayedPic--;
+
+ ret = 1;
}
- curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
- if (m_bframeDelay)
+
+ /* pop a single frame from decided list, then provide to frame encoder
+ * curEncoder is guaranteed to be idle at this point */
+ if (!pass)
+ frameEnc = m_lookahead->getDecidedPicture();
+ if (frameEnc && !pass)
{
- int64_t *prevReorderedPts = m_prevReorderedPts;
- frameEnc->m_dts = m_encodedFrameNum > m_bframeDelay
- ? prevReorderedPts[(m_encodedFrameNum - m_bframeDelay) % m_bframeDelay]
- : frameEnc->m_reorderedPts - m_bframeDelayTime;
- prevReorderedPts[m_encodedFrameNum % m_bframeDelay] = frameEnc->m_reorderedPts;
- }
- else
- frameEnc->m_dts = frameEnc->m_reorderedPts;
+ /* give this frame a FrameData instance before encoding */
+ if (m_dpb->m_picSymFreeList)
+ {
+ frameEnc->m_encData = m_dpb->m_picSymFreeList;
+ m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
+ frameEnc->reinit(m_sps);
+ }
+ else
+ {
+ frameEnc->allocEncodeData(m_param, m_sps);
+ Slice* slice = frameEnc->m_encData->m_slice;
+ slice->m_sps = &m_sps;
+ slice->m_pps = &m_pps;
+ slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
+ slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS);
+ frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC;
+ frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY;
+ frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC;
+ frameEnc->m_reconPic->m_buOffsetY = m_buOffsetY;
+ }
+
+ curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
+ if (m_bframeDelay)
+ {
+ int64_t *prevReorderedPts = m_prevReorderedPts;
+ frameEnc->m_dts = m_encodedFrameNum > m_bframeDelay
+ ? prevReorderedPts[(m_encodedFrameNum - m_bframeDelay) % m_bframeDelay]
+ : frameEnc->m_reorderedPts - m_bframeDelayTime;
+ prevReorderedPts[m_encodedFrameNum % m_bframeDelay] = frameEnc->m_reorderedPts;
+ }
+ else
+ frameEnc->m_dts = frameEnc->m_reorderedPts;
+
+ /* Allocate analysis data before encode in save mode. This is allocated in frameEnc */
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+ {
+ x265_analysis_data* analysis = &frameEnc->m_analysisData;
+ analysis->poc = frameEnc->m_poc;
+ analysis->sliceType = frameEnc->m_lowres.sliceType;
+ uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
+ uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
+
+ uint32_t numCUsInFrame = widthInCU * heightInCU;
+ analysis->numCUsInFrame = numCUsInFrame;
+ analysis->numPartitions = NUM_CU_PARTITIONS;
+ allocAnalysis(analysis);
+ }
- // determine references, setup RPS, etc
- m_dpb->prepareEncode(frameEnc);
+ /* determine references, setup RPS, etc */
+ m_dpb->prepareEncode(frameEnc);
- if (m_param->rc.rateControlMode != X265_RC_CQP)
- m_lookahead->getEstimatedPictureCost(frameEnc);
+ if (m_param->rc.rateControlMode != X265_RC_CQP)
+ m_lookahead->getEstimatedPictureCost(frameEnc);
- // Allow FrameEncoder::compressFrame() to start in the frame encoder thread
- if (!curEncoder->startCompressFrame(frameEnc))
- m_aborted = true;
+ /* Allow FrameEncoder::compressFrame() to start in the frame encoder thread */
+ if (!curEncoder->startCompressFrame(frameEnc))
+ m_aborted = true;
+ }
+ else if (m_encodedFrameNum)
+ m_rateControl->setFinalFrameCount(m_encodedFrameNum);
}
- else if (m_encodedFrameNum)
- m_rateControl->setFinalFrameCount(m_encodedFrameNum);
+ while (m_bZeroLatency && ++pass < 2);
return ret;
}
@@ -890,7 +983,7 @@ void Encoder::writeLog(int argc, char **argv)
{
if (m_csvfpt)
{
- if (m_param->logLevel >= X265_LOG_DEBUG)
+ if (m_param->logLevel >= X265_LOG_FRAME)
{
// adding summary to a per-frame csv log file needs a summary header
fprintf(m_csvfpt, "\nSummary\n");
@@ -965,7 +1058,7 @@ static const char*digestToString(const unsigned char digest[3][16], int numChar)
void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64_t bits)
{
- PicYuv* reconPic = curFrame->m_reconPicYuv;
+ PicYuv* reconPic = curFrame->m_reconPic;
//===== calculate PSNR =====
int width = reconPic->m_picWidth - m_sps.conformanceWindow.rightOffset;
@@ -1029,14 +1122,14 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64
m_analyzeB.addSsim(ssim);
}
- // if debug log level is enabled, per frame logging is performed
+ char c = (slice->isIntra() ? 'I' : slice->isInterP() ? 'P' : 'B');
+ int poc = slice->m_poc;
+ if (!IS_REFERENCED(curFrame))
+ c += 32; // lower case if unreferenced
+
+ // if debug log level is enabled, per frame console logging is performed
if (m_param->logLevel >= X265_LOG_DEBUG)
{
- char c = (slice->isIntra() ? 'I' : slice->isInterP() ? 'P' : 'B');
- int poc = slice->m_poc;
- if (!IS_REFERENCED(curFrame))
- c += 32; // lower case if unreferenced
-
char buf[1024];
int p;
p = sprintf(buf, "POC:%d %c QP %2.2lf(%d) %10d bits", poc, c, curEncData.m_avgQpAq, slice->m_sliceQp, (int)bits);
@@ -1063,43 +1156,6 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64
}
}
- // per frame CSV logging if the file handle is valid
- if (m_csvfpt)
- {
- fprintf(m_csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d,", m_outputCount++, c, poc, curEncData.m_avgQpAq, (int)bits);
- if (m_param->rc.rateControlMode == X265_RC_CRF)
- fprintf(m_csvfpt, "%.3lf,", curEncData.m_rateFactor);
- double psnr = (psnrY * 6 + psnrU + psnrV) / 8;
- if (m_param->bEnablePsnr)
- fprintf(m_csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", psnrY, psnrU, psnrV, psnr);
- else
- fprintf(m_csvfpt, " -, -, -, -,");
- if (m_param->bEnableSsim)
- fprintf(m_csvfpt, " %.6f, %6.3f,", ssim, x265_ssim2dB(ssim));
- else
- fprintf(m_csvfpt, " -, -,");
- fprintf(m_csvfpt, " %.3lf, %.3lf", curEncoder->m_frameTime, curEncoder->m_elapsedCompressTime);
- if (!slice->isIntra())
- {
- int numLists = slice->isInterP() ? 1 : 2;
- for (int list = 0; list < numLists; list++)
- {
- fprintf(m_csvfpt, ", ");
- for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
- {
- int k = slice->m_refPOCList[list][ref] - slice->m_lastIDR;
- fprintf(m_csvfpt, " %d", k);
- }
- }
-
- if (numLists == 1)
- fprintf(m_csvfpt, ", -");
- }
- else
- fprintf(m_csvfpt, ", -, -");
- fprintf(m_csvfpt, "\n");
- }
-
if (m_param->decodedPictureHashSEI && m_param->logLevel >= X265_LOG_FULL)
{
const char* digestStr = NULL;
@@ -1119,7 +1175,60 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64
p += sprintf(buf + p, " [Checksum:%s]", digestStr);
}
}
+
x265_log(m_param, X265_LOG_DEBUG, "%s\n", buf);
+ }
+
+ if (m_param->logLevel >= X265_LOG_FRAME && m_csvfpt)
+ {
+ // per frame CSV logging if the file handle is valid
+ fprintf(m_csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d,", m_outputCount++, c, poc, curEncData.m_avgQpAq, (int)bits);
+ if (m_param->rc.rateControlMode == X265_RC_CRF)
+ fprintf(m_csvfpt, "%.3lf,", curEncData.m_rateFactor);
+ double psnr = (psnrY * 6 + psnrU + psnrV) / 8;
+ if (m_param->bEnablePsnr)
+ fprintf(m_csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", psnrY, psnrU, psnrV, psnr);
+ else
+ fputs(" -, -, -, -,", m_csvfpt);
+ if (m_param->bEnableSsim)
+ fprintf(m_csvfpt, " %.6f, %6.3f", ssim, x265_ssim2dB(ssim));
+ else
+ fputs(" -, -", m_csvfpt);
+ if (slice->isIntra())
+ fputs(", -, -", m_csvfpt);
+ else
+ {
+ int numLists = slice->isInterP() ? 1 : 2;
+ for (int list = 0; list < numLists; list++)
+ {
+ fprintf(m_csvfpt, ", ");
+ for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
+ {
+ int k = slice->m_refPOCList[list][ref] - slice->m_lastIDR;
+ fprintf(m_csvfpt, " %d", k);
+ }
+ }
+
+ if (numLists == 1)
+ fputs(", -", m_csvfpt);
+ }
+
+#define ELAPSED_MSEC(start, end) (((double)(end) - (start)) / 1000)
+
+ // detailed frame statistics
+ fprintf(m_csvfpt, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf",
+ ELAPSED_MSEC(0, curEncoder->m_slicetypeWaitTime),
+ ELAPSED_MSEC(curEncoder->m_startCompressTime, curEncoder->m_row0WaitTime),
+ ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_endCompressTime),
+ ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_allRowsAvailableTime),
+ ELAPSED_MSEC(0, curEncoder->m_totalWorkerElapsedTime),
+ ELAPSED_MSEC(0, curEncoder->m_totalNoWorkerTime));
+ if (curEncoder->m_totalActiveWorkerCount)
+ fprintf(m_csvfpt, ", %.3lf", (double)curEncoder->m_totalActiveWorkerCount / curEncoder->m_activeWorkerCountSamples);
+ else
+ fputs(", 1", m_csvfpt);
+ fprintf(m_csvfpt, ", %d", curEncoder->m_countRowBlocks);
+ fprintf(m_csvfpt, "\n");
fflush(stderr);
}
}
@@ -1159,7 +1268,7 @@ void Encoder::getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs)
if (buffer)
{
sprintf(buffer, "x265 (build %d) - %s:%s - H.265/HEVC codec - "
- "Copyright 2013-2014 (c) Multicoreware Inc - "
+ "Copyright 2013-2015 (c) Multicoreware Inc - "
"http://x265.org - options: %s",
X265_BUILD, x265_version_str, x265_build_info_str, opts);
@@ -1197,7 +1306,7 @@ void Encoder::initSPS(SPS *sps)
m_vps.ptl.progressiveSourceFlag = !m_param->interlaceMode;
m_vps.ptl.interlacedSourceFlag = !!m_param->interlaceMode;
m_vps.ptl.nonPackedConstraintFlag = false;
- m_vps.ptl.frameOnlyConstraintFlag = false;
+ m_vps.ptl.frameOnlyConstraintFlag = !m_param->interlaceMode;
sps->conformanceWindow = m_conformanceWindow;
sps->chromaFormatIdc = m_param->internalCsp;
@@ -1224,6 +1333,7 @@ void Encoder::initSPS(SPS *sps)
sps->maxDecPicBuffering = m_vps.maxDecPicBuffering;
sps->numReorderPics = m_vps.numReorderPics;
+ sps->maxLatencyIncrease = m_param->bframes;
sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing;
sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp;
@@ -1280,8 +1390,8 @@ void Encoder::initPPS(PPS *pps)
pps->maxCuDQPDepth = 0;
}
- pps->chromaCbQpOffset = m_param->cbQpOffset;
- pps->chromaCrQpOffset = m_param->crQpOffset;
+ pps->chromaQpOffset[0] = m_param->cbQpOffset;
+ pps->chromaQpOffset[1] = m_param->crQpOffset;
pps->bConstrainedIntraPred = m_param->bEnableConstrainedIntra;
pps->bUseWeightPred = m_param->bEnableWeightedPred;
@@ -1290,13 +1400,10 @@ void Encoder::initPPS(PPS *pps)
pps->bTransformSkipEnabled = m_param->bEnableTransformSkip;
pps->bSignHideEnabled = m_param->bEnableSignHiding;
- /* If offsets are ever configured, enable bDeblockingFilterControlPresent and set
- * deblockingFilterBetaOffsetDiv2 / deblockingFilterTcOffsetDiv2 */
- bool bDeblockOffsetInPPS = 0;
- pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || bDeblockOffsetInPPS;
+ pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || m_param->deblockingFilterBetaOffset || m_param->deblockingFilterTCOffset;
pps->bPicDisableDeblockingFilter = !m_param->bEnableLoopFilter;
- pps->deblockingFilterBetaOffsetDiv2 = 0;
- pps->deblockingFilterTcOffsetDiv2 = 0;
+ pps->deblockingFilterBetaOffsetDiv2 = m_param->deblockingFilterBetaOffset;
+ pps->deblockingFilterTcOffsetDiv2 = m_param->deblockingFilterTCOffset;
pps->bEntropyCodingSyncEnabled = m_param->bEnableWavefront;
}
@@ -1330,12 +1437,14 @@ void Encoder::configure(x265_param *p)
p->bBPyramid = 0;
/* Disable features which are not supported by the current RD level */
- if (p->rdLevel < 4)
+ if (p->rdLevel < 5)
{
- if (p->psyRdoq > 0) /* impossible */
- x265_log(p, X265_LOG_WARNING, "--psy-rdoq disabled, requires --rdlevel 4 or higher\n");
- p->psyRdoq = 0;
+ if (p->bEnableCbfFastMode) /* impossible */
+ x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n");
+ p->bEnableCbfFastMode = 0;
}
+ if (p->rdLevel < 4)
+ p->psyRdoq = 0; /* impossible */
if (p->rdLevel < 3)
{
if (p->bCULossless) /* impossible */
@@ -1350,9 +1459,7 @@ void Encoder::configure(x265_param *p)
x265_log(p, X265_LOG_WARNING, "--pmode disabled, requires --rdlevel 2 or higher\n");
p->bDistributeModeAnalysis = 0;
- if (p->psyRd > 0) /* impossible */
- x265_log(p, X265_LOG_WARNING, "--psy-rd disabled, requires --rdlevel 2 or higher\n");
- p->psyRd = 0;
+ p->psyRd = 0; /* impossible */
if (p->bEnableRectInter) /* broken, not very useful */
x265_log(p, X265_LOG_WARNING, "--rect disabled, requires --rdlevel 2 or higher\n");
@@ -1403,11 +1510,8 @@ void Encoder::configure(x265_param *p)
if (p->rc.aqMode == X265_AQ_NONE && p->rc.cuTree == 0)
p->rc.aqStrength = 0;
- if (p->internalCsp != X265_CSP_I420)
- {
- x265_log(p, X265_LOG_WARNING, "!! HEVC Range Extension specifications are not finalized !!\n");
- x265_log(p, X265_LOG_WARNING, "!! This output bitstream may not be compliant with the final spec !!\n");
- }
+ if (p->totalFrames <= 2 * ((float)p->fpsNum) / p->fpsDenom && p->rc.bStrictCbr)
+ p->lookaheadDepth = p->totalFrames;
if (p->scalingLists && p->internalCsp == X265_CSP_I444)
{
@@ -1424,6 +1528,12 @@ void Encoder::configure(x265_param *p)
p->rc.rfConstantMin = 0;
}
+ if (p->analysisMode && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation))
+ {
+ x265_log(p, X265_LOG_ERROR, "Analysis load/save options incompatible with pmode/pme");
+ p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0;
+ }
+
m_bframeDelay = p->bframes ? (p->bBPyramid ? 2 : 1) : 0;
p->bFrameBias = X265_MIN(X265_MAX(-90, p->bFrameBias), 100);
@@ -1458,35 +1568,212 @@ void Encoder::configure(x265_param *p)
x265_log(p, X265_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s);
}
- //========= set default display window ==================================
+ /* initialize the conformance window */
m_conformanceWindow.bEnabled = false;
m_conformanceWindow.rightOffset = 0;
m_conformanceWindow.topOffset = 0;
m_conformanceWindow.bottomOffset = 0;
m_conformanceWindow.leftOffset = 0;
- //======== set pad size if width is not multiple of the minimum CU size =========
- const uint32_t minCUSize = MIN_CU_SIZE;
- if (p->sourceWidth & (minCUSize - 1))
+ /* set pad size if width is not multiple of the minimum CU size */
+ if (p->sourceWidth & (MIN_CU_SIZE - 1))
{
- uint32_t rem = p->sourceWidth & (minCUSize - 1);
- uint32_t padsize = minCUSize - rem;
+ uint32_t rem = p->sourceWidth & (MIN_CU_SIZE - 1);
+ uint32_t padsize = MIN_CU_SIZE - rem;
p->sourceWidth += padsize;
- /* set the confirmation window offsets */
m_conformanceWindow.bEnabled = true;
m_conformanceWindow.rightOffset = padsize;
}
- //======== set pad size if height is not multiple of the minimum CU size =========
- if (p->sourceHeight & (minCUSize - 1))
+ /* set pad size if height is not multiple of the minimum CU size */
+ if (p->sourceHeight & (MIN_CU_SIZE - 1))
{
- uint32_t rem = p->sourceHeight & (minCUSize - 1);
- uint32_t padsize = minCUSize - rem;
+ uint32_t rem = p->sourceHeight & (MIN_CU_SIZE - 1);
+ uint32_t padsize = MIN_CU_SIZE - rem;
p->sourceHeight += padsize;
- /* set the confirmation window offsets */
m_conformanceWindow.bEnabled = true;
m_conformanceWindow.bottomOffset = padsize;
}
+ if (p->bDistributeModeAnalysis && p->analysisMode)
+ {
+ p->analysisMode = X265_ANALYSIS_OFF;
+ x265_log(p, X265_LOG_WARNING, "Analysis save and load mode not supported for distributed mode analysis\n");
+ }
+}
+
+void Encoder::allocAnalysis(x265_analysis_data* analysis)
+{
+ analysis->interData = analysis->intraData = NULL;
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
+ CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
+ CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+ CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+ CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
+ analysis->intraData = intraData;
+ }
+ else
+ {
+ analysis_inter_data *interData = (analysis_inter_data*)analysis->interData;
+ CHECKED_MALLOC_ZERO(interData, analysis_inter_data, 1);
+ CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2);
+ CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+ CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
+ analysis->interData = interData;
+ }
+ return;
+
+fail:
+ freeAnalysis(analysis);
+ m_aborted = true;
+}
+
+void Encoder::freeAnalysis(x265_analysis_data* analysis)
+{
+ if (analysis->intraData)
+ {
+ X265_FREE(((analysis_intra_data*)analysis->intraData)->depth);
+ X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
+ X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes);
+ X265_FREE(analysis->intraData);
+ }
+ else
+ {
+ X265_FREE(((analysis_inter_data*)analysis->interData)->ref);
+ X265_FREE(((analysis_inter_data*)analysis->interData)->depth);
+ X265_FREE(((analysis_inter_data*)analysis->interData)->modes);
+ X265_FREE(analysis->interData);
+ }
+}
+
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc)
+{
+
+#define X265_FREAD(val, size, readSize, fileOffset)\
+ if (fread(val, size, readSize, fileOffset) != readSize)\
+ {\
+ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
+ freeAnalysis(analysis);\
+ m_aborted = true;\
+ return;\
+ }\
+
+ static uint64_t consumedBytes = 0;
+ static uint64_t totalConsumedBytes = 0;
+ fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET);
+
+ int poc; uint32_t frameRecordSize;
+ X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+ X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
+
+ uint64_t currentOffset = totalConsumedBytes;
+
+ /* Seeking to the right frame Record */
+ while (poc != curPoc && !feof(m_analysisFile))
+ {
+ currentOffset += frameRecordSize;
+ fseeko(m_analysisFile, currentOffset, SEEK_SET);
+ X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+ X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
+ }
+
+ if (poc != curPoc || feof(m_analysisFile))
+ {
+ x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
+ freeAnalysis(analysis);
+ return;
+ }
+
+ /* Now arrived at the right frame, read the record */
+ analysis->poc = poc;
+ analysis->frameRecordSize = frameRecordSize;
+ X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
+ X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
+ X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
+
+ /* Memory is allocated for inter and intra analysis data based on the slicetype */
+ allocAnalysis(analysis);
+
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ X265_FREAD(((analysis_intra_data *)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FREAD(((analysis_intra_data *)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ analysis->sliceType = X265_TYPE_I;
+ consumedBytes += frameRecordSize;
+ }
+ else if (analysis->sliceType == X265_TYPE_P)
+ {
+ X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
+ X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ consumedBytes += frameRecordSize;
+ totalConsumedBytes = consumedBytes;
+ }
+ else
+ {
+ X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
+ X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ consumedBytes += frameRecordSize;
+ }
+#undef X265_FREAD
+}
+
+void Encoder::writeAnalysisFile(x265_analysis_data* analysis)
+{
+
+#define X265_FWRITE(val, size, writeSize, fileOffset)\
+ if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
+ {\
+ x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n");\
+ freeAnalysis(analysis);\
+ m_aborted = true;\
+ return;\
+ }\
+
+ /* calculate frameRecordSize */
+ analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
+ sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions);
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 3;
+ else if (analysis->sliceType == X265_TYPE_P)
+ {
+ analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
+ analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2;
+ }
+ else
+ {
+ analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
+ analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2;
+ }
+
+ X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
+ X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile);
+ X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
+ X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
+ X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
+
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
+ {
+ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ }
+ else if (analysis->sliceType == X265_TYPE_P)
+ {
+ X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
+ X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ }
+ else
+ {
+ X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
+ X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
+ }
+#undef X265_FWRITE
}
diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h
index 8a387c2..f1d73a3 100644
--- a/source/encoder/encoder.h
+++ b/source/encoder/encoder.h
@@ -74,7 +74,7 @@ struct ThreadLocalData;
class Encoder : public x265_encoder
{
-private:
+public:
int m_pocLast; // time index (POC)
int m_encodedFrameNum;
@@ -113,9 +113,7 @@ private:
int m_numChromaWPFrames; // number of P frames with weighted chroma reference
int m_numLumaWPBiFrames; // number of B frames with weighted luma reference
int m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
-
-public:
-
+ FILE* m_analysisFile;
int m_conformanceMode;
VPS m_vps;
SPS m_sps;
@@ -133,15 +131,14 @@ public:
Lookahead* m_lookahead;
Window m_conformanceWindow;
+ bool m_bZeroLatency; // x265_encoder_encode() returns NALs for the input picture, zero lag
bool m_aborted; // fatal error detected
Encoder();
-
~Encoder() {}
void create();
void destroy();
- void init();
int encode(const x265_picture* pic, x265_picture *pic_out);
@@ -163,12 +160,20 @@ public:
void updateVbvPlan(RateControl* rc);
+ void allocAnalysis(x265_analysis_data* analysis);
+
+ void freeAnalysis(x265_analysis_data* analysis);
+
+ void readAnalysisFile(x265_analysis_data* analysis, int poc);
+
+ void writeAnalysisFile(x265_analysis_data* pic);
+
+ void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
+
protected:
void initSPS(SPS *sps);
void initPPS(PPS *pps);
-
- void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
};
}
diff --git a/source/encoder/entropy.cpp b/source/encoder/entropy.cpp
index 13eaf57..b63ad7b 100644
--- a/source/encoder/entropy.cpp
+++ b/source/encoder/entropy.cpp
@@ -103,7 +103,7 @@ void Entropy::codeSPS(const SPS& sps, const ScalingList& scalingList, const Prof
WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
WRITE_UVLC(sps.numReorderPics, "sps_num_reorder_pics[i]");
- WRITE_UVLC(0, "sps_max_latency_increase_plus1[i]");
+ WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
WRITE_UVLC(sps.log2MinCodingBlockSize - 3, "log2_min_coding_block_size_minus3");
WRITE_UVLC(sps.log2DiffMaxMinCodingBlockSize, "log2_diff_max_min_coding_block_size");
@@ -154,8 +154,8 @@ void Entropy::codePPS(const PPS& pps)
if (pps.bUseDQP)
WRITE_UVLC(pps.maxCuDQPDepth, "diff_cu_qp_delta_depth");
- WRITE_SVLC(pps.chromaCbQpOffset, "pps_cb_qp_offset");
- WRITE_SVLC(pps.chromaCrQpOffset, "pps_cr_qp_offset");
+ WRITE_SVLC(pps.chromaQpOffset[0], "pps_cb_qp_offset");
+ WRITE_SVLC(pps.chromaQpOffset[1], "pps_cr_qp_offset");
WRITE_FLAG(0, "pps_slice_chroma_qp_offsets_present_flag");
WRITE_FLAG(pps.bUseWeightPred, "weighted_pred_flag");
@@ -397,7 +397,9 @@ void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData)
// Ideally this process should not be repeated for each slice in a picture
if (slice.isIRAP())
for (int picIdx = 0; picIdx < slice.m_rps.numberOfPictures; picIdx++)
+ {
X265_CHECK(!slice.m_rps.bUsed[picIdx], "pic unused failure\n");
+ }
#endif
WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
@@ -515,9 +517,9 @@ void Entropy::encodeCTU(const CUData& ctu, const CUGeom& cuGeom)
}
/* encode a CU block recursively */
-void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP)
+void Entropy::encodeCU(const CUData& ctu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP)
{
- const Slice* slice = cu.m_slice;
+ const Slice* slice = ctu.m_slice;
if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
bEncodeDQP = true;
@@ -527,78 +529,124 @@ void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartI
if (!cuUnsplitFlag)
{
- uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2;
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childCuData.flags & CUGeom::PRESENT)
- encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
+ encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP);
}
return;
}
// We need to split, so don't try these modes.
if (cuSplitFlag)
- codeSplitFlag(cu, absPartIdx, depth);
+ codeSplitFlag(ctu, absPartIdx, depth);
- if (depth < cu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
+ if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
{
- uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2;
-
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
{
- const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
+ encodeCU(ctu, childGeom, absPartIdx, depth + 1, bEncodeDQP);
}
return;
}
if (slice->m_pps->bTransquantBypassEnabled)
- codeCUTransquantBypassFlag(cu.m_tqBypass[absPartIdx]);
+ codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]);
if (!slice->isIntra())
- codeSkipFlag(cu, absPartIdx);
-
- if (cu.isSkipped(absPartIdx))
{
- codeMergeIndex(cu, absPartIdx);
- finishCU(cu, absPartIdx, depth);
- return;
+ codeSkipFlag(ctu, absPartIdx);
+ if (ctu.isSkipped(absPartIdx))
+ {
+ codeMergeIndex(ctu, absPartIdx);
+ finishCU(ctu, absPartIdx, depth);
+ return;
+ }
+ codePredMode(ctu.m_predMode[absPartIdx]);
}
- if (!slice->isIntra())
- codePredMode(cu.m_predMode[absPartIdx]);
-
- codePartSize(cu, absPartIdx, depth);
+ codePartSize(ctu, absPartIdx, depth);
// prediction Info ( Intra : direction mode, Inter : Mv, reference idx )
- codePredInfo(cu, absPartIdx);
+ codePredInfo(ctu, absPartIdx);
uint32_t tuDepthRange[2];
- if (cu.isIntra(absPartIdx))
- cu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx);
+ if (ctu.isIntra(absPartIdx))
+ ctu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx);
else
- cu.getInterTUQtDepthRange(tuDepthRange, absPartIdx);
+ ctu.getInterTUQtDepthRange(tuDepthRange, absPartIdx);
// Encode Coefficients, allow codeCoeff() to modify bEncodeDQP
- codeCoeff(cu, absPartIdx, depth, bEncodeDQP, tuDepthRange);
+ codeCoeff(ctu, absPartIdx, bEncodeDQP, tuDepthRange);
// --- write terminating bit ---
- finishCU(cu, absPartIdx, depth);
+ finishCU(ctu, absPartIdx, depth);
+}
+
+/* Return bit count of signaling inter mode */
+uint32_t Entropy::bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const
+{
+ uint32_t bits;
+ bits = bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); /* not skip */
+ bits += bitsCodeBin(0, m_contextState[OFF_PRED_MODE_CTX]); /* inter */
+ PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
+ switch (partSize)
+ {
+ case SIZE_2Nx2N:
+ bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX]);
+ break;
+
+ case SIZE_2NxN:
+ case SIZE_2NxnU:
+ case SIZE_2NxnD:
+ bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]);
+ bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 1]);
+ if (cu.m_slice->m_sps->maxAMPDepth > depth)
+ {
+ bits += bitsCodeBin((partSize == SIZE_2NxN) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]);
+ if (partSize != SIZE_2NxN)
+ bits++; // encodeBinEP((partSize == SIZE_2NxnU ? 0 : 1));
+ }
+ break;
+
+ case SIZE_Nx2N:
+ case SIZE_nLx2N:
+ case SIZE_nRx2N:
+ bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]);
+ bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]);
+ if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3))
+ bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]);
+ if (cu.m_slice->m_sps->maxAMPDepth > depth)
+ {
+ bits += bitsCodeBin((partSize == SIZE_Nx2N) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]);
+ if (partSize != SIZE_Nx2N)
+ bits++; // encodeBinEP((partSize == SIZE_nLx2N ? 0 : 1));
+ }
+ break;
+ default:
+ X265_CHECK(0, "invalid CU partition\n");
+ break;
+ }
+
+ return bits;
}
/* finish encoding a cu and handle end-of-slice conditions */
-void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
+void Entropy::finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth)
{
- const Slice* slice = cu.m_slice;
- X265_CHECK(cu.m_slice->m_endCUAddr == cu.m_slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n");
+ const Slice* slice = ctu.m_slice;
uint32_t realEndAddress = slice->m_endCUAddr;
- uint32_t cuAddr = cu.getSCUAddr() + absPartIdx;
+ uint32_t cuAddr = ctu.getSCUAddr() + absPartIdx;
+ X265_CHECK(realEndAddress == slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n");
uint32_t granularityMask = g_maxCUSize - 1;
- uint32_t cuSize = 1 << cu.m_log2CUSize[absPartIdx];
- uint32_t rpelx = cu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize;
- uint32_t bpely = cu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize;
+ uint32_t cuSize = 1 << ctu.m_log2CUSize[absPartIdx];
+ uint32_t rpelx = ctu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize;
+ uint32_t bpely = ctu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize;
bool granularityBoundary = (((rpelx & granularityMask) == 0 || (rpelx == slice->m_sps->picWidthInLumaSamples )) &&
((bpely & granularityMask) == 0 || (bpely == slice->m_sps->picHeightInLumaSamples)));
@@ -618,41 +666,18 @@ void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
}
}
-void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx,
- uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, bool& bCodeDQP, uint32_t depthRange[2])
+void Entropy::encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+ bool& bCodeDQP, const uint32_t depthRange[2])
{
- const bool subdiv = cu.m_tuDepth[absPartIdx] + cu.m_cuDepth[absPartIdx] > (uint8_t)depth;
- uint32_t hChromaShift = cu.m_hChromaShift;
- uint32_t vChromaShift = cu.m_vChromaShift;
- uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, trIdx);
- uint32_t cbfU = cu.getCbf(absPartIdx, TEXT_CHROMA_U, trIdx);
- uint32_t cbfV = cu.getCbf(absPartIdx, TEXT_CHROMA_V, trIdx);
-
- if (!trIdx)
- state.bakAbsPartIdxCU = absPartIdx;
-
- if (log2TrSize == 2 && cu.m_chromaFormat != X265_CSP_I444)
- {
- uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- if (!(absPartIdx & (partNum - 1)))
- {
- state.bakAbsPartIdx = absPartIdx;
- state.bakChromaOffset = offsetChroma;
- }
- else if ((absPartIdx & (partNum - 1)) == (partNum - 1))
- {
- cbfU = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_U, trIdx);
- cbfV = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_V, trIdx);
- }
- }
+ const bool subdiv = cu.m_tuDepth[absPartIdx] > tuDepth;
/* in each of these conditions, the subdiv flag is implied and not signaled,
* so we have checks to make sure the implied value matches our intentions */
- if (cu.m_predMode[absPartIdx] == MODE_INTRA && cu.m_partSize[absPartIdx] == SIZE_NxN && depth == cu.m_cuDepth[absPartIdx])
+ if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth)
{
X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
}
- else if (cu.m_predMode[absPartIdx] == MODE_INTER && (cu.m_partSize[absPartIdx] != SIZE_2Nx2N) && depth == cu.m_cuDepth[absPartIdx] &&
+ else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth &&
cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
{
X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2TrSize %d, depthRange[0] %d\n", log2TrSize, depthRange[0]);
@@ -671,127 +696,111 @@ void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t
codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
}
- const uint32_t trDepthCurr = depth - cu.m_cuDepth[absPartIdx];
- const bool bFirstCbfOfCU = trDepthCurr == 0;
-
- bool mCodeAll = true;
- const uint32_t numPels = 1 << (log2TrSize * 2 - hChromaShift - vChromaShift);
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
-
- if (bFirstCbfOfCU || mCodeAll)
+ uint32_t hChromaShift = cu.m_hChromaShift;
+ uint32_t vChromaShift = cu.m_vChromaShift;
+ bool bSmallChroma = (log2TrSize - hChromaShift < 2);
+ if (!tuDepth || !bSmallChroma)
{
- uint32_t tuSize = 1 << log2TrSize;
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1))
- codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_U, trDepthCurr, (subdiv == 0));
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1))
- codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_V, trDepthCurr, (subdiv == 0));
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
else
{
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1), "chroma xform size match failure\n");
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1), "chroma xform size match failure\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma xform size match failure\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma xform size match failure\n");
}
if (subdiv)
{
- log2TrSize--;
- uint32_t numCoeff = 1 << (log2TrSize * 2);
- uint32_t numCoeffC = (numCoeff >> (hChromaShift + vChromaShift));
- trIdx++;
- ++depth;
- absPartIdxStep >>= 2;
- const uint32_t partNum = NUM_CU_PARTITIONS >> (depth << 1);
+ --log2TrSize;
+ ++tuDepth;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
- absPartIdx += partNum;
- offsetLuma += numCoeff;
- offsetChroma += numCoeffC;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 0 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 1 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 2 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ encodeTransform(cu, absPartIdx + 3 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange);
+ return;
+ }
- absPartIdx += partNum;
- offsetLuma += numCoeff;
- offsetChroma += numCoeffC;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ uint32_t absPartIdxC = bSmallChroma ? absPartIdx & 0xFC : absPartIdx;
- absPartIdx += partNum;
- offsetLuma += numCoeff;
- offsetChroma += numCoeffC;
- encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange);
+ if (cu.isInter(absPartIdxC) && !tuDepth && !cu.getCbf(absPartIdxC, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdxC, TEXT_CHROMA_V, 0))
+ {
+ X265_CHECK(cu.getCbf(absPartIdxC, TEXT_LUMA, 0), "CBF should have been set\n");
}
else
+ codeQtCbfLuma(cu, absPartIdx, tuDepth);
+
+ uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth);
+ uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, tuDepth);
+ uint32_t cbfV = cu.getCbf(absPartIdxC, TEXT_CHROMA_V, tuDepth);
+ if (!(cbfY || cbfU || cbfV))
+ return;
+
+ // dQP: only for CTU once
+ if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
{
- if (cu.m_predMode[absPartIdx] != MODE_INTRA && depth == cu.m_cuDepth[absPartIdx] && !cu.getCbf(absPartIdx, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdx, TEXT_CHROMA_V, 0))
- {
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
- }
- else
- codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+ uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
+ codeDeltaQP(cu, absPartIdxLT);
+ bCodeDQP = false;
+ }
- if (cbfY || cbfU || cbfV)
- {
- // dQP: only for CTU once
- if (cu.m_slice->m_pps->bUseDQP)
- {
- if (bCodeDQP)
- {
- codeDeltaQP(cu, state.bakAbsPartIdxCU);
- bCodeDQP = false;
- }
- }
- }
- if (cbfY)
- codeCoeffNxN(cu, cu.m_trCoeff[0] + offsetLuma, absPartIdx, log2TrSize, TEXT_LUMA);
+ if (cbfY)
+ {
+ uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
+ codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2TrSize, TEXT_LUMA);
+ if (!(cbfU || cbfV))
+ return;
+ }
- int chFmt = cu.m_chromaFormat;
- if (log2TrSize == 2 && chFmt != X265_CSP_I444)
+ if (bSmallChroma)
+ {
+ if ((absPartIdx & 3) != 3)
+ return;
+
+ const uint32_t log2TrSizeC = 2;
+ const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422);
+ const uint32_t curPartNum = 4;
+ uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
+ for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- if ((absPartIdx & (partNum - 1)) == (partNum - 1))
+ TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC);
+ const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
+ do
{
- const uint32_t log2TrSizeC = 2;
- const bool splitIntoSubTUs = (chFmt == X265_CSP_I422);
-
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
-
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs))
{
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, state.bakAbsPartIdx);
- const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
- do
- {
- uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
- if (cbf)
- {
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- codeCoeffNxN(cu, coeffChroma + state.bakChromaOffset + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
- }
- }
- while (tuIterator.isNextSection());
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
}
}
+ while (tuIterator.isNextSection());
}
- else
+ }
+ else
+ {
+ uint32_t log2TrSizeC = log2TrSize - hChromaShift;
+ const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422);
+ uint32_t curPartNum = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
+ uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
+ for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- uint32_t log2TrSizeC = log2TrSize - hChromaShift;
- const bool splitIntoSubTUs = (chFmt == X265_CSP_I422);
- uint32_t curPartNum = NUM_CU_PARTITIONS >> (depth << 1);
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdxC);
+ const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
+ do
{
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx);
- const coeff_t* coeffChroma = cu.m_trCoeff[chromaId];
- do
+ if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs))
{
- uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
- if (cbf)
- {
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- codeCoeffNxN(cu, coeffChroma + offsetChroma + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
- }
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId);
}
- while (tuIterator.isNextSection());
}
+ while (tuIterator.isNextSection());
}
}
}
@@ -808,14 +817,14 @@ void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
codeIntraDirChroma(cu, absPartIdx, chromaDirMode);
- if ((cu.m_chromaFormat == X265_CSP_I444) && (cu.m_partSize[absPartIdx] == SIZE_NxN))
+ if (cu.m_chromaFormat == X265_CSP_I444 && cu.m_partSize[absPartIdx] != SIZE_2Nx2N)
{
- uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2;
- for (uint32_t i = 1; i <= 3; i++)
+ uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 1; qIdx < 4; ++qIdx)
{
- uint32_t offset = absPartIdx + i * partOffset;
- cu.getAllowedChromaDir(offset, chromaDirMode);
- codeIntraDirChroma(cu, offset, chromaDirMode);
+ absPartIdx += qNumParts;
+ cu.getAllowedChromaDir(absPartIdx, chromaDirMode);
+ codeIntraDirChroma(cu, absPartIdx, chromaDirMode);
}
}
}
@@ -867,7 +876,7 @@ void Entropy::codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list)
codeRefFrmIdx(cu, absPartIdx, list);
}
-void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2])
+void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2])
{
if (!cu.isIntra(absPartIdx))
{
@@ -877,12 +886,8 @@ void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, b
return;
}
- uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
- uint32_t lumaOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
- uint32_t chromaOffset = lumaOffset >> (cu.m_hChromaShift + cu.m_vChromaShift);
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> (depth << 1);
- CoeffCodeState state;
- encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange);
+ uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+ encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
}
void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
@@ -925,7 +930,7 @@ void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
/** initialize context model with respect to QP and initialization value */
uint8_t sbacInit(int qp, int initValue)
{
- qp = Clip3(0, 51, qp);
+ qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
int slope = (initValue >> 4) * 5 - 45;
int offset = ((initValue & 15) << 3) - 16;
@@ -1116,7 +1121,7 @@ void Entropy::writeCoefRemainExGolomb(uint32_t codeNumber, uint32_t absGoRice)
if (codeNumber != 0)
{
unsigned long idx;
- CLZ32(idx, codeNumber + 1);
+ CLZ(idx, codeNumber + 1);
length = idx;
codeNumber -= (1 << idx) - 1;
}
@@ -1145,11 +1150,6 @@ void Entropy::copyFrom(const Entropy& src)
markValid();
}
-void Entropy::codeMVPIdx(uint32_t symbol)
-{
- encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]);
-}
-
void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
{
PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
@@ -1200,32 +1200,6 @@ void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth
}
}
-void Entropy::codePredMode(int predMode)
-{
- encodeBin(predMode == MODE_INTER ? 0 : 1, m_contextState[OFF_PRED_MODE_CTX]);
-}
-
-void Entropy::codeCUTransquantBypassFlag(uint32_t symbol)
-{
- encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]);
-}
-
-void Entropy::codeSkipFlag(const CUData& cu, uint32_t absPartIdx)
-{
- // get context function is here
- uint32_t symbol = cu.isSkipped(absPartIdx) ? 1 : 0;
- uint32_t ctxSkip = cu.getCtxSkipFlag(absPartIdx);
-
- encodeBin(symbol, m_contextState[OFF_SKIP_FLAG_CTX + ctxSkip]);
-}
-
-void Entropy::codeMergeFlag(const CUData& cu, uint32_t absPartIdx)
-{
- const uint32_t symbol = cu.m_mergeFlag[absPartIdx] ? 1 : 0;
-
- encodeBin(symbol, m_contextState[OFF_MERGE_FLAG_EXT_CTX]);
-}
-
void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx)
{
uint32_t numCand = cu.m_slice->m_maxNumMergeCand;
@@ -1246,50 +1220,18 @@ void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx)
}
}
-void Entropy::codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
-{
- X265_CHECK(depth < g_maxCUDepth, "invalid depth\n");
-
- uint32_t ctx = cu.getCtxSplitFlag(absPartIdx, depth);
- uint32_t currSplitFlag = (cu.m_cuDepth[absPartIdx] > depth) ? 1 : 0;
-
- X265_CHECK(ctx < 3, "ctx out of range\n");
- encodeBin(currSplitFlag, m_contextState[OFF_SPLIT_FLAG_CTX + ctx]);
-}
-
-void Entropy::codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx)
-{
- encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]);
-}
-
-uint32_t Entropy::bitsIntraModeNonMPM() const
-{
- uint32_t mstate = m_contextState[OFF_ADI_CTX];
- uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 0)) >> 15;
- return bits + 5; /* fixed cost for encodeBinsEP() */
-}
-
-uint32_t Entropy::bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const
-{
- X265_CHECK(dir == preds[0] || dir == preds[1] || dir == preds[2], "dir must be a most probable mode\n");
- uint32_t mstate = m_contextState[OFF_ADI_CTX];
- uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 1)) >> 15;
- return bits + (dir == preds[0] ? 1 : 2);
-}
-
void Entropy::codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple)
{
uint32_t dir[4], j;
uint32_t preds[4][3];
int predIdx[4];
- PartSize mode = (PartSize)cu.m_partSize[absPartIdx];
- uint32_t partNum = isMultiple ? (mode == SIZE_NxN ? 4 : 1) : 1;
- uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2;
+ uint32_t partNum = isMultiple && cu.m_partSize[absPartIdx] != SIZE_2Nx2N ? 4 : 1;
+ uint32_t qNumParts = 1 << (cu.m_log2CUSize[absPartIdx] - 1 - LOG2_UNIT_SIZE) * 2;
- for (j = 0; j < partNum; j++)
+ for (j = 0; j < partNum; j++, absPartIdx += qNumParts)
{
- dir[j] = cu.m_lumaIntraDir[absPartIdx + partOffset * j];
- cu.getIntraDirLumaPredictor(absPartIdx + partOffset * j, preds[j]);
+ dir[j] = cu.m_lumaIntraDir[absPartIdx];
+ cu.getIntraDirLumaPredictor(absPartIdx, preds[j]);
predIdx[j] = -1;
for (uint32_t i = 0; i < 3; i++)
if (dir[j] == preds[j][i])
@@ -1444,211 +1386,174 @@ void Entropy::codeDeltaQP(const CUData& cu, uint32_t absPartIdx)
}
}
-void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel)
+void Entropy::codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel)
{
- uint32_t ctx = ctxCbf[ttype][trDepth];
+ uint32_t ctx = tuDepth + 2;
- bool canQuadSplit = (width >= (MIN_TU_SIZE * 2)) && (height >= (MIN_TU_SIZE * 2));
- uint32_t lowestTUDepth = trDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF
+ uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+ bool canQuadSplit = (log2TrSize - cu.m_hChromaShift > 2);
+ uint32_t lowestTUDepth = tuDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF
- if ((width != height) && (lowestLevel || !canQuadSplit)) // if sub-TUs are present
+ if (cu.m_chromaFormat == X265_CSP_I422 && (lowestLevel || !canQuadSplit)) // if sub-TUs are present
{
uint32_t subTUDepth = lowestTUDepth + 1; // if this is the lowest level of the TU-tree, the sub-TUs are directly below.
// Otherwise, this must be the level above the lowest level (as specified above)
- uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
+ uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
- uint32_t cbf = cu.getCbf(subTUAbsPartIdx, ttype, subTUDepth);
-
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
- }
+ encodeBin(cu.getCbf(absPartIdx , ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
+ encodeBin(cu.getCbf(absPartIdx + tuNumParts, ttype, subTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
}
else
- {
- uint32_t cbf = cu.getCbf(absPartIdx, ttype, lowestTUDepth);
-
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
- }
-}
-
-void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth)
-{
- uint32_t ctx = ctxCbf[ttype][trDepth];
- uint32_t cbf = cu.getCbf(absPartIdx, ttype, trDepth);
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth)
-{
- uint32_t ctx = ctxCbf[ttype][trDepth];
- encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype)
-{
- if (cu.m_tqBypass[absPartIdx])
- return;
- if (trSize != 4)
- return;
-
- uint32_t useTransformSkip = cu.m_transformSkip[ttype][absPartIdx];
- encodeBin(useTransformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]);
-}
-
-void Entropy::codeQtRootCbf(uint32_t cbf)
-{
- encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]);
-}
-
-void Entropy::codeQtCbfZero(TextType ttype, uint32_t trDepth)
-{
- // this function is only used to estimate the bits when cbf is 0
- // and will never be called when writing the bitsream.
- uint32_t ctx = ctxCbf[ttype][trDepth];
- encodeBin(0, m_contextState[OFF_QT_CBF_CTX + ctx]);
-}
-
-void Entropy::codeQtRootCbfZero()
-{
- // this function is only used to estimate the bits when cbf is 0
- // and will never be called when writing the bistream.
- encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]);
-}
-
-/** Encode (X,Y) position of the last significant coefficient
- * \param posx X component of last coefficient
- * \param posy Y component of last coefficient
- * \param log2TrSize
- * \param bIsLuma
- * \param scanIdx scan type (zig-zag, hor, ver)
- * This method encodes the X and Y component within a block of the last significant coefficient.
- */
-void Entropy::codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx)
-{
- // swap
- if (scanIdx == SCAN_VER)
- std::swap(posx, posy);
-
- uint32_t ctxLast;
- uint32_t groupIdxX = getGroupIdx(posx);
- uint32_t groupIdxY = getGroupIdx(posy);
-
- int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
- int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
- uint32_t maxGroupIdx = log2TrSize * 2 - 1;
-
- // posX
- uint8_t *ctxX = &m_contextState[OFF_CTX_LAST_FLAG_X];
- for (ctxLast = 0; ctxLast < groupIdxX; ctxLast++)
- encodeBin(1, *(ctxX + blkSizeOffset + (ctxLast >> ctxShift)));
-
- if (groupIdxX < maxGroupIdx)
- encodeBin(0, *(ctxX + blkSizeOffset + (ctxLast >> ctxShift)));
-
- // posY
- uint8_t *ctxY = &m_contextState[OFF_CTX_LAST_FLAG_Y];
- for (ctxLast = 0; ctxLast < groupIdxY; ctxLast++)
- encodeBin(1, *(ctxY + blkSizeOffset + (ctxLast >> ctxShift)));
-
- if (groupIdxY < maxGroupIdx)
- encodeBin(0, *(ctxY + blkSizeOffset + (ctxLast >> ctxShift)));
-
- if (groupIdxX > 3)
- {
- uint32_t count = (groupIdxX - 2) >> 1;
- posx = posx - g_minInGroup[groupIdxX];
- encodeBinsEP(posx, count);
- }
- if (groupIdxY > 3)
- {
- uint32_t count = (groupIdxY - 2) >> 1;
- posy = posy - g_minInGroup[groupIdxY];
- encodeBinsEP(posy, count);
- }
+ encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
}
void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
{
uint32_t trSize = 1 << log2TrSize;
+ uint32_t tqBypass = cu.m_tqBypass[absPartIdx];
// compute number of significant coefficients
uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1)));
X265_CHECK(numSig > 0, "cbf check fail\n");
- bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled && !cu.m_tqBypass[absPartIdx];
+ bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled && !tqBypass;
+
+ if (log2TrSize <= MAX_LOG2_TS_SIZE && !tqBypass && cu.m_slice->m_pps->bTransformSkipEnabled)
+ codeTransformSkipFlags(cu.m_transformSkip[ttype][absPartIdx], ttype);
- if (cu.m_slice->m_pps->bTransformSkipEnabled)
- codeTransformSkipFlags(cu, absPartIdx, trSize, ttype);
-
bool bIsLuma = ttype == TEXT_LUMA;
// select scans
TUEntropyCodingParameters codingParameters;
cu.getTUEntropyCodingParameters(codingParameters, absPartIdx, log2TrSize, bIsLuma);
+ uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16]
+ uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign
+ uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff
+ memset(coeffNum, 0, sizeof(coeffNum));
+ memset(coeffFlag, 0, sizeof(coeffFlag));
+ memset(coeffSign, 0, sizeof(coeffSign));
+
//----- encode significance map -----
// Find position of last coefficient
int scanPosLast = 0;
uint32_t posLast;
uint64_t sigCoeffGroupFlag64 = 0;
- const uint32_t maskPosXY = ((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1;
- assert((uint32_t)((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1) == (((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1));
+ //const uint32_t maskPosXY = ((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1;
+ X265_CHECK((uint32_t)((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1) == (((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1), "maskPosXY fault\n");
+
+ uint32_t cgBlkNum = 0;
do
{
+ const uint32_t cgBlkIdx = scanPosLast & (MLS_CG_BLK_SIZE - 1);
+ const uint32_t cgIdx = scanPosLast >> MLS_CG_SIZE;
+
posLast = codingParameters.scan[scanPosLast++];
- const uint32_t isNZCoeff = (coeff[posLast] != 0);
+ const int curCoeff = coeff[posLast];
+ const uint32_t isNZCoeff = (curCoeff != 0);
// get L1 sig map
// NOTE: the new algorithm is complicated, so I keep reference code here
//uint32_t posy = posLast >> log2TrSize;
//uint32_t posx = posLast - (posy << log2TrSize);
//uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
- const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
- sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
+ //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
+ //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
numSig -= isNZCoeff;
+
+ // TODO: optimize by instruction BTS
+ coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << cgBlkNum);
+ coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
+ cgBlkNum += isNZCoeff;
+ // TODO: reduce memory store operator, but avoid conditional branch
+ coeffNum[cgIdx] = (uint8_t)cgBlkNum;
+
+ if (cgBlkIdx == (MLS_CG_BLK_SIZE - 1))
+ {
+ cgBlkNum = 0;
+ }
}
while (numSig > 0);
scanPosLast--;
+ const int lastScanSet = scanPosLast >> MLS_CG_SIZE;
+
+ // Calculate CG block non-zero mask, the latest CG always flag as non-zero in CG scan loop
+ for(int idx = 0; idx < lastScanSet; idx++)
+ {
+ const uint8_t subSet = (uint8_t)codingParameters.scanCG[idx];
+ const uint8_t nonZero = (coeffNum[idx] != 0);
+ sigCoeffGroupFlag64 |= ((nonZero ? (uint64_t)1 : 0) << subSet);
+ }
+
// Code position of last coefficient
- int posLastY = posLast >> log2TrSize;
- int posLastX = posLast & (trSize - 1);
- codeLastSignificantXY(posLastX, posLastY, log2TrSize, bIsLuma, codingParameters.scanType);
+ {
+ // The last position is composed of a prefix and suffix.
+ // The prefix is context coded truncated unary bins. The suffix is bypass coded fixed length bins.
+ // The bypass coded bins for both the x and y components are grouped together.
+ uint32_t packedSuffixBits = 0, packedSuffixLen = 0;
+ uint32_t pos[2] = { (posLast & (trSize - 1)), (posLast >> log2TrSize) };
+ // swap
+ if (codingParameters.scanType == SCAN_VER)
+ std::swap(pos[0], pos[1]);
+
+ int ctxIdx = bIsLuma ? (3 * (log2TrSize - 2) + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
+ int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
+ uint32_t maxGroupIdx = (log2TrSize << 1) - 1;
+
+ uint8_t *ctx = &m_contextState[OFF_CTX_LAST_FLAG_X];
+ for (uint32_t i = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
+ {
+ uint32_t temp = g_lastCoeffTable[pos[i]];
+ uint32_t prefixOnes = temp & 15;
+ uint32_t suffixLen = temp >> 4;
+
+ for (uint32_t ctxLast = 0; ctxLast < prefixOnes; ctxLast++)
+ encodeBin(1, *(ctx + ctxIdx + (ctxLast >> ctxShift)));
- //===== code significance flag =====
+ if (prefixOnes < maxGroupIdx)
+ encodeBin(0, *(ctx + ctxIdx + (prefixOnes >> ctxShift)));
+
+ packedSuffixBits <<= suffixLen;
+ packedSuffixBits |= (pos[i] & ((1 << suffixLen) - 1));
+ packedSuffixLen += suffixLen;
+ }
+
+ encodeBinsEP(packedSuffixBits, packedSuffixLen);
+ }
+
+ // code significance flag
uint8_t * const baseCoeffGroupCtx = &m_contextState[OFF_SIG_CG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_CG_FLAG_CTX)];
uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
- const int lastScanSet = scanPosLast >> MLS_CG_SIZE;
uint32_t c1 = 1;
uint32_t goRiceParam = 0;
- int scanPosSig = scanPosLast;
+ int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
+ int absCoeff[1 << MLS_CG_SIZE];
+ int numNonZero = 1;
+ unsigned long lastNZPosInCG;
+ unsigned long firstNZPosInCG;
+
+ absCoeff[0] = int(abs(coeff[posLast]));
for (int subSet = lastScanSet; subSet >= 0; subSet--)
{
- int numNonZero = 0;
- int subPos = subSet << MLS_CG_SIZE;
+ const uint32_t subCoeffFlag = coeffFlag[subSet];
+ uint32_t scanFlagMask = subCoeffFlag;
+ int subPosBase = subSet << MLS_CG_SIZE;
goRiceParam = 0;
- int absCoeff[1 << MLS_CG_SIZE];
- uint32_t coeffSigns = 0;
- int lastNZPosInCG = -1;
- int firstNZPosInCG = 1 << MLS_CG_SIZE;
- if (scanPosSig == scanPosLast)
+
+ if (subSet == lastScanSet)
{
- absCoeff[0] = int(abs(coeff[posLast]));
- coeffSigns = (coeff[posLast] < 0);
- numNonZero = 1;
- lastNZPosInCG = scanPosSig;
- firstNZPosInCG = scanPosSig;
- scanPosSig--;
+ X265_CHECK(scanPosSigOff == scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1, "scanPos mistake\n");
+ scanFlagMask >>= 1;
}
+
// encode significant_coeffgroup_flag
const int cgBlkPos = codingParameters.scanCG[subSet];
- const int cgPosY = cgBlkPos >> codingParameters.log2TrSizeCG;
- const int cgPosX = cgBlkPos - (cgPosY << codingParameters.log2TrSizeCG);
+ const int cgPosY = cgBlkPos >> (log2TrSize - MLS_CG_LOG2_SIZE);
+ const int cgPosX = cgBlkPos & ((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1);
const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
if (subSet == lastScanSet || !subSet)
@@ -1665,31 +1570,31 @@ void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absP
{
const int patternSigCtx = Quant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG);
uint32_t blkPos, sig, ctxSig;
- for (; scanPosSig >= subPos; scanPosSig--)
+ for (; scanPosSigOff >= 0; scanPosSigOff--)
{
- blkPos = codingParameters.scan[scanPosSig];
- sig = (coeff[blkPos] != 0);
- if (scanPosSig > subPos || subSet == 0 || numNonZero)
+ blkPos = codingParameters.scan[subPosBase + scanPosSigOff];
+ sig = scanFlagMask & 1;
+ scanFlagMask >>= 1;
+ X265_CHECK((uint32_t)(coeff[blkPos] != 0) == sig, "sign bit mistake\n");
+ if (scanPosSigOff != 0 || subSet == 0 || numNonZero)
{
ctxSig = Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext);
encodeBin(sig, baseCtx[ctxSig]);
}
- if (sig)
- {
- absCoeff[numNonZero] = int(abs(coeff[blkPos]));
- coeffSigns = 2 * coeffSigns + ((uint32_t)coeff[blkPos] >> 31);
- numNonZero++;
- if (lastNZPosInCG < 0)
- lastNZPosInCG = scanPosSig;
- firstNZPosInCG = scanPosSig;
- }
+ absCoeff[numNonZero] = int(abs(coeff[blkPos]));
+ numNonZero += sig;
}
}
- else
- scanPosSig = subPos - 1;
+ X265_CHECK(coeffNum[subSet] == numNonZero, "coefNum mistake\n");
+ uint32_t coeffSigns = coeffSign[subSet];
+ numNonZero = coeffNum[subSet];
if (numNonZero > 0)
{
+ X265_CHECK(subCoeffFlag > 0, "subCoeffFlag is zero\n");
+ CLZ(lastNZPosInCG, subCoeffFlag);
+ CTZ(firstNZPosInCG, subCoeffFlag);
+
bool signHidden = (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD);
uint32_t ctxSet = (subSet > 0 && bIsLuma) ? 2 : 0;
@@ -1726,10 +1631,8 @@ void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absP
}
}
- if (bHideFirstSign && signHidden)
- encodeBinsEP((coeffSigns >> 1), numNonZero - 1);
- else
- encodeBinsEP(coeffSigns, numNonZero);
+ const int hiddenShift = (bHideFirstSign && signHidden) ? 1 : 0;
+ encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - hiddenShift);
int firstCoeff2 = 1;
if (!c1 || numNonZero > C1FLAG_NUMBER)
@@ -1749,6 +1652,9 @@ void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absP
}
}
}
+ // Initialize value for next loop
+ numNonZero = 0;
+ scanPosSigOff = (1 << MLS_CG_SIZE) - 1;
}
}
@@ -1845,32 +1751,26 @@ void Entropy::estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize
for (uint32_t bin = 0; bin < 2; bin++)
estBitsSbac.significantBits[ctxIdx][bin] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin);
}
- int bitsX = 0, bitsY = 0;
int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
uint32_t maxGroupIdx = log2TrSize * 2 - 1;
uint32_t ctx;
- const uint8_t *ctxX = &m_contextState[OFF_CTX_LAST_FLAG_X];
- for (ctx = 0; ctx < maxGroupIdx; ctx++)
+ for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
{
- int ctxOffset = blkSizeOffset + (ctx >> ctxShift);
- estBitsSbac.lastXBits[ctx] = bitsX + sbacGetEntropyBits(ctxX[ctxOffset], 0);
- bitsX += sbacGetEntropyBits(ctxX[ctxOffset], 1);
- }
+ int bits = 0;
+ const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
- estBitsSbac.lastXBits[ctx] = bitsX;
+ for (ctx = 0; ctx < maxGroupIdx; ctx++)
+ {
+ int ctxOffset = blkSizeOffset + (ctx >> ctxShift);
+ estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctxOffset], 0);
+ bits += sbacGetEntropyBits(ctxState[ctxOffset], 1);
+ }
- const uint8_t *ctxY = &m_contextState[OFF_CTX_LAST_FLAG_Y];
- for (ctx = 0; ctx < maxGroupIdx; ctx++)
- {
- int ctxOffset = blkSizeOffset + (ctx >> ctxShift);
- estBitsSbac.lastYBits[ctx] = bitsY + sbacGetEntropyBits(ctxY[ctxOffset], 0);
- bitsY += sbacGetEntropyBits(ctxY[ctxOffset], 1);
+ estBitsSbac.lastBits[i][ctx] = bits;
}
-
- estBitsSbac.lastYBits[ctx] = bitsY;
}
/* estimate bit cost of significant coefficient */
@@ -2006,9 +1906,9 @@ void Entropy::encodeBin(uint32_t binValue, uint8_t &ctxModel)
if ((binValue ^ mstate) & 1)
{
// NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256
- //numBits = g_renormTable[lps >> 3];
+ //numBits = g_renormTable[lps >> 3];
unsigned long idx;
- CLZ32(idx, lps);
+ CLZ(idx, lps);
X265_CHECK(state != 63 || idx == 1, "state failure\n");
numBits = 8 - idx;
diff --git a/source/encoder/entropy.h b/source/encoder/entropy.h
index bed06cf..2bb1f83 100644
--- a/source/encoder/entropy.h
+++ b/source/encoder/entropy.h
@@ -27,6 +27,7 @@
#include "common.h"
#include "bitstream.h"
#include "frame.h"
+#include "cudata.h"
#include "contexts.h"
#include "slice.h"
@@ -35,8 +36,6 @@ namespace x265 {
struct SaoCtuParam;
struct EstBitsSbac;
-class CUData;
-struct CUGeom;
class ScalingList;
enum SplitType
@@ -89,8 +88,7 @@ struct EstBitsSbac
{
int significantCoeffGroupBits[NUM_SIG_CG_FLAG_CTX][2];
int significantBits[NUM_SIG_FLAG_CTX][2];
- int lastXBits[10];
- int lastYBits[10];
+ int lastBits[2][10];
int greaterOneBits[NUM_ONE_FLAG_CTX][2];
int levelAbsBits[NUM_ABS_FLAG_CTX][2];
int blockCbpBits[NUM_QT_CBF_CTX][2];
@@ -154,41 +152,56 @@ public:
void finishSlice() { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
void encodeCTU(const CUData& cu, const CUGeom& cuGeom);
- void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
- void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
- void codeCUTransquantBypassFlag(uint32_t symbol);
- void codeSkipFlag(const CUData& cu, uint32_t absPartIdx);
- void codeMergeFlag(const CUData& cu, uint32_t absPartIdx);
+ void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
+ void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+
void codeMergeIndex(const CUData& cu, uint32_t absPartIdx);
- void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
- void codeMVPIdx(uint32_t symbol);
void codeMvd(const CUData& cu, uint32_t absPartIdx, int list);
void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
- void codePredMode(int predMode);
void codePredInfo(const CUData& cu, uint32_t absPartIdx);
- void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx);
- void codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel);
- void codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth);
- void codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth);
- void codeQtCbfZero(TextType ttype, uint32_t trDepth);
- void codeQtRootCbfZero();
- void codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]);
+ inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
+
+ void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
+ void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
- uint32_t bitsIntraModeNonMPM() const;
- uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const;
- void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
- void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+ inline void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
+ inline void codeMVPIdx(uint32_t symbol) { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
+ inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
+ inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
+ inline void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) { encodeBin(cu.m_cuDepth[absPartIdx] > depth, m_contextState[OFF_SPLIT_FLAG_CTX + cu.getCtxSplitFlag(absPartIdx, depth)]); }
+ inline void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx) { encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); }
+ inline void codePredMode(int predMode) { encodeBin(predMode == MODE_INTRA ? 1 : 0, m_contextState[OFF_PRED_MODE_CTX]); }
+ inline void codeCUTransquantBypassFlag(uint32_t symbol) { encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); }
+ inline void codeQtCbfLuma(uint32_t cbf, uint32_t tuDepth) { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + !tuDepth]); }
+ inline void codeQtCbfChroma(uint32_t cbf, uint32_t tuDepth) { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + 2 + tuDepth]); }
+ inline void codeQtRootCbf(uint32_t cbf) { encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+ inline void codeTransformSkipFlags(uint32_t transformSkip, TextType ttype) { encodeBin(transformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); }
+
+ void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
- // RDO functions
+ /* RDO functions */
void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
void estCBFBit(EstBitsSbac& estBitsSbac) const;
void estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
void estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
void estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
+ inline uint32_t bitsIntraModeNonMPM() const { return bitsCodeBin(0, m_contextState[OFF_ADI_CTX]) + 5; }
+ inline uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const { return bitsCodeBin(1, m_contextState[OFF_ADI_CTX]) + (dir == preds[0] ? 1 : 2); }
+ inline uint32_t estimateCbfBits(uint32_t cbf, TextType ttype, uint32_t tuDepth) const { return bitsCodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctxCbf[ttype][tuDepth]]); }
+ uint32_t bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const;
+ uint32_t bitsIntraMode(const CUData& cu, uint32_t absPartIdx) const
+ {
+ return bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]) + /* not skip */
+ bitsCodeBin(1, m_contextState[OFF_PRED_MODE_CTX]); /* intra */
+ }
+
+ /* these functions are only used to estimate the bits when cbf is 0 and will never be called when writing the bistream. */
+ inline void codeQtRootCbfZero() { encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+
private:
/* CABAC private methods */
@@ -200,8 +213,15 @@ private:
void encodeBinsEP(uint32_t binValues, int numBins);
void encodeBinTrm(uint32_t binValue);
- void encodeCU(const CUData& cu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
- void finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
+ /* return the bits of encoding the context bin without updating */
+ inline uint32_t bitsCodeBin(uint32_t binValue, uint32_t ctxModel) const
+ {
+ uint64_t fracBits = (m_fracBits & 32767) + sbacGetEntropyBits(ctxModel, binValue);
+ return (uint32_t)(fracBits >> 15);
+ }
+
+ void encodeCU(const CUData& ctu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
+ void finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth);
void writeOut();
@@ -217,7 +237,6 @@ private:
void codePredWeightTable(const Slice& slice);
void codeInterDir(const CUData& cu, uint32_t absPartIdx);
void codePUWise(const CUData& cu, uint32_t absPartIdx);
- void codeQtRootCbf(uint32_t cbf);
void codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list);
void codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list);
@@ -225,18 +244,9 @@ private:
void codeDeltaQP(const CUData& cu, uint32_t absPartIdx);
void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx);
- void codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype);
- struct CoeffCodeState
- {
- uint32_t bakAbsPartIdx;
- uint32_t bakChromaOffset;
- uint32_t bakAbsPartIdxCU;
- };
-
- void encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma,
- uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx,
- bool& bCodeDQP, uint32_t depthRange[2]);
+ void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+ bool& bCodeDQP, const uint32_t depthRange[2]);
void copyFrom(const Entropy& src);
void copyContextsFrom(const Entropy& src);
diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
index c6e6915..b481ad4 100644
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -29,8 +29,6 @@
#include "wavefront.h"
#include "param.h"
-#include "PPA/ppa.h"
-
#include "encoder.h"
#include "frameencoder.h"
#include "common.h"
@@ -44,8 +42,11 @@ FrameEncoder::FrameEncoder()
: WaveFront(NULL)
, m_threadActive(true)
{
- m_totalTime = 0;
+ m_prevOutputTime = x265_mdate();
+ m_totalWorkerElapsedTime = 0;
+ m_slicetypeWaitTime = 0;
m_frameEncoderID = 0;
+ m_activeWorkerCount = 0;
m_bAllRowsStop = false;
m_vbvResetTriggerRow = -1;
m_outStreams = NULL;
@@ -126,23 +127,24 @@ bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id)
ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
}
- if (m_param->noiseReduction)
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
m_nr = X265_MALLOC(NoiseReduction, 1);
if (m_nr)
memset(m_nr, 0, sizeof(NoiseReduction));
else
- m_param->noiseReduction = 0;
+ m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
start();
return ok;
}
/* Generate a complete list of unique geom sets for the current picture dimensions */
-bool FrameEncoder::initializeGeoms(const FrameData& encData)
+bool FrameEncoder::initializeGeoms()
{
/* Geoms only vary between CTUs in the presence of picture edges */
- int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1);
- int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1);
+ int maxCUSize = m_param->maxCUSize;
+ int heightRem = m_param->sourceHeight & (maxCUSize - 1);
+ int widthRem = m_param->sourceWidth & (maxCUSize - 1);
int allocGeoms = 1; // body
if (heightRem && widthRem)
allocGeoms = 4; // body, right, bottom, corner
@@ -154,33 +156,45 @@ bool FrameEncoder::initializeGeoms(const FrameData& encData)
if (!m_cuGeoms || !m_ctuGeomMap)
return false;
- CUGeom cuLocalData[CUGeom::MAX_GEOMS];
- memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp
+ // body
+ CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms);
+ memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
+ if (allocGeoms == 1)
+ return true;
- int countGeoms = 0;
- for (uint32_t ctuAddr = 0; ctuAddr < m_numRows * m_numCols; ctuAddr++)
+ int countGeoms = 1;
+ if (widthRem)
{
- /* TODO: detach this logic from TComDataCU */
- encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0);
- encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData);
-
- m_ctuGeomMap[ctuAddr] = MAX_INT;
- for (int i = 0; i < countGeoms; i++)
+ // right
+ CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+ for (uint32_t i = 0; i < m_numRows; i++)
{
- if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS))
- {
- m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS;
- break;
- }
+ uint32_t ctuAddr = m_numCols * (i + 1) - 1;
+ m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
}
+ countGeoms++;
+ }
+ if (heightRem)
+ {
+ // bottom
+ CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+ for (uint32_t i = 0; i < m_numCols; i++)
+ {
+ uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
+ m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
+ }
+ countGeoms++;
- if (m_ctuGeomMap[ctuAddr] == MAX_INT)
+ if (widthRem)
{
- X265_CHECK(countGeoms < allocGeoms, "geometry match check failure\n");
+ // corner
+ CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+
+ uint32_t ctuAddr = m_numCols * m_numRows - 1;
m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
- memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS);
countGeoms++;
}
+ X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
}
return true;
@@ -188,20 +202,25 @@ bool FrameEncoder::initializeGeoms(const FrameData& encData)
bool FrameEncoder::startCompressFrame(Frame* curFrame)
{
+ m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
m_frame = curFrame;
curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
curFrame->m_encData->m_slice->m_mref = m_mref;
+
if (!m_cuGeoms)
{
- if (!initializeGeoms(*curFrame->m_encData))
+ if (!initializeGeoms())
return false;
}
+
m_enable.trigger();
return true;
}
void FrameEncoder::threadMain()
{
+ THREAD_NAME("Frame", m_frameEncoderID);
+
// worker thread routine for FrameEncoder
do
{
@@ -217,13 +236,21 @@ void FrameEncoder::threadMain()
void FrameEncoder::compressFrame()
{
- PPAScopeEvent(FrameEncoder_compressFrame);
- int64_t startCompressTime = x265_mdate();
- Slice* slice = m_frame->m_encData->m_slice;
+ ProfileScopeEvent(frameThread);
+
+ m_startCompressTime = x265_mdate();
+ m_totalActiveWorkerCount = 0;
+ m_activeWorkerCountSamples = 0;
+ m_totalWorkerElapsedTime = 0;
+ m_totalNoWorkerTime = 0;
+ m_countRowBlocks = 0;
+ m_allRowsAvailableTime = 0;
+ m_stallStartTime = 0;
/* Emit access unit delimiter unless this is the first frame and the user is
* not repeating headers (since AUD is supposed to be the first NAL in the access
* unit) */
+ Slice* slice = m_frame->m_encData->m_slice;
if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
{
m_bs.resetBits();
@@ -252,7 +279,7 @@ void FrameEncoder::compressFrame()
WeightParam *w = NULL;
if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
w = slice->m_weightPredTable[l][ref];
- m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w);
+ m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
}
}
@@ -262,7 +289,7 @@ void FrameEncoder::compressFrame()
m_rce.newQp = qp;
/* Clip slice QP to 0-51 spec range before encoding */
- slice->m_sliceQp = Clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
+ slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
m_initSliceContext.resetEntropy(*slice);
@@ -270,7 +297,7 @@ void FrameEncoder::compressFrame()
// reset entropy coders
m_entropyCoder.load(m_initSliceContext);
- for (int i = 0; i < m_numRows; i++)
+ for (uint32_t i = 0; i < m_numRows; i++)
m_rows[i].init(m_initSliceContext);
uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
@@ -286,6 +313,8 @@ void FrameEncoder::compressFrame()
for (uint32_t i = 0; i < numSubstreams; i++)
m_outStreams[i].resetBits();
+ int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
+
if (m_frame->m_lowres.bKeyframe)
{
if (m_param->bEmitHRDSEI)
@@ -354,7 +383,7 @@ void FrameEncoder::compressFrame()
// access unit associated with the picture timing SEI message has to
// wait after removal of the access unit with the most recent
// buffering period SEI message
- sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - m_top->m_lastBPSEI), (1 << hrd->cpbRemovalDelayLength));
+ sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
}
@@ -374,7 +403,7 @@ void FrameEncoder::compressFrame()
int totalI = 0, totalP = 0, totalSkip = 0;
// accumulate intra,inter,skip cu count per frame for 2 pass
- for (int i = 0; i < m_numRows; i++)
+ for (uint32_t i = 0; i < m_numRows; i++)
{
m_frameStats.mvBits += m_rows[i].rowStats.mvBits;
m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
@@ -452,7 +481,8 @@ void FrameEncoder::compressFrame()
}
m_accessUnitBits = bytes << 3;
- m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
+ m_endCompressTime = x265_mdate();
+
/* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
m_top->m_aborted = true;
@@ -481,7 +511,7 @@ void FrameEncoder::compressFrame()
for (int i = 0; i < m_top->m_numThreadLocalData; i++)
{
NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
- memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
+ memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
}
@@ -496,6 +526,8 @@ void FrameEncoder::compressFrame()
ATOMIC_DEC(&refpic->m_countRefEncoders);
}
}
+
+ m_endFrameTime = x265_mdate();
}
void FrameEncoder::encodeSlice()
@@ -569,7 +601,6 @@ void FrameEncoder::encodeSlice()
void FrameEncoder::compressCTURows()
{
- PPAScopeEvent(FrameEncoder_compressRows);
Slice* slice = m_frame->m_encData->m_slice;
m_bAllRowsStop = false;
@@ -590,7 +621,7 @@ void FrameEncoder::compressCTURows()
WaveFront::clearEnabledRowMask();
WaveFront::enqueue();
- for (int row = 0; row < m_numRows; row++)
+ for (uint32_t row = 0; row < m_numRows; row++)
{
// block until all reference frames have reconstructed the rows we need
for (int l = 0; l < numPredDir; l++)
@@ -599,7 +630,7 @@ void FrameEncoder::compressCTURows()
{
Frame *refpic = slice->m_refPicList[l][ref];
- int reconRowCount = refpic->m_reconRowCount.get();
+ uint32_t reconRowCount = refpic->m_reconRowCount.get();
while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
@@ -609,19 +640,23 @@ void FrameEncoder::compressCTURows()
}
enableRowEncoder(row);
- if (row == 0)
- enqueueRowEncoder(0);
- else
+ if (row)
m_pool->pokeIdleThread();
+ else
+ {
+ m_row0WaitTime = x265_mdate();
+ enqueueRowEncoder(0);
+ }
}
+ m_allRowsAvailableTime = x265_mdate();
m_completionEvent.wait();
WaveFront::dequeue();
}
else
{
- for (int i = 0; i < this->m_numRows + m_filterRowDelay; i++)
+ for (uint32_t i = 0; i < this->m_numRows + m_filterRowDelay; i++)
{
// Encode
if (i < m_numRows)
@@ -634,7 +669,7 @@ void FrameEncoder::compressCTURows()
{
Frame *refpic = slice->m_refPicList[list][ref];
- int reconRowCount = refpic->m_reconRowCount.get();
+ uint32_t reconRowCount = refpic->m_reconRowCount.get();
while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
@@ -643,22 +678,28 @@ void FrameEncoder::compressCTURows()
}
}
- processRow(i * 2 + 0, -1);
+ if (!i)
+ m_row0WaitTime = x265_mdate();
+ else if (i == m_numRows - 1)
+ m_allRowsAvailableTime = x265_mdate();
+ processRowEncoder(i, *m_tld);
}
// Filter
if (i >= m_filterRowDelay)
- processRow((i - m_filterRowDelay) * 2 + 1, -1);
+ m_frameFilter.processRow(i - m_filterRowDelay);
}
}
- m_frameTime = (double)m_totalTime / 1000000;
- m_totalTime = 0;
}
void FrameEncoder::processRow(int row, int threadId)
{
- const int realRow = row >> 1;
- const int typeNum = row & 1;
+ int64_t startTime = x265_mdate();
+ if (ATOMIC_INC(&m_activeWorkerCount) == 1 && m_stallStartTime)
+ m_totalNoWorkerTime += x265_mdate() - m_stallStartTime;
+
+ const uint32_t realRow = row >> 1;
+ const uint32_t typeNum = row & 1;
ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld;
@@ -666,7 +707,7 @@ void FrameEncoder::processRow(int row, int threadId)
processRowEncoder(realRow, tld);
else
{
- processRowFilter(realRow);
+ m_frameFilter.processRow(realRow);
// NOTE: Active next row
if (realRow != m_numRows - 1)
@@ -674,15 +715,20 @@ void FrameEncoder::processRow(int row, int threadId)
else
m_completionEvent.trigger();
}
+
+ if (ATOMIC_DEC(&m_activeWorkerCount) == 0)
+ m_stallStartTime = x265_mdate();
+
+ m_totalWorkerElapsedTime += x265_mdate() - startTime; // not thread safe, but good enough
}
// Called by worker threads
-void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
+void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
{
- PPAScopeEvent(Thread_ProcessRow);
-
+ uint32_t row = (uint32_t)intRow;
CTURow& curRow = m_rows[row];
+ if (m_param->bEnableWavefront)
{
ScopedLock self(curRow.lock);
if (!curRow.active)
@@ -707,18 +753,16 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
FrameData& curEncData = *m_frame->m_encData;
Slice *slice = curEncData.m_slice;
- PicYuv* fencPic = m_frame->m_origPicYuv;
- tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
-
- int64_t startTime = x265_mdate();
const uint32_t numCols = m_numCols;
const uint32_t lineStartCUAddr = row * numCols;
bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
while (curRow.completed < numCols)
{
- int col = curRow.completed;
+ ProfileScopeEvent(encodeCTU);
+
+ uint32_t col = curRow.completed;
const uint32_t cuAddr = lineStartCUAddr + col;
CUData* ctu = curEncData.getPicCTU(cuAddr);
ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
@@ -731,7 +775,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
}
- if (row >= col && row && m_vbvResetTriggerRow != row)
+ if (row >= col && row && m_vbvResetTriggerRow != intRow)
curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
else
curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_rowStat[row].diagQp;
@@ -743,8 +787,8 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
{
int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp);
tld.analysis.setQP(*slice, qp);
- qp = Clip3(QP_MIN, QP_MAX_SPEC, qp);
- ctu->setQPSubParts((char)qp, 0, 0);
+ qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
+ ctu->setQPSubParts((int8_t)qp, 0, 0);
curEncData.m_rowStat[row].sumQpAq += qp;
}
else
@@ -758,7 +802,11 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
}
// Does all the CU analysis, returns best top level mode decision
- Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
+ Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
+
+ // take a sample of the current active worker count
+ ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
+ ATOMIC_INC(&m_activeWorkerCountSamples);
/* advance top-level row coder to include the context of this CTU.
* if SAO is disabled, rowCoder writes the final CTU bitstream */
@@ -813,7 +861,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
{
double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
int reEncode = m_top->m_rateControl->rowDiagonalVbvRateControl(m_frame, row, &m_rce, qpBase);
- qpBase = Clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase);
+ qpBase = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase);
curEncData.m_rowStat[row].diagQp = qpBase;
curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(qpBase);
@@ -826,7 +874,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
m_vbvResetTriggerRow = row;
m_bAllRowsStop = true;
- for (int r = m_numRows - 1; r >= row; r--)
+ for (uint32_t r = m_numRows - 1; r >= row; r--)
{
CTURow& stopRow = m_rows[r];
@@ -839,9 +887,13 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
if (dequeueRow(r * 2))
stopRow.active = false;
else
+ {
+ /* we must release the row lock to allow the thread to exit */
+ stopRow.lock.release();
GIVE_UP_TIME();
+ stopRow.lock.acquire();
+ }
}
-
stopRow.lock.release();
bool bRowBusy = true;
@@ -886,7 +938,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
ScopedLock below(m_rows[row + 1].lock);
if (m_rows[row + 1].active == false &&
m_rows[row + 1].completed + 2 <= curRow.completed &&
- (!m_bAllRowsStop || row + 1 < m_vbvResetTriggerRow))
+ (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
{
m_rows[row + 1].active = true;
enqueueRowEncoder(row + 1);
@@ -894,12 +946,12 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
}
ScopedLock self(curRow.lock);
- if ((m_bAllRowsStop && row > m_vbvResetTriggerRow) ||
+ if ((m_bAllRowsStop && intRow > m_vbvResetTriggerRow) ||
(row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2))
{
curRow.active = false;
curRow.busy = false;
- m_totalTime += x265_mdate() - startTime;
+ ATOMIC_INC(&m_countRowBlocks);
return;
}
}
@@ -916,7 +968,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
* after half the frame is encoded, but after this initial period we update
* after refLagRows (the number of rows reference frames must have completed
* before referencees may begin encoding) */
- int rowCount = 0;
+ uint32_t rowCount = 0;
if (m_param->rc.rateControlMode == X265_RC_ABR)
{
if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
@@ -928,7 +980,7 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
{
m_rce.rowTotalBits = 0;
if (bIsVbv)
- for (int i = 0; i < rowCount; i++)
+ for (uint32_t i = 0; i < rowCount; i++)
m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits;
else
for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++)
@@ -937,22 +989,24 @@ void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
m_top->m_rateControl->rateControlUpdateStats(&m_rce);
}
- // trigger row-wise loop filters
- if (row >= m_filterRowDelay)
+ if (m_param->bEnableWavefront)
{
- enableRowFilter(row - m_filterRowDelay);
+ /* trigger row-wise loop filters */
+ if (row >= m_filterRowDelay)
+ {
+ enableRowFilter(row - m_filterRowDelay);
- // NOTE: Active Filter to first row (row 0)
- if (row == m_filterRowDelay)
- enqueueRowFilter(0);
- }
- if (row == m_numRows - 1)
- {
- for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
- enableRowFilter(i);
+ /* NOTE: Activate filter if first row (row 0) */
+ if (row == m_filterRowDelay)
+ enqueueRowFilter(0);
+ }
+ if (row == m_numRows - 1)
+ {
+ for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
+ enableRowFilter(i);
+ }
}
- m_totalTime += x265_mdate() - startTime;
curRow.busy = false;
}
@@ -971,13 +1025,13 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
log->cntIntra[depth]++;
log->qTreeIntraCnt[depth]++;
- if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
+ if (ctu.m_predMode[absPartIdx] == MODE_NONE)
{
log->totalCu--;
log->cntIntra[depth]--;
log->qTreeIntraCnt[depth]--;
}
- else if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
+ else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
{
/* TODO: log intra modes at absPartIdx +0 to +3 */
X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
@@ -1000,7 +1054,7 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
log->totalCu++;
log->cntTotalCu[depth]++;
- if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
+ if (ctu.m_predMode[absPartIdx] == MODE_NONE)
{
log->totalCu--;
log->cntTotalCu[depth]--;
@@ -1011,7 +1065,7 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
log->cntSkipCu[depth]++;
log->qTreeSkipCnt[depth]++;
}
- else if (ctu.m_predMode[absPartIdx] == MODE_INTER)
+ else if (ctu.isInter(absPartIdx))
{
log->cntInter[depth]++;
log->qTreeInterCnt[depth]++;
@@ -1021,12 +1075,12 @@ void FrameEncoder::collectCTUStatistics(CUData& ctu)
else
log->cuInterDistribution[depth][AMP_ID]++;
}
- else if (ctu.m_predMode[absPartIdx] == MODE_INTRA)
+ else if (ctu.isIntra(absPartIdx))
{
log->cntIntra[depth]++;
log->qTreeIntraCnt[depth]++;
- if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
+ if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
{
X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
log->cntIntraNxN++;
@@ -1061,7 +1115,8 @@ void FrameEncoder::noiseReductionUpdate()
m_nr->count[cat] >>= 1;
}
- uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
+ int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
+ uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
for (int i = 0; i < coefCount; i++)
{
@@ -1091,8 +1146,8 @@ int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp)
/* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
double qp_offset = 0;
- uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16;
- uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16;
+ uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
+ uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
uint32_t noOfBlocks = g_maxCUSize / 16;
uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks;
uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth;
@@ -1121,7 +1176,7 @@ int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp)
qp_offset /= cnt;
qp += qp_offset;
- return Clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
+ return x265_clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
}
Frame *FrameEncoder::getEncodedPicture(NALList& output)
@@ -1134,6 +1189,7 @@ Frame *FrameEncoder::getEncodedPicture(NALList& output)
Frame *ret = m_frame;
m_frame = NULL;
output.takeContents(m_nalList);
+ m_prevOutputTime = x265_mdate();
return ret;
}
diff --git a/source/encoder/frameencoder.h b/source/encoder/frameencoder.h
index 625c025..f0c5b77 100644
--- a/source/encoder/frameencoder.h
+++ b/source/encoder/frameencoder.h
@@ -134,11 +134,19 @@ public:
Event m_enable;
Event m_done;
+ Event m_completionEvent;
bool m_threadActive;
+ int m_frameEncoderID;
- int m_numRows;
+ uint32_t m_numRows;
uint32_t m_numCols;
- int m_refLagRows;
+ uint32_t m_filterRowDelay;
+ uint32_t m_filterRowDelayCus;
+ uint32_t m_refLagRows;
+
+ volatile bool m_bAllRowsStop;
+ volatile int m_vbvResetTriggerRow;
+
CTURow* m_rows;
RateControlEntry m_rce;
SEIDecodedPictureHash m_seiReconPictureDigest;
@@ -147,17 +155,28 @@ public:
uint64_t m_SSDU;
uint64_t m_SSDV;
double m_ssim;
+ uint64_t m_accessUnitBits;
uint32_t m_ssimCnt;
MD5Context m_state[3];
uint32_t m_crc[3];
uint32_t m_checksum[3];
- double m_elapsedCompressTime; // elapsed time spent in worker threads
- double m_frameTime; // wall time from frame start to finish
StatisticLog m_sliceTypeLog[3]; // per-slice type CU statistics
FrameStats m_frameStats; // stats of current frame for multi-pass encodes
- volatile bool m_bAllRowsStop;
- volatile int m_vbvResetTriggerRow;
- uint64_t m_accessUnitBits;
+
+ volatile int m_activeWorkerCount; // count of workers currently encoding or filtering CTUs
+ volatile int m_totalActiveWorkerCount; // sum of m_activeWorkerCount sampled at end of each CTU
+ volatile int m_activeWorkerCountSamples; // count of times m_activeWorkerCount was sampled (think vbv restarts)
+ volatile int m_countRowBlocks; // count of workers forced to abandon a row because of top dependency
+ int64_t m_startCompressTime; // timestamp when frame encoder is given a frame
+ int64_t m_row0WaitTime; // timestamp when row 0 is allowed to start
+ int64_t m_allRowsAvailableTime; // timestamp when all reference dependencies are resolved
+ int64_t m_endCompressTime; // timestamp after all CTUs are compressed
+ int64_t m_endFrameTime; // timestamp after RCEnd, NR updates, etc
+ int64_t m_stallStartTime; // timestamp when worker count becomes 0
+ int64_t m_prevOutputTime; // timestamp when prev frame was retrieved by API thread
+ int64_t m_slicetypeWaitTime; // total elapsed time waiting for decided frame
+ int64_t m_totalWorkerElapsedTime; // total elapsed time spent by worker threads processing CTUs
+ int64_t m_totalNoWorkerTime; // total elapsed time without any active worker threads
Encoder* m_top;
x265_param* m_param;
@@ -177,15 +196,9 @@ public:
FrameFilter m_frameFilter;
NALList m_nalList;
- int m_filterRowDelay;
- int m_filterRowDelayCus;
- Event m_completionEvent;
- int64_t m_totalTime;
- int m_frameEncoderID;
-
protected:
- bool initializeGeoms(const FrameData& encData);
+ bool initializeGeoms();
/* analyze / compress frame, can be run in parallel within reference constraints */
void compressFrame();
@@ -204,7 +217,6 @@ protected:
/* Called by WaveFront::findJob() */
void processRow(int row, int threadId);
void processRowEncoder(int row, ThreadLocalData& tld);
- void processRowFilter(int row) { m_frameFilter.processRow(row); }
void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); }
diff --git a/source/encoder/framefilter.cpp b/source/encoder/framefilter.cpp
index aee75c6..33a577f 100644
--- a/source/encoder/framefilter.cpp
+++ b/source/encoder/framefilter.cpp
@@ -29,7 +29,6 @@
#include "framefilter.h"
#include "frameencoder.h"
#include "wavefront.h"
-#include "PPA/ppa.h"
using namespace x265;
@@ -64,8 +63,6 @@ void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
- m_deblock.init();
-
if (m_param->bEnableSAO)
if (!m_sao.create(m_param))
m_param->bEnableSAO = 0;
@@ -84,7 +81,7 @@ void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
void FrameFilter::processRow(int row)
{
- PPAScopeEvent(Thread_filterCU);
+ ProfileScopeEvent(filterCTURow);
if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
{
@@ -97,22 +94,24 @@ void FrameFilter::processRow(int row)
if (m_param->bEnableLoopFilter)
{
+ const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+ const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+
for (uint32_t col = 0; col < numCols; col++)
{
uint32_t cuAddr = lineStartCUAddr + col;
- CUData* cu = encData.getPicCTU(cuAddr);
-
- m_deblock.deblockCTU(cu, Deblock::EDGE_VER);
+ const CUData* ctu = encData.getPicCTU(cuAddr);
+ deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
if (col > 0)
{
- CUData* cuPrev = encData.getPicCTU(cuAddr - 1);
- m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
+ const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
+ deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
}
}
- CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
- m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
+ const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
+ deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
}
// SAO
@@ -156,7 +155,7 @@ uint32_t FrameFilter::getCUHeight(int rowNum) const
void FrameFilter::processRowPost(int row)
{
- PicYuv *reconPic = m_frame->m_reconPicYuv;
+ PicYuv *reconPic = m_frame->m_reconPic;
const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
const uint32_t lineStartCUAddr = row * numCols;
const int realH = getCUHeight(row);
@@ -209,19 +208,19 @@ void FrameFilter::processRowPost(int row)
uint32_t cuAddr = lineStartCUAddr;
if (m_param->bEnablePsnr)
{
- PicYuv* origPic = m_frame->m_origPicYuv;
+ PicYuv* fencPic = m_frame->m_fencPic;
intptr_t stride = reconPic->m_stride;
uint32_t width = reconPic->m_picWidth - m_pad[0];
uint32_t height = getCUHeight(row);
- uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
+ uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
height >>= m_vChromaShift;
width >>= m_hChromaShift;
stride = reconPic->m_strideC;
- uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
- uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
+ uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
+ uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
m_frameEncoder->m_SSDY += ssdY;
m_frameEncoder->m_SSDU += ssdU;
@@ -229,10 +228,10 @@ void FrameFilter::processRowPost(int row)
}
if (m_param->bEnableSsim && m_ssimBuf)
{
- pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0];
- pixel *org = m_frame->m_origPicYuv->m_picOrg[0];
- intptr_t stride1 = m_frame->m_origPicYuv->m_stride;
- intptr_t stride2 = m_frame->m_reconPicYuv->m_stride;
+ pixel *rec = m_frame->m_reconPic->m_picOrg[0];
+ pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
+ intptr_t stride1 = m_frame->m_fencPic->m_stride;
+ intptr_t stride2 = m_frame->m_reconPic->m_stride;
uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
uint32_t bStart = (row == 0);
uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
@@ -243,7 +242,7 @@ void FrameFilter::processRowPost(int row)
/* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
* to avoid alignment of ssim blocks with DCT blocks. */
minPixY += bStart ? 2 : -6;
- m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2,
+ m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
m_frameEncoder->m_ssimCnt += ssim_cnt;
}
@@ -324,65 +323,43 @@ static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t wi
}
uint32_t y = 0;
- /* Consume Y in chunks of 64 */
- for (; y + 64 <= height; y += 64)
- {
- uint32_t x = 0;
- if (!(stride & 31))
- for (; x + 64 <= width; x += 64)
- ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride);
-
- if (!(stride & 15))
- for (; x + 16 <= width; x += 16)
- ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride);
+ /* Consume rows in ever narrower chunks of height */
+ for (int size = BLOCK_64x64; size >= BLOCK_4x4 && y < height; size--)
+ {
+ uint32_t rowHeight = 1 << (size + 2);
- for (; x + 4 <= width; x += 4)
+ for (; y + rowHeight <= height; y += rowHeight)
{
- ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
- ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride);
- ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride);
- ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride);
+ uint32_t y1, x = 0;
+
+ /* Consume each row using the largest square blocks possible */
+ if (size == BLOCK_64x64 && !(stride & 31))
+ for (; x + 64 <= width; x += 64)
+ ssd += primitives.cu[BLOCK_64x64].sse_pp(fenc + x, stride, rec + x, stride);
+
+ if (size >= BLOCK_32x32 && !(stride & 15))
+ for (; x + 32 <= width; x += 32)
+ for (y1 = 0; y1 + 32 <= rowHeight; y1 += 32)
+ ssd += primitives.cu[BLOCK_32x32].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride);
+
+ if (size >= BLOCK_16x16)
+ for (; x + 16 <= width; x += 16)
+ for (y1 = 0; y1 + 16 <= rowHeight; y1 += 16)
+ ssd += primitives.cu[BLOCK_16x16].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride);
+
+ if (size >= BLOCK_8x8)
+ for (; x + 8 <= width; x += 8)
+ for (y1 = 0; y1 + 8 <= rowHeight; y1 += 8)
+ ssd += primitives.cu[BLOCK_8x8].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride);
+
+ for (; x + 4 <= width; x += 4)
+ for (y1 = 0; y1 + 4 <= rowHeight; y1 += 4)
+ ssd += primitives.cu[BLOCK_4x4].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride);
+
+ fenc += stride * rowHeight;
+ rec += stride * rowHeight;
}
-
- fenc += stride * 64;
- rec += stride * 64;
- }
-
- /* Consume Y in chunks of 16 */
- for (; y + 16 <= height; y += 16)
- {
- uint32_t x = 0;
-
- if (!(stride & 31))
- for (; x + 64 <= width; x += 64)
- ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride);
-
- if (!(stride & 15))
- for (; x + 16 <= width; x += 16)
- ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride);
-
- for (; x + 4 <= width; x += 4)
- ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
-
- fenc += stride * 16;
- rec += stride * 16;
- }
-
- /* Consume Y in chunks of 4 */
- for (; y + 4 <= height; y += 4)
- {
- uint32_t x = 0;
-
- if (!(stride & 15))
- for (; x + 16 <= width; x += 16)
- ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride);
-
- for (; x + 4 <= width; x += 4)
- ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride);
-
- fenc += stride * 4;
- rec += stride * 4;
}
return ssd;
@@ -417,59 +394,54 @@ static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t
}
/* restore original YUV samples to recon after SAO (if lossless) */
-static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
{
- uint32_t size = g_maxCUSize >> depth;
- int part = partitionFromSizes(size, size);
+ int size = cu->m_log2CUSize[absPartIdx] - 2;
+ uint32_t cuAddr = cu->m_cuAddr;
- PicYuv* reconPic = frame.m_reconPicYuv;
- PicYuv* fencPic = frame.m_origPicYuv;
+ PicYuv* reconPic = frame.m_reconPic;
+ PicYuv* fencPic = frame.m_fencPic;
- pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
- pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
+ pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
+ pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
- primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride);
+ primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
- pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
- pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
+ pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
+ pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
- pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
- pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
+ pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
+ pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
int csp = fencPic->m_picCsp;
- primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
- primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
+ primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
+ primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
}
/* Original YUV restoration for CU in lossless coding */
-static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
{
- if (cu->m_cuDepth[absPartIdx] > depth)
+ uint32_t absPartIdx = cuGeom.encodeIdx;
+ if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
{
- /* TODO: this could use cuGeom.numPartition and flags */
- uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
- uint32_t qNumParts = curNumParts >> 2;
- uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples - cu->m_cuPelX;
- uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
-
- /* process four split sub-cu at next depth */
- for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+ for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
- if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
- origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
+ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+ if (childGeom.flags & CUGeom::PRESENT)
+ origCUSampleRestoration(cu, childGeom, frame);
}
-
return;
}
// restore original YUV samples
if (cu->m_tqBypass[absPartIdx])
- restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
+ restoreOrigLosslessYuv(cu, frame, absPartIdx);
}
void FrameFilter::processSao(int row)
{
- SAOParam* saoParam = m_frame->m_encData->m_saoParam;
+ FrameData& encData = *m_frame->m_encData;
+ SAOParam* saoParam = encData.m_saoParam;
if (saoParam->bSaoFlag[0])
m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
@@ -480,12 +452,19 @@ void FrameFilter::processSao(int row)
m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
}
- if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
+ if (encData.m_slice->m_pps->bTransquantBypassEnabled)
{
- uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
+ uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
uint32_t lineStartCUAddr = row * numCols;
+ const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+ const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+
for (uint32_t col = 0; col < numCols; col++)
- origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
+ {
+ uint32_t cuAddr = lineStartCUAddr + col;
+ const CUData* ctu = encData.getPicCTU(cuAddr);
+ origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frame);
+ }
}
}
diff --git a/source/encoder/framefilter.h b/source/encoder/framefilter.h
index acdec98..8818571 100644
--- a/source/encoder/framefilter.h
+++ b/source/encoder/framefilter.h
@@ -39,7 +39,7 @@ class FrameEncoder;
struct ThreadLocalData;
// Manages the processing of a single frame loopfilter
-class FrameFilter
+class FrameFilter : public Deblock
{
public:
@@ -50,7 +50,6 @@ public:
int m_vChromaShift;
int m_pad[2];
- Deblock m_deblock;
SAO m_sao;
int m_numRows;
int m_saoRowDelay;
diff --git a/source/encoder/level.cpp b/source/encoder/level.cpp
index f00f4ca..556ca11 100644
--- a/source/encoder/level.cpp
+++ b/source/encoder/level.cpp
@@ -199,8 +199,8 @@ void determineLevel(const x265_param ¶m, VPS& vps)
* circumstances it will be quite noisy */
bool enforceLevel(x265_param& param, VPS& vps)
{
- vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : 1;
- vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
+ vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
+ vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
/* no level specified by user, just auto-detect from the configuration */
if (param.levelIdc <= 0)
diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp
index f6129ff..7d3afdc 100644
--- a/source/encoder/motion.cpp
+++ b/source/encoder/motion.cpp
@@ -34,6 +34,7 @@
using namespace x265;
namespace {
+
struct SubpelWorkload
{
int hpel_iters;
@@ -43,7 +44,7 @@ struct SubpelWorkload
bool hpel_satd;
};
-SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
+const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
{ 1, 4, 0, 4, false }, // 4 SAD HPEL only
{ 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
@@ -54,15 +55,14 @@ SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{ 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL
{ 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL
};
-}
-static int size_scale[NUM_LUMA_PARTITIONS];
-#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum])))
+int sizeScale[NUM_PU_SIZES];
+#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
-static void init_scales(void)
+void initScales(void)
{
#define SETUP_SCALE(W, H) \
- size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
+ sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
SETUP_SCALE(4, 4);
SETUP_SCALE(8, 8);
SETUP_SCALE(8, 4);
@@ -91,51 +91,18 @@ static void init_scales(void)
#undef SETUP_SCALE
}
-MotionEstimate::MotionEstimate()
- : searchMethod(3)
- , subpelRefine(5)
-{
- if (size_scale[0] == 0)
- init_scales();
-
- fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-}
-
-MotionEstimate::~MotionEstimate()
-{
- X265_FREE(fenc);
-}
-
-void MotionEstimate::setSourcePU(intptr_t offset, int width, int height)
-{
- partEnum = partitionFromSizes(width, height);
- X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
- sad = primitives.sad[partEnum];
- satd = primitives.satd[partEnum];
- sa8d = primitives.sa8d_inter[partEnum];
- sad_x3 = primitives.sad_x3[partEnum];
- sad_x4 = primitives.sad_x4[partEnum];
-
- blockwidth = width;
- blockheight = height;
- blockOffset = offset;
-
- /* copy PU block into cache */
- primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
-}
-
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
-static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
-static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
-static const MV hex4[16] =
+const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
+const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
+const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
+const MV hex4[16] =
{
- MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
+ MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
- MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
+ MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};
-static const MV offsets[] =
+const MV offsets[] =
{
MV(-1, 0), MV(0, -1),
MV(-1, -1), MV(1, -1),
@@ -147,8 +114,8 @@ static const MV offsets[] =
MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search
-/* sum of absolute differences between MV candidates */
-static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates)
+/* sum of absolute differences between MV candidates, used for adaptive ME range */
+inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
{
int sum = 0;
@@ -161,6 +128,77 @@ static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidate
return sum;
}
+}
+
+MotionEstimate::MotionEstimate()
+{
+ ctuAddr = -1;
+ absPartIdx = -1;
+ searchMethod = X265_HEX_SEARCH;
+ subpelRefine = 2;
+ bChromaSATD = false;
+ chromaSatd = NULL;
+}
+
+void MotionEstimate::init(int method, int refine, int csp)
+{
+ if (!sizeScale[0])
+ initScales();
+
+ searchMethod = method;
+ subpelRefine = refine;
+ fencPUYuv.create(FENC_STRIDE, csp);
+}
+
+MotionEstimate::~MotionEstimate()
+{
+ fencPUYuv.destroy();
+}
+
+/* Called by lookahead, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
+{
+ partEnum = partitionFromSizes(pwidth, pheight);
+ X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+ sad = primitives.pu[partEnum].sad;
+ satd = primitives.pu[partEnum].satd;
+ sad_x3 = primitives.pu[partEnum].sad_x3;
+ sad_x4 = primitives.pu[partEnum].sad_x4;
+
+ blockwidth = pwidth;
+ blockOffset = offset;
+ absPartIdx = ctuAddr = -1;
+
+ /* copy PU block into cache */
+ primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
+ X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
+}
+
+/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
+{
+ partEnum = partitionFromSizes(pwidth, pheight);
+ X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+ sad = primitives.pu[partEnum].sad;
+ satd = primitives.pu[partEnum].satd;
+ sad_x3 = primitives.pu[partEnum].sad_x3;
+ sad_x4 = primitives.pu[partEnum].sad_x4;
+ chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
+
+ /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
+ * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
+ bChromaSATD = subpelRefine > 2 && chromaSatd;
+ X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
+
+ ctuAddr = _ctuAddr;
+ absPartIdx = cuPartIdx + puPartIdx;
+ blockwidth = pwidth;
+ blockOffset = 0;
+
+ /* copy PU from CU Yuv */
+ fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
+}
+
#define COST_MV_PT_DIST(mx, my, point, dist) \
do \
{ \
@@ -291,8 +329,9 @@ void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
int merange)
{
ALIGN_VAR_16(int, costs[16]);
- pixel *fref = ref->fpelPlane + blockOffset;
- size_t stride = ref->lumaStride;
+ pixel* fenc = fencPUYuv.m_buf[0];
+ pixel* fref = ref->fpelPlane[0] + blockOffset;
+ intptr_t stride = ref->lumaStride;
MV omv = bmv;
int saved = bcost;
@@ -532,8 +571,11 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
MV & outQMv)
{
ALIGN_VAR_16(int, costs[16]);
- size_t stride = ref->lumaStride;
- pixel *fref = ref->fpelPlane + blockOffset;
+ if (ctuAddr >= 0)
+ blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
+ intptr_t stride = ref->lumaStride;
+ pixel* fenc = fencPUYuv.m_buf[0];
+ pixel* fref = ref->fpelPlane[0] + blockOffset;
setMVP(qmvp);
@@ -561,9 +603,7 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
MV bmv = pmv.roundToFPel();
int bcost = bprecost;
if (pmv.isSubpel())
- {
bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
- }
// measure SAD cost at MV(0) if MVP is not zero
if (pmv.notZero())
@@ -577,21 +617,35 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
}
// measure SAD cost at each QPEL motion vector candidate
- for (int i = 0; i < numCandidates; i++)
+ if (ref->isLowres)
{
- MV m = mvc[i].clipped(qmvmin, qmvmax);
- if (m.notZero() && m != pmv && m != bestpre) // check already measured
+ for (int i = 0; i < numCandidates; i++)
{
- int cost;
- if (ref->isLowres)
- cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
- else
- cost = subpelCompare(ref, m, sad) + mvcost(m);
-
- if (cost < bprecost)
+ MV m = mvc[i].clipped(qmvmin, qmvmax);
+ if (m.notZero() && m != pmv && m != bestpre) // check already measured
{
- bprecost = cost;
- bestpre = m;
+ int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
+ if (cost < bprecost)
+ {
+ bprecost = cost;
+ bestpre = m;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < numCandidates; i++)
+ {
+ MV m = mvc[i].clipped(qmvmin, qmvmax);
+ if (m.notZero() && m != pmv && m != bestpre) // check already measured
+ {
+ int cost = subpelCompare(ref, m, sad) + mvcost(m);
+ if (cost < bprecost)
+ {
+ bprecost = cost;
+ bestpre = m;
+ }
}
}
}
@@ -780,7 +834,7 @@ me_hex2:
mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
denom++;
}
- mvd += x265_predictor_difference(mvc, numCandidates);
+ mvd += predictorDifference(mvc, numCandidates);
}
sad_ctx = SAD_THRESH(1000) ? 0
@@ -1043,7 +1097,7 @@ me_hex2:
else
bmv = bmv.toQPel(); // promote search bmv to qpel
- SubpelWorkload& wl = workload[this->subpelRefine];
+ const SubpelWorkload& wl = workload[this->subpelRefine];
if (!bcost)
{
@@ -1053,11 +1107,11 @@ me_hex2:
}
else if (ref->isLowres)
{
- int bdir = 0, cost;
+ int bdir = 0;
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2;
- cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
@@ -1068,7 +1122,7 @@ me_hex2:
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
- cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
@@ -1088,11 +1142,11 @@ me_hex2:
for (int iter = 0; iter < wl.hpel_iters; iter++)
{
- int bdir = 0, cost;
+ int bdir = 0;
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2;
- cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
+ int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
@@ -1108,11 +1162,11 @@ me_hex2:
for (int iter = 0; iter < wl.qpel_iters; iter++)
{
- int bdir = 0, cost;
+ int bdir = 0;
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
- cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
+ int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
@@ -1130,40 +1184,94 @@ me_hex2:
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
+ intptr_t refStride = ref->lumaStride;
+ pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
int xFrac = qmv.x & 0x3;
int yFrac = qmv.y & 0x3;
+ int cost;
+ intptr_t lclStride = fencPUYuv.m_size;
+ X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
- if ((yFrac | xFrac) == 0)
- {
- pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
- return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride);
- }
+ if (!(yFrac | xFrac))
+ cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
else
{
- /* We are taking a short-cut here if the reference is weighted. To be
+ /* we are taking a short-cut here if the reference is weighted. To be
* accurate we should be interpolating unweighted pixels and weighting
- * the final 16bit values prior to rounding and downshifting. Instead we
+ * the final 16bit values prior to rounding and down shifting. Instead we
* are simply interpolating the weighted full-pel pixels. Not 100%
* accurate but good enough for fast qpel ME */
ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
- pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
- if (yFrac == 0)
- {
- primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
- }
- else if (xFrac == 0)
+ if (!yFrac)
+ primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, lclStride, xFrac);
+ else if (!xFrac)
+ primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, lclStride, yFrac);
+ else
+ primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, lclStride, xFrac, yFrac);
+
+ cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
+ }
+
+ if (bChromaSATD)
+ {
+ int csp = fencPUYuv.m_csp;
+ int hshift = fencPUYuv.m_hChromaShift;
+ int vshift = fencPUYuv.m_vChromaShift;
+ int shiftHor = (2 + hshift);
+ int shiftVer = (2 + vshift);
+ lclStride = fencPUYuv.m_csize;
+
+ intptr_t refStrideC = ref->reconPic->m_strideC;
+ intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
+
+ const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
+ const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
+
+ xFrac = qmv.x & ((1 << shiftHor) - 1);
+ yFrac = qmv.y & ((1 << shiftVer) - 1);
+
+ if (!(yFrac | xFrac))
{
- primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
}
else
{
- ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
+ ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+ if (!yFrac)
+ {
+ primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
- int filterSize = NTAPS_LUMA;
- int halfFilterSize = filterSize >> 1;
- primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1);
- primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
+ primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+ }
+ else if (!xFrac)
+ {
+ primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+ primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+ }
+ else
+ {
+ ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
+
+ int extStride = blockwidth >> hshift;
+ int filterSize = NTAPS_CHROMA;
+ int halfFilterSize = (filterSize >> 1);
+
+ primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+ primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+ primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+ primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+ cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+ }
}
- return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
}
+
+ return cost;
}
diff --git a/source/encoder/motion.h b/source/encoder/motion.h
index 51687f5..a5fddd7 100644
--- a/source/encoder/motion.h
+++ b/source/encoder/motion.h
@@ -28,6 +28,7 @@
#include "reference.h"
#include "mv.h"
#include "bitcost.h"
+#include "yuv.h"
namespace x265 {
// private x265 namespace
@@ -36,63 +37,59 @@ class MotionEstimate : public BitCost
{
protected:
- /* Aligned copy of original pixels, extra room for manual alignment */
- pixel *fencplane;
- intptr_t fencLumaStride;
-
- pixelcmp_t sad;
- pixelcmp_t satd;
- pixelcmp_t sa8d;
- pixelcmp_x3_t sad_x3;
- pixelcmp_x4_t sad_x4;
-
intptr_t blockOffset;
- int partEnum;
+
+ int ctuAddr;
+ int absPartIdx; // part index of PU, including CU offset within CTU
+
int searchMethod;
int subpelRefine;
- /* subpel generation buffers */
int blockwidth;
int blockheight;
+ pixelcmp_t sad;
+ pixelcmp_x3_t sad_x3;
+ pixelcmp_x4_t sad_x4;
+ pixelcmp_t satd;
+ pixelcmp_t chromaSatd;
+
MotionEstimate& operator =(const MotionEstimate&);
public:
static const int COST_MAX = 1 << 28;
- pixel *fenc;
+ Yuv fencPUYuv;
+ int partEnum;
+ bool bChromaSATD;
MotionEstimate();
-
~MotionEstimate();
- void setSearchMethod(int i) { searchMethod = i; }
-
- void setSubpelRefine(int i) { subpelRefine = i; }
+ void init(int method, int refine, int csp);
/* Methods called at slice setup */
- void setSourcePlane(pixel *Y, intptr_t luma)
- {
- fencplane = Y;
- fencLumaStride = luma;
- }
-
- void setSourcePU(intptr_t offset, int pwidth, int pheight);
+ void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
+ void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
/* buf*() and motionEstimate() methods all use cached fenc pixels and thus
* require setSourcePU() to be called prior. */
- inline int bufSAD(pixel *fref, intptr_t stride) { return sad(fenc, FENC_STRIDE, fref, stride); }
+ inline int bufSAD(const pixel* fref, intptr_t stride) { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
- inline int bufSA8D(pixel *fref, intptr_t stride) { return sa8d(fenc, FENC_STRIDE, fref, stride); }
+ inline int bufSATD(const pixel* fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
- inline int bufSATD(pixel *fref, intptr_t stride) { return satd(fenc, FENC_STRIDE, fref, stride); }
+ inline int bufChromaSATD(const Yuv& refYuv, int puPartIdx)
+ {
+ return chromaSatd(refYuv.getCbAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[1], fencPUYuv.m_csize) +
+ chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize);
+ }
- int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
+ int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
- int subpelCompare(ReferencePlanes * ref, const MV &qmv, pixelcmp_t);
+ int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
protected:
diff --git a/source/encoder/nal.cpp b/source/encoder/nal.cpp
index c38c651..a1e935a 100644
--- a/source/encoder/nal.cpp
+++ b/source/encoder/nal.cpp
@@ -90,7 +90,7 @@ void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
uint8_t *out = m_buffer + m_occupancy;
uint32_t bytes = 0;
- if (!m_numNal || nalUnitType == NAL_UNIT_SPS || nalUnitType == NAL_UNIT_PPS)
+ if (!m_numNal || nalUnitType == NAL_UNIT_VPS || nalUnitType == NAL_UNIT_SPS || nalUnitType == NAL_UNIT_PPS)
{
memcpy(out, startCodePrefix, 4);
bytes += 4;
@@ -193,12 +193,10 @@ uint32_t NALList::serializeSubstreams(uint32_t* streamSizeBytes, uint32_t stream
{
for (uint32_t i = 0; i < inSize; i++)
{
- if (bytes > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03)
+ if (bytes >= 2 && !out[bytes - 2] && !out[bytes - 1] && inBytes[i] <= 0x03)
{
/* inject 0x03 to prevent emulating a start code */
- out[bytes] = out[bytes - 1];
- out[bytes - 1] = 0x03;
- bytes++;
+ out[bytes++] = 3;
}
out[bytes++] = inBytes[i];
diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
index f54b101..33c70ee 100644
--- a/source/encoder/ratecontrol.cpp
+++ b/source/encoder/ratecontrol.cpp
@@ -40,8 +40,7 @@
using namespace x265;
/* Amortize the partial cost of I frames over the next N frames */
-const double RateControl::s_amortizeFraction = 0.85;
-const int RateControl::s_amortizeFrames = 75;
+
const int RateControl::s_slidingWindowFrames = 20;
const char *RateControl::s_defaultStatFileName = "x265_2pass.log";
@@ -163,18 +162,30 @@ static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcSt
if ((colorFormat != X265_CSP_I444) && bChroma)
{
ALIGN_VAR_8(pixel, pix[8 * 8]);
- primitives.luma_copy_pp[LUMA_8x8](pix, 8, src, srcStride);
- return acEnergyVar(curFrame, primitives.var[BLOCK_8x8](pix, 8), 6, bChroma);
+ primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
+ return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, bChroma);
}
else
- return acEnergyVar(curFrame, primitives.var[BLOCK_16x16](src, srcStride), 8, bChroma);
+ return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, bChroma);
+}
+
+/* Returns the zone for the current frame */
+x265_zone* RateControl::getZone()
+{
+ for (int i = m_param->rc.zoneCount - 1; i >= 0; i--)
+ {
+ x265_zone *z = &m_param->rc.zones[i];
+ if (m_framesDone + 1 >= z->startFrame && m_framesDone < z->endFrame)
+ return z;
+ }
+ return NULL;
}
/* Find the total AC energy of each block in all planes */
uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y)
{
- intptr_t stride = curFrame->m_origPicYuv->m_stride;
- intptr_t cStride = curFrame->m_origPicYuv->m_strideC;
+ intptr_t stride = curFrame->m_fencPic->m_stride;
+ intptr_t cStride = curFrame->m_fencPic->m_strideC;
intptr_t blockOffsetLuma = block_x + (block_y * stride);
int colorFormat = m_param->internalCsp;
int hShift = CHROMA_H_SHIFT(colorFormat);
@@ -183,9 +194,9 @@ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t blo
uint32_t var;
- var = acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
- var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
- var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
+ var = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
+ var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
+ var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
x265_emms();
return var;
}
@@ -193,8 +204,8 @@ uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t blo
void RateControl::calcAdaptiveQuantFrame(Frame *curFrame)
{
/* Actual adaptive quantization */
- int maxCol = curFrame->m_origPicYuv->m_picWidth;
- int maxRow = curFrame->m_origPicYuv->m_picHeight;
+ int maxCol = curFrame->m_fencPic->m_picWidth;
+ int maxRow = curFrame->m_fencPic->m_picHeight;
for (int y = 0; y < 3; y++)
{
@@ -323,6 +334,8 @@ RateControl::RateControl(x265_param *p)
m_bTerminated = false;
m_finalFrameCount = 0;
m_numEntries = 0;
+ m_amortizeFraction = 0.85;
+ m_amortizeFrames = 75;
if (m_param->rc.rateControlMode == X265_RC_CRF)
{
m_param->rc.qp = (int)m_param->rc.rfConstant;
@@ -360,10 +373,12 @@ RateControl::RateControl(x265_param *p)
m_rce2Pass = NULL;
// vbv initialization
- m_param->rc.vbvBufferSize = Clip3(0, 2000000, m_param->rc.vbvBufferSize);
- m_param->rc.vbvMaxBitrate = Clip3(0, 2000000, m_param->rc.vbvMaxBitrate);
- m_param->rc.vbvBufferInit = Clip3(0.0, 2000000.0, m_param->rc.vbvBufferInit);
+ m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
+ m_param->rc.vbvMaxBitrate = x265_clip3(0, 2000000, m_param->rc.vbvMaxBitrate);
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 2000000.0, m_param->rc.vbvBufferInit);
m_singleFrameVbv = 0;
+ m_rateTolerance = 1.0;
+
if (m_param->rc.vbvBufferSize)
{
if (m_param->rc.rateControlMode == X265_RC_CQP)
@@ -403,8 +418,20 @@ RateControl::RateControl(x265_param *p)
x265_log(m_param, X265_LOG_WARNING, "NAL HRD parameters require VBV parameters, ignored\n");
m_param->bEmitHRDSEI = 0;
}
-
m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && !m_2pass && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
+ if (m_param->rc.bStrictCbr && !m_isCbr)
+ {
+ x265_log(m_param, X265_LOG_WARNING, "strict CBR set without CBR mode, ignored\n");
+ m_param->rc.bStrictCbr = 0;
+ }
+ if (m_param->totalFrames <= 2 * m_fps && m_param->rc.bStrictCbr) /* Strict CBR segment encode */
+ {
+ m_amortizeFraction = 0.85;
+ m_amortizeFrames = m_param->totalFrames / 2;
+ }
+ if(m_param->rc.bStrictCbr)
+ m_rateTolerance = 0.7;
+
m_leadingBframes = m_param->bframes;
m_bframeBits = 0;
m_leadingNoBSatd = 0;
@@ -423,8 +450,8 @@ RateControl::RateControl(x265_param *p)
if (m_qp && !m_param->bLossless)
{
m_qpConstant[P_SLICE] = m_qp;
- m_qpConstant[I_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_qp - m_ipOffset + 0.5));
- m_qpConstant[B_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_qp + m_pbOffset + 0.5));
+ m_qpConstant[I_SLICE] = x265_clip3(QP_MIN, QP_MAX_MAX, (int)(m_qp - m_ipOffset + 0.5));
+ m_qpConstant[B_SLICE] = x265_clip3(QP_MIN, QP_MAX_MAX, (int)(m_qp + m_pbOffset + 0.5));
}
else
{
@@ -466,12 +493,13 @@ bool RateControl::init(const SPS *sps)
m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
if (m_param->rc.vbvBufferInit > 1.)
- m_param->rc.vbvBufferInit = Clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
- m_param->rc.vbvBufferInit = Clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
}
m_totalBits = 0;
+ m_encodedBits = 0;
m_framesDone = 0;
m_residualCost = 0;
m_partialResidualCost = 0;
@@ -494,12 +522,12 @@ bool RateControl::init(const SPS *sps)
/* Frame Predictors and Row predictors used in vbv */
for (int i = 0; i < 5; i++)
{
- m_pred[i].coeff = 2.0;
+ m_pred[i].coeff = 1.5;
m_pred[i].count = 1.0;
m_pred[i].decay = 0.5;
m_pred[i].offset = 0.0;
}
- m_predBfromP = m_pred[0];
+ m_pred[0].coeff = 1.0;
if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
{
/* If the user hasn't defined the stat filename, use the default value */
@@ -732,10 +760,10 @@ void RateControl::initHRD(SPS *sps)
hrd->cbrFlag = m_isCbr;
// normalize HRD size and rate to the value / scale notation
- hrd->bitRateScale = Clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT);
+ hrd->bitRateScale = x265_clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT);
hrd->bitRateValue = (vbvMaxBitrate >> (hrd->bitRateScale + BR_SHIFT));
- hrd->cpbSizeScale = Clip3(0, 15, calcScale(vbvBufferSize) - CPB_SHIFT);
+ hrd->cpbSizeScale = x265_clip3(0, 15, calcScale(vbvBufferSize) - CPB_SHIFT);
hrd->cpbSizeValue = (vbvBufferSize >> (hrd->cpbSizeScale + CPB_SHIFT));
int bitRateUnscale = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
int cpbSizeUnscale = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
@@ -748,9 +776,9 @@ void RateControl::initHRD(SPS *sps)
int maxDpbOutputDelay = (int)(sps->maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick);
int maxDelay = (int)(90000.0 * cpbSizeUnscale / bitRateUnscale + 0.5);
- hrd->initialCpbRemovalDelayLength = 2 + Clip3(4, 22, 32 - calcLength(maxDelay));
- hrd->cpbRemovalDelayLength = Clip3(4, 31, 32 - calcLength(maxCpbOutputDelay));
- hrd->dpbOutputDelayLength = Clip3(4, 31, 32 - calcLength(maxDpbOutputDelay));
+ hrd->initialCpbRemovalDelayLength = 2 + x265_clip3(4, 22, 32 - calcLength(maxDelay));
+ hrd->cpbRemovalDelayLength = x265_clip3(4, 31, 32 - calcLength(maxCpbOutputDelay));
+ hrd->dpbOutputDelayLength = x265_clip3(4, 31, 32 - calcLength(maxDpbOutputDelay));
#undef MAX_DURATION
}
@@ -1019,9 +1047,9 @@ int RateControl::rateControlSliceType(int frameNum)
* adaptive B-frames, but that would be complicated.
* So just calculate the average QP used so far. */
m_param->rc.qp = (m_accumPQp < 1) ? ABR_INIT_QP_MAX : (int)(m_accumPQp + 0.5);
- m_qpConstant[P_SLICE] = Clip3(0, QP_MAX_MAX, m_param->rc.qp);
- m_qpConstant[I_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_param->rc.qp - m_ipOffset + 0.5));
- m_qpConstant[B_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_param->rc.qp + m_pbOffset + 0.5));
+ m_qpConstant[P_SLICE] = x265_clip3(QP_MIN, QP_MAX_MAX, m_param->rc.qp);
+ m_qpConstant[I_SLICE] = x265_clip3(QP_MIN, QP_MAX_MAX, (int)(m_param->rc.qp - m_ipOffset + 0.5));
+ m_qpConstant[B_SLICE] = x265_clip3(QP_MIN, QP_MAX_MAX, (int)(m_param->rc.qp + m_pbOffset + 0.5));
x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", m_numEntries);
x265_log(m_param, X265_LOG_ERROR, "continuing anyway, at constant QP=%d\n", m_param->rc.qp);
@@ -1120,7 +1148,7 @@ int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encode
}
else
{
- /* 1.5 * MaxLumaSr * (AuCpbRemovalTime[ n ] - AyCpbRemovalTime[ n - 1 ]) ? MinCr */
+ /* 1.5 * MaxLumaSr * (AuCpbRemovalTime[ n ] - AuCpbRemovalTime[ n - 1 ]) / MinCr */
rce->frameSizeMaximum = 8 * 1.5 * enc->m_vps.ptl.maxLumaSrForLevel * m_frameDuration / mincr;
}
}
@@ -1134,7 +1162,7 @@ int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encode
rce->lastSatd = m_currentSatd;
}
double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce));
- q = Clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
+ q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
m_qp = int(q + 0.5);
rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = q;
/* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
@@ -1148,6 +1176,15 @@ int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encode
else
m_qp = m_qpConstant[m_sliceType];
curEncData.m_avgQpAq = curEncData.m_avgQpRc = m_qp;
+
+ x265_zone* zone = getZone();
+ if (zone)
+ {
+ if (zone->bForceQp)
+ m_qp += zone->qp - m_qpConstant[P_SLICE];
+ else
+ m_qp -= (int)(6.0 * X265_LOG2(zone->bitrateFactor));
+ }
}
if (m_sliceType != B_SLICE)
{
@@ -1157,8 +1194,8 @@ int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encode
rce->leadingNoBSatd = m_leadingNoBSatd;
if (curFrame->m_forceqp)
{
- m_qp = int32_t(curFrame->m_forceqp + 0.5) - 1;
- m_qp = Clip3(QP_MIN, QP_MAX_MAX, m_qp);
+ m_qp = (int32_t)(curFrame->m_forceqp + 0.5) - 1;
+ m_qp = x265_clip3(QP_MIN, QP_MAX_MAX, m_qp);
rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp;
}
// Do not increment m_startEndOrder here. Make rateControlEnd of previous thread
@@ -1218,7 +1255,7 @@ double RateControl::getDiffLimitedQScale(RateControlEntry *rce, double q)
{
double maxQscale = m_lastQScaleFor[rce->sliceType] * m_lstep;
double minQscale = m_lastQScaleFor[rce->sliceType] / m_lstep;
- q = Clip3(minQscale, maxQscale, q);
+ q = x265_clip3(minQscale, maxQscale, q);
}
m_lastQScaleFor[rce->sliceType] = q;
@@ -1237,6 +1274,14 @@ double RateControl::getDiffLimitedQScale(RateControlEntry *rce, double q)
m_accumPNorm = mask * (1 + m_accumPNorm);
}
+ x265_zone* zone = getZone();
+ if (zone)
+ {
+ if (zone->bForceQp)
+ q = x265_qp2qScale(zone->qp);
+ else
+ q /= zone->bitrateFactor;
+ }
return q;
}
@@ -1266,7 +1311,7 @@ bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int over)
{
fill += (m_frameDuration * m_vbvMaxRate -
qScale2bits(&m_rce2Pass[i], m_rce2Pass[i].newQScale)) * parity;
- fill = Clip3(0.0, m_bufferSize, fill);
+ fill = x265_clip3(0.0, m_bufferSize, fill);
fills[i] = fill;
if (fill <= bufferMin || i == 0)
{
@@ -1291,9 +1336,9 @@ bool RateControl::fixUnderflow(int t0, int t1, double adjustment, double qscaleM
for (int i = t0; i <= t1; i++)
{
qscaleOrig = m_rce2Pass[i].newQScale;
- qscaleOrig = Clip3(qscaleMin, qscaleMax, qscaleOrig);
+ qscaleOrig = x265_clip3(qscaleMin, qscaleMax, qscaleOrig);
qscaleNew = qscaleOrig * adjustment;
- qscaleNew = Clip3(qscaleMin, qscaleMax, qscaleNew);
+ qscaleNew = x265_clip3(qscaleMin, qscaleMax, qscaleNew);
m_rce2Pass[i].newQScale = qscaleNew;
adjusted = adjusted || (qscaleNew != qscaleOrig);
}
@@ -1343,6 +1388,33 @@ fail:
return false;
}
+double RateControl::tuneAbrQScaleFromFeedback(double qScale)
+{
+ double abrBuffer = 2 * m_rateTolerance * m_bitrate;
+ if (m_currentSatd)
+ {
+ /* use framesDone instead of POC as poc count is not serial with bframes enabled */
+ double overflow = 1.0;
+ double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
+ double wantedBits = timeDone * m_bitrate;
+ int64_t encodedBits = m_totalBits;
+ if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
+ {
+ abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
+ encodedBits = m_encodedBits;
+ }
+
+ if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames ||
+ m_param->rc.bStrictCbr))
+ {
+ abrBuffer *= X265_MAX(1, sqrt(timeDone));
+ overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) / abrBuffer);
+ qScale *= overflow;
+ }
+ }
+ return qScale;
+}
+
double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
{
double q;
@@ -1415,17 +1487,25 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
q += m_pbOffset / 2;
else
q += m_pbOffset;
- rce->qpNoVbv = q;
- double qScale = x265_qp2qScale(q);
- if (!m_2pass && m_isVbv)
+ double qScale = x265_qp2qScale(q);
+ if (m_isCbr)
{
- if (m_leadingBframes > 5)
+ qScale = tuneAbrQScaleFromFeedback(qScale);
+ if (!m_isAbrReset)
{
- qScale = clipQscale(curFrame, rce, qScale);
- m_lastQScaleFor[m_sliceType] = qScale;
+ double lmin = m_lastQScaleFor[P_SLICE] / m_lstep;
+ double lmax = m_lastQScaleFor[P_SLICE] * m_lstep;
+ qScale = x265_clip3(lmin, lmax, qScale);
}
- rce->frameSizePlanned = predictSize(&m_predBfromP, qScale, (double)m_leadingNoBSatd);
+ q = x265_qScale2qp(qScale);
+ }
+ rce->qpNoVbv = q;
+ if (!m_2pass && m_isVbv)
+ {
+ qScale = clipQscale(curFrame, rce, qScale);
+ m_lastQScaleFor[m_sliceType] = qScale;
+ rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], qScale, (double)m_currentSatd);
}
else if (m_2pass && m_isVbv)
{
@@ -1440,7 +1520,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
}
else
{
- double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate;
+ double abrBuffer = 2 * m_rateTolerance * m_bitrate;
if (m_2pass)
{
int64_t diff;
@@ -1462,13 +1542,13 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
}
diff = m_predictedBits - (int64_t)rce->expectedBits;
q = rce->newQScale;
- q /= Clip3(0.5, 2.0, (double)(abrBuffer - diff) / abrBuffer);
+ q /= x265_clip3(0.5, 2.0, (double)(abrBuffer - diff) / abrBuffer);
if (m_expectedBitsSum > 0)
{
/* Adjust quant based on the difference between
* achieved and expected bitrate so far */
double curTime = (double)rce->encodeOrder / m_numEntries;
- double w = Clip3(0.0, 1.0, curTime * 100);
+ double w = x265_clip3(0.0, 1.0, curTime * 100);
q *= pow((double)m_totalBits / m_expectedBitsSum, w);
}
rce->qpNoVbv = x265_qScale2qp(q);
@@ -1492,7 +1572,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
expectedVbv = m_bufferFill + m_bufferRate - expectedSize;
}
}
- q = Clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
+ q = x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
}
else
{
@@ -1508,7 +1588,7 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
* tradeoff between quality and bitrate precision. But at large
* tolerances, the bit distribution approaches that of 2pass. */
- double wantedBits, overflow = 1;
+ double overflow = 1;
m_shortTermCplxSum *= 0.5;
m_shortTermCplxCount *= 0.5;
@@ -1528,29 +1608,15 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
{
if (!m_param->rc.bStatRead)
checkAndResetABR(rce, false);
- q = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
-
- /* ABR code can potentially be counterproductive in CBR, so just
- * don't bother. Don't run it if the frame complexity is zero
- * either. */
- if (!m_isCbr && m_currentSatd)
- {
- /* use framesDone instead of POC as poc count is not serial with bframes enabled */
- double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
- wantedBits = timeDone * m_bitrate;
- if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
- {
- abrBuffer *= X265_MAX(1, sqrt(timeDone));
- overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
- q *= overflow;
- }
- }
+ double initialQScale = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
+ q = tuneAbrQScaleFromFeedback(initialQScale);
+ overflow = q / initialQScale;
}
-
if (m_sliceType == I_SLICE && m_param->keyframeMax > 1
&& m_lastNonBPictType != I_SLICE && !m_isAbrReset)
{
- q = x265_qp2qScale(m_accumPQp / m_accumPNorm);
+ if (!m_param->rc.bStrictCbr)
+ q = x265_qp2qScale(m_accumPQp / m_accumPNorm);
q /= fabs(m_param->rc.ipFactor);
}
else if (m_framesDone > 0)
@@ -1567,20 +1633,20 @@ double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
else if (overflow < 0.9)
lqmin /= m_lstep;
}
- q = Clip3(lqmin, lqmax, q);
+ q = x265_clip3(lqmin, lqmax, q);
}
}
else if (m_qCompress != 1 && m_param->rc.rateControlMode == X265_RC_CRF)
{
q = x265_qp2qScale(CRF_INIT_QP) / fabs(m_param->rc.ipFactor);
}
- else if (m_framesDone == 0 && !m_isVbv)
+ else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR)
{
/* for ABR alone, clip the first I frame qp */
double lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep;
q = X265_MIN(lqmax, q);
}
- q = Clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
+ q = x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
rce->qpNoVbv = x265_qScale2qp(q);
q = clipQscale(curFrame, rce, q);
}
@@ -1615,8 +1681,8 @@ void RateControl::rateControlUpdateStats(RateControlEntry* rce)
if (m_partialResidualFrames)
rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames;
- m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
- m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames);
+ m_partialResidualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax);
+ m_partialResidualCost = (int)((rce->rowTotalBits * m_amortizeFraction) /m_partialResidualFrames);
rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames;
}
else if (m_partialResidualFrames)
@@ -1643,7 +1709,7 @@ void RateControl::rateControlUpdateStats(RateControlEntry* rce)
void RateControl::checkAndResetABR(RateControlEntry* rce, bool isFrameDone)
{
- double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate;
+ double abrBuffer = 2 * m_rateTolerance * m_bitrate;
// Check if current Slice is a scene cut that follows low detailed/blank frames
if (rce->lastSatd > 4 * rce->movingAvgSum)
@@ -1725,13 +1791,10 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
{
double frameQ[3];
double curBits;
- if (m_sliceType == B_SLICE)
- curBits = predictSize(&m_predBfromP, q, (double)m_currentSatd);
- else
- curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
+ curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
double bufferFillCur = m_bufferFill - curBits;
double targetFill;
- double totalDuration = 0;
+ double totalDuration = m_frameDuration;
frameQ[P_SLICE] = m_sliceType == I_SLICE ? q * m_param->rc.ipFactor : (m_sliceType == B_SLICE ? q / m_param->rc.pbFactor : q);
frameQ[B_SLICE] = frameQ[P_SLICE] * m_param->rc.pbFactor;
frameQ[I_SLICE] = frameQ[P_SLICE] / m_param->rc.ipFactor;
@@ -1739,7 +1802,7 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
for (int j = 0; bufferFillCur >= 0; j++)
{
int type = curFrame->m_lowres.plannedType[j];
- if (type == X265_TYPE_AUTO)
+ if (type == X265_TYPE_AUTO || totalDuration >= 1.0)
break;
totalDuration += m_frameDuration;
double wantedFrameSize = m_vbvMaxRate * m_frameDuration;
@@ -1752,15 +1815,20 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
}
/* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
- targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * 0.5);
+ double finalDur = 1;
+ if (m_param->rc.bStrictCbr)
+ {
+ finalDur = x265_clip3(0.4, 1.0, totalDuration);
+ }
+ targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5 , m_bufferSize * (1 - 0.5 * finalDur));
if (bufferFillCur < targetFill)
{
q *= 1.01;
loopTerminate |= 1;
continue;
}
- /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
- targetFill = Clip3(m_bufferSize * 0.8, m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
+ /* Try to get the buffer not more than 80% filled, but don't set an impossible goal. */
+ targetFill = x265_clip3(m_bufferSize * (1 - 0.2 * finalDur), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
if (m_isCbr && bufferFillCur > targetFill)
{
q /= 1.01;
@@ -1778,7 +1846,7 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
(m_sliceType == I_SLICE && m_lastNonBPictType == I_SLICE)) &&
m_bufferFill / m_bufferSize < 0.5)
{
- q /= Clip3(0.5, 1.0, 2.0 * m_bufferFill / m_bufferSize);
+ q /= x265_clip3(0.5, 1.0, 2.0 * m_bufferFill / m_bufferSize);
}
// Now a hard threshold to make sure the frame fits in VBV.
// This one is mostly for I-frames.
@@ -1794,7 +1862,7 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
{
double qf = 1.0;
if (bits > m_bufferFill / maxFillFactor)
- qf = Clip3(0.2, 1.0, m_bufferFill / (maxFillFactor * bits));
+ qf = x265_clip3(0.2, 1.0, m_bufferFill / (maxFillFactor * bits));
q /= qf;
bits *= qf;
if (bits < m_bufferRate / minFillFactor)
@@ -1810,25 +1878,6 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
if (pbits > rce->frameSizeMaximum)
q *= pbits / rce->frameSizeMaximum;
- // Check B-frame complexity, and use up any bits that would
- // overflow before the next P-frame.
- if (m_leadingBframes <= 5 && m_sliceType == P_SLICE && !m_singleFrameVbv)
- {
- int nb = m_leadingBframes;
- double bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd);
- double bbits = predictSize(&m_predBfromP, q * m_param->rc.pbFactor, (double)m_currentSatd);
- double space;
- if (bbits > m_bufferRate)
- nb = 0;
- double pbbits = nb * bbits;
-
- space = m_bufferFill + (1 + nb) * m_bufferRate - m_bufferSize;
- if (pbbits < space)
- q *= X265_MAX(pbbits / space, bits / (0.5 * m_bufferSize));
-
- q = X265_MAX(q0 / 2, q);
- }
-
if (!m_isCbr || (m_isAbr && m_currentSatd >= rce->movingAvgSum && q <= q0 / 2))
q = X265_MAX(q0, q);
@@ -1836,7 +1885,7 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
{
double qpNoVbv = x265_qScale2qp(q0);
double qmax = X265_MIN(MAX_MAX_QPSCALE,x265_qp2qScale(qpNoVbv + m_rateFactorMaxIncrement));
- return Clip3(MIN_QPSCALE, qmax, q);
+ return x265_clip3(MIN_QPSCALE, qmax, q);
}
}
if (m_2pass)
@@ -1848,7 +1897,7 @@ double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
q = q*(max - min) + min;
return exp(q);
}
- return Clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
+ return x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
}
double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, double qpVbv, int32_t& encodedBitsSoFar)
@@ -1899,22 +1948,24 @@ double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, d
&& refQScale > 0
&& refRowSatdCost > 0)
{
- if (abs(int32_t(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2)
+ if (abs((int32_t)(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2)
{
double predTotal = refRowBits * satdCostForPendingCus / refRowSatdCost * refQScale / qScale;
- totalSatdBits += int32_t((pred_s + predTotal) * 0.5);
+ totalSatdBits += (int32_t)((pred_s + predTotal) * 0.5);
continue;
}
}
- totalSatdBits += int32_t(pred_s);
+ totalSatdBits += (int32_t)pred_s;
}
- else
+ else if (picType == P_SLICE)
{
/* Our QP is lower than the reference! */
double pred_intra = predictSize(rce->rowPred[1], qScale, intraCost);
/* Sum: better to overestimate than underestimate by using only one of the two predictors. */
- totalSatdBits += int32_t(pred_intra + pred_s);
+ totalSatdBits += (int32_t)(pred_intra + pred_s);
}
+ else
+ totalSatdBits += (int32_t)pred_s;
}
}
@@ -1969,16 +2020,8 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo
if (row < sps.numCuInHeight - 1)
{
- /* B-frames shouldn't use lower QP than their reference frames. */
- if (rce->sliceType == B_SLICE)
- {
- Frame* refSlice1 = curEncData.m_slice->m_refPicList[0][0];
- Frame* refSlice2 = curEncData.m_slice->m_refPicList[1][0];
- qpMin = X265_MAX(qpMin, X265_MAX(refSlice1->m_encData->m_rowStat[row].diagQp, refSlice2->m_encData->m_rowStat[row].diagQp));
- qpVbv = X265_MAX(qpVbv, qpMin);
- }
/* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
- double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_param->rc.rateTolerance;
+ double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_rateTolerance;
int32_t encodedBitsSoFar = 0;
double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
@@ -1988,28 +2031,55 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo
if (encodedBitsSoFar < 0.05f * rce->frameSizePlanned)
qpMax = qpAbsoluteMax = prevRowQp;
- if (rce->sliceType != I_SLICE)
+ if (rce->sliceType != I_SLICE || (m_param->rc.bStrictCbr && rce->poc > 0))
rcTol *= 0.5;
if (!m_isCbr)
qpMin = X265_MAX(qpMin, rce->qpNoVbv);
+ double totalBitsNeeded = m_wantedBitsWindow;
+ if (m_param->totalFrames)
+ totalBitsNeeded = (m_param->totalFrames * m_bitrate) / m_fps;
+ double abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
+
while (qpVbv < qpMax
- && ((accFrameBits > rce->frameSizePlanned + rcTol) ||
+ && (((accFrameBits > rce->frameSizePlanned + rcTol) ||
(rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) ||
- (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv)))
+ (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv))
+ && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot > 0.1)))
{
qpVbv += stepSize;
accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
+ abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
}
while (qpVbv > qpMin
&& (qpVbv > curEncData.m_rowStat[0].diagQp || m_singleFrameVbv)
- && ((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
- || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1))
+ && (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
+ || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
+ && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
{
qpVbv -= stepSize;
accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
+ abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
+ }
+
+ if (m_param->rc.bStrictCbr && m_param->totalFrames)
+ {
+ double timeDone = (double)(m_framesDone) / m_param->totalFrames;
+ while (qpVbv < qpMax && (qpVbv < rce->qpNoVbv + (m_param->rc.qpStep * timeDone)) &&
+ (timeDone > 0.75 && abrOvershoot > 0))
+ {
+ qpVbv += stepSize;
+ accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
+ abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
+ }
+ if (qpVbv > curEncData.m_rowStat[0].diagQp &&
+ abrOvershoot < -0.1 && timeDone > 0.5 && accFrameBits < rce->frameSizePlanned - rcTol)
+ {
+ qpVbv -= stepSize;
+ accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
+ }
}
/* avoid VBV underflow or MinCr violation */
@@ -2027,7 +2097,7 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo
if (qpVbv > qpMax && prevRowQp < qpMax && canReencodeRow)
{
/* Bump QP to halfway in between... close enough. */
- qpVbv = Clip3(prevRowQp + 1.0f, qpMax, (prevRowQp + qpVbv) * 0.5);
+ qpVbv = x265_clip3(prevRowQp + 1.0f, qpMax, (prevRowQp + qpVbv) * 0.5);
return -1;
}
@@ -2035,7 +2105,7 @@ int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateCo
{
if (qpVbv < qpMin && prevRowQp > qpMin && canReencodeRow)
{
- qpVbv = Clip3(qpMin, prevRowQp, (prevRowQp + qpVbv) * 0.5);
+ qpVbv = x265_clip3(qpMin, prevRowQp, (prevRowQp + qpVbv) * 0.5);
return -1;
}
}
@@ -2078,6 +2148,15 @@ double RateControl::getQScale(RateControlEntry *rce, double rateFactor)
m_lastRceq = q;
q /= rateFactor;
}
+
+ x265_zone* zone = getZone();
+ if (zone)
+ {
+ if (zone->bForceQp)
+ q = x265_qp2qScale(zone->qp);
+ else
+ q /= zone->bitrateFactor;
+ }
return q;
}
@@ -2085,10 +2164,10 @@ void RateControl::updatePredictor(Predictor *p, double q, double var, double bit
{
if (var < 10)
return;
- const double range = 1.5;
+ const double range = 2;
double old_coeff = p->coeff / p->count;
double new_coeff = bits * q / var;
- double new_coeff_clipped = Clip3(old_coeff / range, old_coeff * range, new_coeff);
+ double new_coeff_clipped = x265_clip3(old_coeff / range, old_coeff * range, new_coeff);
double new_offset = bits * q - new_coeff_clipped * var;
if (new_offset >= 0)
new_coeff = new_coeff_clipped;
@@ -2220,8 +2299,8 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
/* previous I still had a residual; roll it into the new loan */
if (m_residualFrames)
bits += m_residualCost * m_residualFrames;
- m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
- m_residualCost = (int)((bits * s_amortizeFraction) / m_residualFrames);
+ m_residualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax);
+ m_residualCost = (int)((bits * m_amortizeFraction) / m_residualFrames);
bits -= m_residualCost * m_residualFrames;
}
else if (m_residualFrames)
@@ -2244,6 +2323,7 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
}
m_wantedBitsWindow += m_frameDuration * m_bitrate;
m_totalBits += bits - rce->rowTotalBits;
+ m_encodedBits += actualBits;
int pos = m_sliderPos - m_param->frameNumThreads;
if (pos >= 0)
m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
@@ -2257,16 +2337,6 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
if (m_isVbv)
{
- if (rce->sliceType == B_SLICE)
- {
- m_bframeBits += actualBits;
- if (rce->bLastMiniGopBFrame)
- {
- if (rce->bframes != 0)
- updatePredictor(&m_predBfromP, x265_qp2qScale(rce->qpaRc), (double)rce->leadingNoBSatd, (double)m_bframeBits / rce->bframes);
- m_bframeBits = 0;
- }
- }
updateVbv(actualBits, rce);
if (m_param->bEmitHRDSEI)
@@ -2295,9 +2365,9 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
rce->hrdTiming->dpbOutputTime = (double)rce->picTimingSEI->m_picDpbOutputDelay * time->numUnitsInTick / time->timeScale + rce->hrdTiming->cpbRemovalTime;
}
}
+ rce->isActive = false;
// Allow rateControlStart of next frame only when rateControlEnd of previous frame is over
m_startEndOrder.incr();
- rce->isActive = false;
return 0;
writeFailure:
@@ -2378,5 +2448,7 @@ void RateControl::destroy()
X265_FREE(m_rce2Pass);
for (int i = 0; i < 2; i++)
X265_FREE(m_cuTreeStats.qpBuffer[i]);
+
+ X265_FREE(m_param->rc.zones);
}
diff --git a/source/encoder/ratecontrol.h b/source/encoder/ratecontrol.h
index 5b86147..761b3e7 100644
--- a/source/encoder/ratecontrol.h
+++ b/source/encoder/ratecontrol.h
@@ -42,7 +42,7 @@ class SEIBufferingPeriod;
#define MAX_FRAME_DURATION 1.00
#define MIN_FRAME_DURATION 0.01
-#define CLIP_DURATION(f) Clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
+#define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
/* Current frame stats for 2 pass */
struct FrameStats
@@ -139,6 +139,7 @@ public:
bool m_isAbrReset;
int m_lastAbrResetPoc;
+ double m_rateTolerance;
double m_frameDuration; /* current frame duration in seconds */
double m_bitrate;
double m_rateFactorConstant;
@@ -173,8 +174,9 @@ public:
double m_shortTermCplxCount;
double m_lastRceq;
double m_qCompress;
- int64_t m_totalBits; /* total bits used for already encoded frames */
+ int64_t m_totalBits; /* total bits used for already encoded frames (after ammortization) */
int m_framesDone; /* # of frames passed through RateCotrol already */
+ int64_t m_encodedBits; /* bits used for encoded frames (without ammortization) */
double m_fps;
int64_t m_satdCostWindow[50];
int m_sliderPos;
@@ -233,11 +235,10 @@ public:
void initHRD(SPS* sps);
int rateControlSliceType(int frameNum);
bool cuTreeReadFor2Pass(Frame* curFrame);
+ double tuneAbrQScaleFromFeedback(double qScale);
protected:
- static const double s_amortizeFraction;
- static const int s_amortizeFrames;
static const int s_slidingWindowFrames;
static const char *s_defaultStatFileName;
@@ -245,7 +246,10 @@ protected:
int m_partialResidualFrames;
int m_residualCost;
int m_partialResidualCost;
+ int m_amortizeFrames;
+ double m_amortizeFraction;
+ x265_zone* getZone();
double getQScale(RateControlEntry *rce, double rateFactor);
double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
void accumPQpUpdate();
diff --git a/source/encoder/rdcost.h b/source/encoder/rdcost.h
index 10bfff3..55af1ad 100644
--- a/source/encoder/rdcost.h
+++ b/source/encoder/rdcost.h
@@ -37,30 +37,45 @@ public:
/* all weights and factors stored as FIX8 */
uint64_t m_lambda2;
uint64_t m_lambda;
- uint64_t m_cbDistortionWeight;
- uint64_t m_crDistortionWeight;
+ uint32_t m_chromaDistWeight[2];
+ uint32_t m_psyRdBase;
uint32_t m_psyRd;
int m_qp;
- void setPsyRdScale(double scale) { m_psyRd = (uint32_t)floor(256.0 * scale * 0.33); }
- void setCbDistortionWeight(uint16_t weightFix8) { m_cbDistortionWeight = weightFix8; }
- void setCrDistortionWeight(uint16_t weightFix8) { m_crDistortionWeight = weightFix8; }
+ void setPsyRdScale(double scale) { m_psyRdBase = (uint32_t)floor(65536.0 * scale * 0.33); }
void setQP(const Slice& slice, int qp)
{
m_qp = qp;
- setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
+ /* Scale PSY RD factor by a slice type factor */
+ static const uint32_t psyScaleFix8[3] = { 300, 256, 96 }; /* B, P, I */
+ m_psyRd = (m_psyRdBase * psyScaleFix8[slice.m_sliceType]) >> 8;
+
+ /* Scale PSY RD factor by QP, at high QP psy-rd can cause artifacts */
+ if (qp >= 40)
+ {
+ int scale = qp >= QP_MAX_SPEC ? 0 : (QP_MAX_SPEC - qp) * 23;
+ m_psyRd = (m_psyRd * scale) >> 8;
+ }
- int qpCb = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCbQpOffset);
+ int qpCb, qpCr;
+ setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
+ if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+ qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[0]]);
+ else
+ qpCb = X265_MIN(qp + slice.m_pps->chromaQpOffset[0], QP_MAX_SPEC);
int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
- setCbDistortionWeight(lambdaOffset);
+ m_chromaDistWeight[0] = lambdaOffset;
- int qpCr = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCrQpOffset);
+ if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+ qpCr = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice.m_pps->chromaQpOffset[0]]);
+ else
+ qpCr = X265_MIN(qp + slice.m_pps->chromaQpOffset[0], QP_MAX_SPEC);
chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
- setCrDistortionWeight(lambdaOffset);
+ m_chromaDistWeight[1] = lambdaOffset;
}
void setLambda(double lambda2, double lambda)
@@ -72,47 +87,40 @@ public:
inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const
{
X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
- "calcRdCost wrap detected dist: %d, bits %d, lambda: %d\n", distortion, bits, (int)m_lambda2);
+ "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
return distortion + ((bits * m_lambda2 + 128) >> 8);
}
/* return the difference in energy between the source block and the recon block */
- inline int psyCost(int size, pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) const
+ inline int psyCost(int size, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) const
{
- return primitives.psy_cost_pp[size](source, sstride, recon, rstride);
+ return primitives.cu[size].psy_cost_pp(source, sstride, recon, rstride);
}
/* return the difference in energy between the source block and the recon block */
- inline int psyCost(int size, int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) const
+ inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
{
- return primitives.psy_cost_ss[size](source, sstride, recon, rstride);
+ return primitives.cu[size].psy_cost_ss(source, sstride, recon, rstride);
}
/* return the RD cost of this prediction, including the effect of psy-rd */
inline uint64_t calcPsyRdCost(uint32_t distortion, uint32_t bits, uint32_t psycost) const
{
- return distortion + ((m_lambda * m_psyRd * psycost) >> 16) + ((bits * m_lambda2) >> 8);
+ return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
}
inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
{
X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
- "calcRdSADCost wrap detected dist: %d, bits %d, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
+ "calcRdSADCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
return sadCost + ((bits * m_lambda + 128) >> 8);
}
- inline uint32_t scaleChromaDistCb(uint32_t dist) const
- {
- X265_CHECK(dist <= (UINT64_MAX - 128) / m_cbDistortionWeight,
- "scaleChromaDistCb wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_cbDistortionWeight);
- return (uint32_t)(((dist * m_cbDistortionWeight) + 128) >> 8);
- }
-
- inline uint32_t scaleChromaDistCr(uint32_t dist) const
+ inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
{
- X265_CHECK(dist <= (UINT64_MAX - 128) / m_crDistortionWeight,
- "scaleChromaDistCr wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_crDistortionWeight);
- return (uint32_t)(((dist * m_crDistortionWeight) + 128) >> 8);
+ X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
+ "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
+ return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
}
inline uint32_t getCost(uint32_t bits) const
diff --git a/source/encoder/reference.cpp b/source/encoder/reference.cpp
index 958042b..425174c 100644
--- a/source/encoder/reference.cpp
+++ b/source/encoder/reference.cpp
@@ -33,86 +33,142 @@ using namespace x265;
MotionReference::MotionReference()
{
- m_weightBuffer = NULL;
+ weightBuffer[0] = NULL;
+ weightBuffer[1] = NULL;
+ weightBuffer[2] = NULL;
}
-int MotionReference::init(PicYuv* recPic, WeightParam *w)
+MotionReference::~MotionReference()
+{
+ X265_FREE(weightBuffer[0]);
+ X265_FREE(weightBuffer[1]);
+ X265_FREE(weightBuffer[2]);
+}
+
+int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p)
{
- m_reconPic = recPic;
+ reconPic = recPic;
+ numWeightedRows = 0;
lumaStride = recPic->m_stride;
- intptr_t startpad = recPic->m_lumaMarginY * lumaStride + recPic->m_lumaMarginX;
+ chromaStride = recPic->m_strideC;
+ numInterpPlanes = p.subpelRefine > 2 ? 3 : 1; /* is chroma satd possible? */
- /* directly reference the pre-extended integer pel plane */
- fpelPlane = recPic->m_picBuf[0] + startpad;
+ /* directly reference the extended integer pel planes */
+ fpelPlane[0] = recPic->m_picOrg[0];
+ fpelPlane[1] = recPic->m_picOrg[1];
+ fpelPlane[2] = recPic->m_picOrg[2];
isWeighted = false;
- if (w)
+ if (wp)
{
- if (!m_weightBuffer)
+ uint32_t numCUinHeight = (reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
+
+ int marginX = reconPic->m_lumaMarginX;
+ int marginY = reconPic->m_lumaMarginY;
+ intptr_t stride = reconPic->m_stride;
+ int cuHeight = g_maxCUSize;
+
+ for (int c = 0; c < numInterpPlanes; c++)
{
- uint32_t numCUinHeight = (recPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
- size_t padheight = (numCUinHeight * g_maxCUSize) + recPic->m_lumaMarginY * 2;
- m_weightBuffer = X265_MALLOC(pixel, lumaStride * padheight);
- if (!m_weightBuffer)
- return -1;
+ if (c == 1)
+ {
+ marginX = reconPic->m_chromaMarginX;
+ marginY = reconPic->m_chromaMarginY;
+ stride = reconPic->m_strideC;
+ cuHeight >>= reconPic->m_vChromaShift;
+ }
+
+ if (wp[c].bPresentFlag)
+ {
+ if (!weightBuffer[c])
+ {
+ size_t padheight = (numCUinHeight * cuHeight) + marginY * 2;
+ weightBuffer[c] = X265_MALLOC(pixel, stride * padheight);
+ if (!weightBuffer[c])
+ return -1;
+ }
+
+ /* use our buffer which will have weighted pixels written to it */
+ fpelPlane[c] = weightBuffer[c] + marginY * stride + marginX;
+ X265_CHECK(recPic->m_picOrg[c] - recPic->m_picBuf[c] == marginY * stride + marginX, "PicYuv pad calculation mismatch\n");
+
+ w[c].weight = wp[c].inputWeight;
+ w[c].offset = wp[c].inputOffset * (1 << (X265_DEPTH - 8));
+ w[c].shift = wp[c].log2WeightDenom;
+ w[c].round = w[c].shift ? 1 << (w[c].shift - 1) : 0;
+ }
}
isWeighted = true;
- weight = w->inputWeight;
- offset = w->inputOffset * (1 << (X265_DEPTH - 8));
- shift = w->log2WeightDenom;
- round = shift ? 1 << (shift - 1) : 0;
- m_numWeightedRows = 0;
-
- /* use our buffer which will have weighted pixels written to it */
- fpelPlane = m_weightBuffer + startpad;
}
return 0;
}
-MotionReference::~MotionReference()
-{
- X265_FREE(m_weightBuffer);
-}
-
-void MotionReference::applyWeight(int rows, int numRows)
+void MotionReference::applyWeight(int finishedRows, int maxNumRows)
{
- rows = X265_MIN(rows, numRows);
- if (m_numWeightedRows >= rows)
+ finishedRows = X265_MIN(finishedRows, maxNumRows);
+ if (numWeightedRows >= finishedRows)
return;
- int marginX = m_reconPic->m_lumaMarginX;
- int marginY = m_reconPic->m_lumaMarginY;
- pixel* src = (pixel*)m_reconPic->m_picOrg[0] + (m_numWeightedRows * (int)g_maxCUSize * lumaStride);
- pixel* dst = fpelPlane + ((m_numWeightedRows * (int)g_maxCUSize) * lumaStride);
- int width = m_reconPic->m_picWidth;
- int height = ((rows - m_numWeightedRows) * g_maxCUSize);
- if (rows == numRows)
- height = ((m_reconPic->m_picHeight % g_maxCUSize) ? (m_reconPic->m_picHeight % g_maxCUSize) : g_maxCUSize);
-
- // Computing weighted CU rows
- int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
- int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
- primitives.weight_pp(src, dst, lumaStride, padwidth, height,
- weight, round << correction, shift + correction, offset);
-
- // Extending Left & Right
- primitives.extendRowBorder(dst, lumaStride, width, height, marginX);
-
- // Extending Above
- if (m_numWeightedRows == 0)
+
+ int marginX = reconPic->m_lumaMarginX;
+ int marginY = reconPic->m_lumaMarginY;
+ intptr_t stride = reconPic->m_stride;
+ int width = reconPic->m_picWidth;
+ int height = (finishedRows - numWeightedRows) * g_maxCUSize;
+ if (finishedRows == maxNumRows && (reconPic->m_picHeight % g_maxCUSize))
{
- pixel *pixY = fpelPlane - marginX;
- for (int y = 0; y < marginY; y++)
- memcpy(pixY - (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel));
+ /* the last row may be partial height */
+ height -= g_maxCUSize;
+ height += reconPic->m_picHeight % g_maxCUSize;
}
+ int cuHeight = g_maxCUSize;
- // Extending Bottom
- if (rows == numRows)
+ for (int c = 0; c < numInterpPlanes; c++)
{
- pixel *pixY = fpelPlane - marginX + (m_reconPic->m_picHeight - 1) * lumaStride;
- for (int y = 0; y < marginY; y++)
- memcpy(pixY + (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel));
+ if (c == 1)
+ {
+ marginX = reconPic->m_chromaMarginX;
+ marginY = reconPic->m_chromaMarginY;
+ stride = reconPic->m_strideC;
+ width >>= reconPic->m_hChromaShift;
+ height >>= reconPic->m_vChromaShift;
+ cuHeight >>= reconPic->m_vChromaShift;
+ }
+
+ /* Do not generate weighted predictions if using original picture */
+ if (fpelPlane[c] == reconPic->m_picOrg[c])
+ continue;
+
+ const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
+ pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
+
+ // Computing weighted CU rows
+ int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+ int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
+ primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
+
+ // Extending Left & Right
+ primitives.extendRowBorder(dst, stride, width, height, marginX);
+
+ // Extending Above
+ if (numWeightedRows == 0)
+ {
+ pixel *pixY = fpelPlane[c] - marginX;
+ for (int y = 0; y < marginY; y++)
+ memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
+ }
+
+ // Extending Bottom
+ if (finishedRows == maxNumRows)
+ {
+ int picHeight = reconPic->m_picHeight;
+ if (c) picHeight >>= reconPic->m_vChromaShift;
+ pixel *pixY = fpelPlane[c] - marginX + (picHeight - 1) * stride;
+ for (int y = 0; y < marginY; y++)
+ memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
+ }
}
- m_numWeightedRows = rows;
+
+ numWeightedRows = finishedRows;
}
diff --git a/source/encoder/reference.h b/source/encoder/reference.h
index 3fb9afd..6b33499 100644
--- a/source/encoder/reference.h
+++ b/source/encoder/reference.h
@@ -25,13 +25,13 @@
#define X265_REFERENCE_H
#include "primitives.h"
+#include "picyuv.h"
#include "lowres.h"
#include "mv.h"
namespace x265 {
// private x265 namespace
-class PicYuv;
struct WeightParam;
class MotionReference : public ReferencePlanes
@@ -40,12 +40,12 @@ public:
MotionReference();
~MotionReference();
- int init(PicYuv*, WeightParam* w = NULL);
+ int init(PicYuv*, WeightParam* wp, const x265_param& p);
void applyWeight(int rows, int numRows);
- PicYuv* m_reconPic;
- pixel* m_weightBuffer;
- int m_numWeightedRows;
+ pixel* weightBuffer[3];
+ int numInterpPlanes;
+ int numWeightedRows;
protected:
diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp
index 1179fe0..3a7c7f2 100644
--- a/source/encoder/sao.cpp
+++ b/source/encoder/sao.cpp
@@ -3,6 +3,7 @@
*
* Authors: Steve Borho <steve at borho.org>
* Min Chen <chenm003 at 163.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -36,7 +37,7 @@ inline int32_t roundIBDI(int32_t num, int32_t den)
}
/* get the sign of input variable (TODO: this is a dup, make common) */
-inline int signOf(int x)
+inline int8_t signOf(int x)
{
return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
}
@@ -73,7 +74,6 @@ SAO::SAO()
m_param = NULL;
m_clipTable = NULL;
m_clipTableBase = NULL;
- m_offsetBo = NULL;
m_tmpU1[0] = NULL;
m_tmpU1[1] = NULL;
m_tmpU1[2] = NULL;
@@ -107,15 +107,17 @@ bool SAO::create(x265_param* param)
int numCtu = m_numCuInWidth * m_numCuInHeight;
CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt);
- CHECKED_MALLOC(m_offsetBo, pixel, maxY + 2 * rangeExt);
CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
for (int i = 0; i < 3; i++)
{
- CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth);
- CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth);
+ // SAO asm code will read 1 pixel before and after, so pad by 2
+ CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
+ m_tmpU1[i] += 1;
+ CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
+ m_tmpU2[i] += 1;
}
CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
@@ -145,15 +147,14 @@ fail:
void SAO::destroy()
{
X265_FREE(m_clipTableBase);
- X265_FREE(m_offsetBo);
X265_FREE(m_tmpL1);
X265_FREE(m_tmpL2);
for (int i = 0; i < 3; i++)
{
- X265_FREE(m_tmpU1[i]);
- X265_FREE(m_tmpU2[i]);
+ if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
+ if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
}
X265_FREE(m_count);
@@ -176,8 +177,11 @@ void SAO::allocSaoParam(SAOParam* saoParam) const
void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
{
Slice* slice = frame->m_encData->m_slice;
-
- int qpCb = Clip3(0, QP_MAX_MAX, qp + slice->m_pps->chromaCbQpOffset);
+ int qpCb = qp;
+ if (m_param->internalCsp == X265_CSP_I420)
+ qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
+ else
+ qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
m_lumaLambda = x265_lambda2_tab[qp];
m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
m_frame = frame;
@@ -225,8 +229,8 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
{
int x, y;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
- pixel* rec = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
- intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+ pixel* rec = m_frame->m_reconPic->getPlaneAddr(plane, addr);
+ intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
@@ -254,8 +258,8 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
pixel* tmpL;
pixel* tmpU;
- int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
- int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
+ int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
+ int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
{
const pixel* recR = &rec[ctuWidth - 1];
@@ -325,21 +329,34 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
if (!tpely)
rec += stride;
- for (x = 0; x < ctuWidth; x++)
- upBuff1[x] = signOf(rec[x] - tmpU[x]);
-
- for (y = startY; y < endY; y++)
+ if (ctuWidth & 15)
{
for (x = 0; x < ctuWidth; x++)
+ upBuff1[x] = signOf(rec[x] - tmpU[x]);
+
+ for (y = startY; y < endY; y++)
{
- int signDown = signOf(rec[x] - rec[x + stride]);
- int edgeType = signDown + upBuff1[x] + 2;
- upBuff1[x] = -signDown;
+ for (x = 0; x < ctuWidth; x++)
+ {
+ int8_t signDown = signOf(rec[x] - rec[x + stride]);
+ int edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x] = -signDown;
- rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+ rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+ }
+
+ rec += stride;
}
+ }
+ else
+ {
+ primitives.sign(upBuff1, rec, tmpU, ctuWidth);
- rec += stride;
+ for (y = startY; y < endY; y++)
+ {
+ primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
+ rec += stride;
+ }
}
break;
@@ -355,25 +372,77 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
if (!tpely)
rec += stride;
- for (x = startX; x < endX; x++)
- upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
+ if (!(ctuWidth & 15))
+ {
+ int8_t firstSign, lastSign;
+
+ if (!lpelx)
+ firstSign = upBuff1[0];
+
+ if (rpelx == picWidth)
+ lastSign = upBuff1[ctuWidth - 1];
+
+ primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth);
+
+ if (!lpelx)
+ upBuff1[0] = firstSign;
- for (y = startY; y < endY; y++)
+ if (rpelx == picWidth)
+ upBuff1[ctuWidth - 1] = lastSign;
+ }
+ else
{
- upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
for (x = startX; x < endX; x++)
+ upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
+ }
+
+ if (ctuWidth & 15)
+ {
+ for (y = startY; y < endY; y++)
+ {
+ upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
+ for (x = startX; x < endX; x++)
+ {
+ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+ int edgeType = signDown + upBuff1[x] + 2;
+ upBufft[x + 1] = -signDown;
+ rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+ }
+
+ std::swap(upBuff1, upBufft);
+
+ rec += stride;
+ }
+ }
+ else
+ {
+ for (y = startY; y < endY; y++)
{
- int signDown = signOf(rec[x] - rec[x + stride + 1]);
- int edgeType = signDown + upBuff1[x] + 2;
- upBufft[x + 1] = -signDown;
- rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
- }
+ int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
+ pixel firstPxl = rec[0]; // copy first Pxl
+ pixel lastPxl = rec[ctuWidth - 1];
+ int8_t one = upBufft[1];
+ int8_t two = upBufft[endX + 1];
+
+ primitives.saoCuOrgE2(rec, upBufft, upBuff1, m_offsetEo, ctuWidth, stride);
+ if (!lpelx)
+ {
+ rec[0] = firstPxl;
+ upBufft[1] = one;
+ }
- std::swap(upBuff1, upBufft);
+ if (rpelx == picWidth)
+ {
+ rec[ctuWidth - 1] = lastPxl;
+ upBufft[endX + 1] = two;
+ }
- rec += stride;
- }
+ upBufft[startX] = iSignDown2;
+ std::swap(upBuff1, upBufft);
+ rec += stride;
+ }
+ }
break;
}
case SAO_EO_3: // dir: 45
@@ -387,43 +456,92 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
if (!tpely)
rec += stride;
- for (x = startX - 1; x < endX; x++)
- upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);
-
- for (y = startY; y < endY; y++)
+ if (ctuWidth & 15)
{
- x = startX;
- int signDown = signOf(rec[x] - tmpL[y + 1]);
- int edgeType = signDown + upBuff1[x] + 2;
- upBuff1[x - 1] = -signDown;
- rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
- for (x = startX + 1; x < endX; x++)
+ for (x = startX - 1; x < endX; x++)
+ upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);
+
+ for (y = startY; y < endY; y++)
{
- signDown = signOf(rec[x] - rec[x + stride - 1]);
- edgeType = signDown + upBuff1[x] + 2;
+ x = startX;
+ int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
+ int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = -signDown;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+
+ for (x = startX + 1; x < endX; x++)
+ {
+ signDown = signOf(rec[x] - rec[x + stride - 1]);
+ edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x - 1] = -signDown;
+ rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+ }
+
+ upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+
+ rec += stride;
}
+ }
+ else
+ {
+ int8_t firstSign, lastSign;
- upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+ if (lpelx)
+ firstSign = signOf(rec[-1] - tmpU[0]);
+ if (rpelx == picWidth)
+ lastSign = upBuff1[ctuWidth - 1];
- rec += stride;
+ primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth);
+
+ if (lpelx)
+ upBuff1[-1] = firstSign;
+ if (rpelx == picWidth)
+ upBuff1[ctuWidth - 1] = lastSign;
+
+ for (y = startY; y < endY; y++)
+ {
+ x = startX;
+ int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
+ int edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x - 1] = -signDown;
+ rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+
+ primitives.saoCuOrgE3(rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
+
+ upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+
+ rec += stride;
+ }
}
break;
}
case SAO_BO:
{
- const pixel* offsetBo = m_offsetBo;
+ const int8_t* offsetBo = m_offsetBo;
- for (y = 0; y < ctuHeight; y++)
+ if (ctuWidth & 15)
{
- for (x = 0; x < ctuWidth; x++)
- rec[x] = offsetBo[rec[x]];
-
- rec += stride;
+ #define SAO_BO_BITS 5
+ const int boShift = X265_DEPTH - SAO_BO_BITS;
+ for (y = 0; y < ctuHeight; y++)
+ {
+ for (x = 0; x < ctuWidth; x++)
+ {
+ int val = rec[x] + offsetBo[rec[x] >> boShift];
+ if (val < 0)
+ val = 0;
+ else if (val > ((1 << X265_DEPTH) - 1))
+ val = ((1 << X265_DEPTH) - 1);
+ rec[x] = (pixel)val;
+ }
+ rec += stride;
+ }
+ }
+ else
+ {
+ primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
}
-
break;
}
default: break;
@@ -436,7 +554,7 @@ void SAO::processSaoCu(int addr, int typeIdx, int plane)
/* Process SAO all units */
void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
{
- intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+ intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
int ctuWidth = g_maxCUSize;
int ctuHeight = g_maxCUSize;
@@ -449,12 +567,12 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
if (!idxY)
{
- pixel* rec = m_frame->m_reconPicYuv->m_picOrg[plane];
+ pixel* rec = m_frame->m_reconPic->m_picOrg[plane];
memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
}
int addr = idxY * m_numCuInWidth;
- pixel* rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr);
+ pixel* rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
for (int i = 0; i < ctuHeight + 1; i++)
{
@@ -466,8 +584,6 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
- const int boShift = X265_DEPTH - SAO_BO_BITS;
-
for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
{
addr = idxY * m_numCuInWidth + idxX;
@@ -481,15 +597,10 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
{
if (typeIdx == SAO_BO)
{
- pixel* offsetBo = m_offsetBo;
- int offset[SAO_NUM_BO_CLASSES];
- memset(offset, 0, sizeof(offset));
+ memset(m_offsetBo, 0, sizeof(m_offsetBo));
for (int i = 0; i < SAO_NUM_OFFSET; i++)
- offset[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
-
- for (int i = 0; i < (1 << X265_DEPTH); i++)
- offsetBo[i] = m_clipTable[i + offset[i >> boShift]];
+ m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
}
else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
{
@@ -506,7 +617,7 @@ void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
}
else if (idxX != (m_numCuInWidth - 1))
{
- rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr);
+ rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
for (int i = 0; i < ctuHeight + 1; i++)
{
@@ -543,12 +654,12 @@ void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
void SAO::calcSaoStatsCu(int addr, int plane)
{
int x, y;
- CUData* cu = m_frame->m_encData->getPicCTU(addr);
- const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr);
- const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
+ const CUData* cu = m_frame->m_encData->getPicCTU(addr);
+ const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
+ const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr);
const pixel* fenc;
const pixel* rec;
- intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride;
+ intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
@@ -579,8 +690,8 @@ void SAO::calcSaoStatsCu(int addr, int plane)
int skipB = plane ? 2 : 4;
int skipR = plane ? 3 : 5;
- int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
- int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
+ int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
+ int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
// SAO_BO:
{
@@ -670,14 +781,19 @@ void SAO::calcSaoStatsCu(int addr, int plane)
rec += stride;
}
- for (x = 0; x < ctuWidth; x++)
- upBuff1[x] = signOf(rec[x] - rec[x - stride]);
+ if (!(ctuWidth & 15))
+ primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
+ else
+ {
+ for (x = 0; x < ctuWidth; x++)
+ upBuff1[x] = signOf(rec[x] - rec[x - stride]);
+ }
for (y = startY; y < endY; y++)
{
for (x = 0; x < endX; x++)
{
- int signDown = signOf(rec[x] - rec[x + stride]);
+ int8_t signDown = signOf(rec[x] - rec[x + stride]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x] = -signDown;
@@ -722,7 +838,7 @@ void SAO::calcSaoStatsCu(int addr, int plane)
upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
for (x = startX; x < endX; x++)
{
- int signDown = signOf(rec[x] - rec[x + stride + 1]);
+ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBufft[x + 1] = -signDown;
stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
@@ -768,7 +884,7 @@ void SAO::calcSaoStatsCu(int addr, int plane)
{
for (x = startX; x < endX; x++)
{
- int signDown = signOf(rec[x] - rec[x + stride - 1]);
+ int8_t signDown = signOf(rec[x] - rec[x + stride - 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = -signDown;
stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
@@ -789,10 +905,10 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
int addr = idxX + m_numCuInWidth * idxY;
int x, y;
- CUData* cu = frame->m_encData->getPicCTU(addr);
+ const CUData* cu = frame->m_encData->getPicCTU(addr);
const pixel* fenc;
const pixel* rec;
- intptr_t stride = m_frame->m_reconPicYuv->m_stride;
+ intptr_t stride = m_frame->m_reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
@@ -826,7 +942,7 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
{
if (plane == 1)
{
- stride = frame->m_reconPicYuv->m_strideC;
+ stride = frame->m_reconPic->m_strideC;
picWidth >>= m_hChromaShift;
picHeight >>= m_vChromaShift;
ctuWidth >>= m_hChromaShift;
@@ -845,8 +961,8 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
count = m_countPreDblk[addr][plane][SAO_BO];
- const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr);
- const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr);
+ const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
+ const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr);
fenc = fenc0;
rec = rec0;
@@ -1214,7 +1330,7 @@ inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_
if (count)
{
int offset = roundIBDI(offsetOrg, count << SAO_BIT_INC);
- offset = Clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
+ offset = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
if (typeIdx < SAO_BO)
{
if (classIdx < 3)
diff --git a/source/encoder/sao.h b/source/encoder/sao.h
index 70df9da..da836f0 100644
--- a/source/encoder/sao.h
+++ b/source/encoder/sao.h
@@ -3,6 +3,7 @@
*
* Authors: Steve Borho <steve at borho.org>
* Min Chen <chenm003 at 163.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -77,7 +78,7 @@ protected:
PerPlane* m_offsetOrgPreDblk;
double m_depthSaoRate[2][4];
- pixel* m_offsetBo;
+ int8_t m_offsetBo[SAO_NUM_BO_CLASSES];
int8_t m_offsetEo[NUM_EDGETYPE];
int m_numCuInWidth;
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index cd86318..4a69f6b 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -35,8 +35,11 @@ using namespace x265;
#if _MSC_VER
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
+#pragma warning(disable: 4127) // conditional expression is constant
#endif
+#define MVP_IDX_BITS 1
+
ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
@@ -60,17 +63,17 @@ Search::Search() : JobProvider(NULL)
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
{
+ uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
m_param = ¶m;
m_bEnableRDOQ = param.rdLevel >= 4;
m_bFrameParallel = param.frameNumThreads > 1;
m_numLayers = g_log2Size[param.maxCUSize] - 2;
m_rdCost.setPsyRdScale(param.psyRd);
- m_me.setSearchMethod(param.searchMethod);
- m_me.setSubpelRefine(param.subpelRefine);
+ m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
- if (m_param->noiseReduction)
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
ok &= m_quant.allocNoiseReduction(param);
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
@@ -79,9 +82,9 @@ bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
* available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
- uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
+ uint32_t sizeL = 1 << (maxLog2CUSize * 2);
uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
- uint32_t numPartitions = NUM_CU_PARTITIONS;
+ uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
/* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
* the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
@@ -163,70 +166,53 @@ void Search::invalidateContexts(int fromDepth)
void Search::invalidateContexts(int) {}
#endif
-void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
+void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
- uint32_t subdiv = tuDepthL > trDepth;
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
- bool mCodeAll = true;
- const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
+ uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
- if (mCodeAll)
+ if (!(log2TrSize - m_hChromaShift < 2))
{
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
-
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
if (subdiv)
{
- absPartIdxStep >>= 2;
- width >>= 1;
- height >>= 1;
-
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
}
}
-void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
+void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
- if (!cu.getCbf(absPartIdx, ttype, trDepth))
+ if (!cu.getCbf(absPartIdx, ttype, tuDepth))
return;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
return;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
- uint32_t trDepthC = trDepth;
+ uint32_t tuDepthC = tuDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (log2TrSizeC == 1)
- {
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+
+ if (log2TrSizeC < 2)
+ {
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
uint32_t qtLayer = log2TrSize - 2;
@@ -243,18 +229,19 @@ void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absP
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
- if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
}
}
-void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
+void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+ CUData& cu = mode.cu;
+ uint32_t fullDepth = cuGeom.depth + tuDepth;
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
uint32_t qtLayer = log2TrSize - 2;
uint32_t sizeIdx = log2TrSize - 2;
bool mightNotSplit = log2TrSize <= depthRange[1];
@@ -267,8 +254,6 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
mightSplit = true;
}
- CUData& cu = mode.cu;
-
Cost fullCost;
uint32_t bCBF = 0;
@@ -280,20 +265,22 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
if (mightSplit)
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = mode.fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+ initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
@@ -302,21 +289,21 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
+ primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSig)
{
- m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
- primitives.luma_add_ps[sizeIdx](reconQt, reconQtStride, pred, residual, stride, stride);
+ m_quant.invtransformNxN(residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
+ primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
}
else
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
+ primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
- bCBF = !!numSig << trDepth;
+ bCBF = !!numSig << tuDepth;
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
- fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
+ fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);
m_entropyCoder.resetBits();
if (!absPartIdx)
@@ -329,7 +316,7 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
- m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
}
if (cu.m_partSize[0] == SIZE_2Nx2N)
{
@@ -338,21 +325,21 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
if (log2TrSize != depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
@@ -379,27 +366,26 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); // prep state of split encode
}
- // code split block
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- uint32_t absPartIdxSub = absPartIdx;
+ /* code split block */
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
if (m_param->bEnableTSkipFast)
- checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
+ checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
Cost splitCost;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
if (checkTransformSkip)
- codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
+ codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
else
- codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
+ codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
if (mightNotSplit && log2TrSize != depthRange[0])
{
@@ -428,16 +414,16 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
// recover transform index and Cbf values
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
// set reconstruction for next intra prediction blocks if full TU prediction won
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
@@ -445,13 +431,13 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth,
outCost.energy += fullCost.energy;
}
-void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
+void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+ uint32_t fullDepth = cuGeom.depth + tuDepth;
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
uint32_t tuSize = 1 << log2TrSize;
- X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
+ X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
CUData& cu = mode.cu;
Yuv* predYuv = &mode.predYuv;
@@ -462,20 +448,22 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
int bTSkip = 0;
uint32_t bCBF = 0;
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
pixel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = fencYuv->m_size;
- int sizeIdx = log2TrSize - 2;
+ uint32_t sizeIdx = log2TrSize - 2;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+ initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
@@ -502,13 +490,13 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
pixel* tmpRecon = (useTSkip ? tsReconY : reconQt);
uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
- primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
+ primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
if (numSig)
{
- m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
- primitives.luma_add_ps[sizeIdx](tmpRecon, tmpReconStride, pred, residual, stride, stride);
+ m_quant.invtransformNxN(residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
+ primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
}
else if (useTSkip)
{
@@ -518,12 +506,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
}
else
// no residual coded, recon = pred
- primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
+ primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
- uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
+ uint32_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
if (useTSkip)
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
@@ -539,7 +527,7 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
- m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
}
if (cu.m_partSize[0] == SIZE_2Nx2N)
{
@@ -548,20 +536,20 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
@@ -591,19 +579,19 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
if (bTSkip)
{
memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
+ primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, tsReconY, tuSize);
}
else if (checkTransformSkip)
{
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
}
// set reconstruction for next intra prediction blocks
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
@@ -612,13 +600,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep
}
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
-
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- bool bCheckFull = log2TrSize <= depthRange[1];
+ uint32_t fullDepth = cuGeom.depth + tuDepth;
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+ bool bCheckFull = log2TrSize <= depthRange[1];
X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
@@ -629,62 +616,68 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3
if (bCheckFull)
{
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
- pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
- int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- uint32_t stride = mode.fencYuv->m_size;
- uint32_t sizeIdx = log2TrSize - 2;
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
+ pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
+ int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
+ uint32_t stride = mode.fencYuv->m_size;
+
+ // init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
- coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+ initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ // get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
- primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
+ uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
+ coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY;
+
+ uint32_t sizeIdx = log2TrSize - 2;
+ primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
+
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+
+ uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSig)
{
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
- primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
- cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ m_quant.invtransformNxN(residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
+ primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
+ cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
}
else
{
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
+ primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
else
{
X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
-
+
/* code split block */
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
}
}
-void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
+void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
- if (tuDepth == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
// copy transform coefficients
@@ -698,88 +691,76 @@ void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, u
}
else
{
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
}
}
+inline void offsetCBFs(uint8_t subTUCBF[2])
+{
+ uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
+ subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
+ subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
+}
+
/* 4:2:2 post-TU split processing */
-void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
+void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t depth = cu.m_cuDepth[0];
- uint32_t fullDepth = depth + trDepth;
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
- uint32_t trDepthC = trDepth;
if (log2TrSize == 2)
{
- X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
- trDepthC--;
+ X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ ++log2TrSize;
}
- uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
+ uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
// move the CBFs down a level and set the parent CBF
uint8_t subTUCBF[2];
- uint8_t combinedSubTUCBF = 0;
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
-
- subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
- combinedSubTUCBF |= subTUCBF[subTU];
- }
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
- const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
+ subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth);
+ subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
+ offsetCBFs(subTUCBF);
- cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
- }
+ cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
}
/* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
- splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
+ outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
return outDist;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return 0;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
if (m_bEnableRDOQ)
@@ -788,41 +769,38 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
if (checkTransformSkip)
- return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
+ return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
+ ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
uint32_t qtLayer = log2TrSize - 2;
- uint32_t tuSize = 1 << log2TrSizeC;
+ uint32_t stride = mode.fencYuv->m_csize;
+ const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t outDist = 0;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+ do
{
- TextType ttype = (TextType)chromaId;
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- TURecurse tuIterator(splitType, curPartNum, absPartIdx);
- do
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+ for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ TextType ttype = (TextType)chromaId;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
- int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
- uint32_t stride = mode.fencYuv->m_csize;
- uint32_t sizeIdxC = log2TrSizeC - 2;
-
+ int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
-
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
-
- // init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
- pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
@@ -830,52 +808,54 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr
if (m_csp == X265_CSP_I422)
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
- // get prediction signal
- predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+ // init availability pattern
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
+ // get prediction signal
+ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
+ primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
- uint32_t tmpDist;
if (numSig)
{
- m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
- primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ m_quant.invtransformNxN(residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
+ primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
+ primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
- outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ outDist += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
if (m_rdCost.m_psyRd)
- psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
+ psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
+ primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
}
- while (tuIterator.isNextSection());
+ }
+ while (tuIterator.isNextSection());
- if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ if (splitType == VERTICAL_SPLIT)
+ {
+ offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+ offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
return outDist;
}
/* returns distortion */
-uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = 2;
- uint32_t tuSize = 4;
+ uint32_t fullDepth = cuGeom.depth + tuDepth;
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+ const uint32_t log2TrSizeC = 2;
uint32_t qtLayer = log2TrSize - 2;
uint32_t outDist = 0;
@@ -887,23 +867,26 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+ do
{
- TextType ttype = (TextType)chromaId;
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
- TURecurse tuIterator(splitType, curPartNum, absPartIdx);
- do
+ for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ TextType ttype = (TextType)chromaId;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
- uint32_t sizeIdxC = log2TrSizeC - 2;
+ const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
@@ -911,8 +894,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
- pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
@@ -921,7 +903,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
// get prediction signal
- predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
uint64_t bCost = MAX_INT64;
uint32_t bDist = 0;
@@ -936,14 +918,14 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
pixel* recon = (useTSkip ? tskipReconC : reconQt);
uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
- primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
+ primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
if (numSig)
{
- m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
- primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ m_quant.invtransformNxN(residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
+ primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else if (useTSkip)
{
@@ -952,11 +934,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
+ primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
- tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ uint32_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
+ tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
@@ -991,50 +973,45 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t
if (bTSkip)
{
memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
+ primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
}
- cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
- primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
+ pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
+ primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
outDist += bDist;
psyEnergy += bEnergy;
}
- while (tuIterator.isNextSection());
+ }
+ while (tuIterator.isNextSection());
- if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ if (splitType == VERTICAL_SPLIT)
+ {
+ offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+ offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
return outDist;
}
-void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
+void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- if (tuDepthL == trDepth)
+ if (tuDepthL == tuDepth || log2TrSizeC == 2)
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (tuQuad)
- {
- log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
- trDepth--; /* also adjust the number of coeff read */
- }
-
// copy transform coefficients
uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
- uint32_t qtLayer = log2TrSize - 2;
+ uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
@@ -1047,124 +1024,120 @@ void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absP
}
else
{
- if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
- /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
- else
- {
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
- }
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
}
}
-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
-
- if (tuDepthL == trDepth)
+ uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
- {
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
- return;
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ uint32_t splitCbfU = 0, splitCbfV = 0;
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+ {
+ residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+ }
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ {
+ cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
- ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
- uint32_t tuSize = 1 << log2TrSizeC;
- uint32_t stride = mode.fencYuv->m_csize;
- const int sizeIdxC = log2TrSizeC - 2;
+ return;
+ }
+
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
+ {
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
+ return;
+ log2TrSizeC = 2;
+ tuDepthC--;
+ }
+
+ ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
+ uint32_t stride = mode.fencYuv->m_csize;
+ const uint32_t sizeIdxC = log2TrSizeC - 2;
+
+ uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
+ const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
+
+ TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+ do
+ {
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
TextType ttype = (TextType)chromaId;
- TURecurse tuIterator(splitType, curPartNum, absPartIdx);
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
+ pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
+ int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+ uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
+ coeff_t* coeffC = cu.m_trCoeff[ttype] + coeffOffsetC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
- pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
- int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
- uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
- coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- uint32_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
+ if (chromaPredMode == DM_CHROMA_IDX)
+ chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
+ if (m_csp == X265_CSP_I422)
+ chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
- uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
- if (chromaPredMode == DM_CHROMA_IDX)
- chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
- chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
- pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+ // init availability pattern
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
- predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+ // get prediction signal
+ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
- X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
+ X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
- primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
- if (numSig)
- {
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
- primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- }
- else
- {
- primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
- cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- }
+ primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
+ uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+ if (numSig)
+ {
+ m_quant.invtransformNxN(residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
+ primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ }
+ else
+ {
+ // no coded residual, recon = pred
+ primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
+ cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- while (tuIterator.isNextSection());
-
- if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
}
}
- else
+ while (tuIterator.isNextSection());
+
+ if (splitType == VERTICAL_SPLIT)
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- uint32_t splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
- {
- residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
- splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
- }
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- {
- cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
- }
+ offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+ offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
}
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
{
- uint32_t depth = cuGeom.depth;
CUData& cu = intraMode.cu;
cu.setPartSizeSubParts(partSize);
cu.setPredModeSubParts(MODE_INTRA);
+ m_quant.m_tqBypass = !!cu.m_tqBypass[0];
uint32_t tuDepthRange[2];
cu.getIntraTUQtDepthRange(tuDepthRange, 0);
@@ -1183,96 +1156,310 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
- m_entropyCoder.codePartSize(cu, 0, depth);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
m_entropyCoder.codePredInfo(cu, 0);
intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
m_entropyCoder.store(intraMode.contexts);
intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
if (m_rdCost.m_psyRd)
- intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
+ {
+ const Yuv* fencYuv = intraMode.fencYuv;
+ intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
+ }
+
+ updateModeCost(intraMode);
+}
+
+/* Note that this function does not save the best intra prediction, it must
+ * be generated later. It records the best mode in the cu */
+void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ uint32_t depth = cuGeom.depth;
+
+ cu.setPartSizeSubParts(SIZE_2Nx2N);
+ cu.setPredModeSubParts(MODE_INTRA);
+
+ const uint32_t initTuDepth = 0;
+ uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
+ uint32_t tuSize = 1 << log2TrSize;
+ const uint32_t absPartIdx = 0;
+
+ // Reference sample smoothing
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
+ initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
+
+ const pixel* fenc = intraMode.fencYuv->m_buf[0];
+ uint32_t stride = intraMode.fencYuv->m_size;
+
+ int sad, bsad;
+ uint32_t bits, bbits, mode, bmode;
+ uint64_t cost, bcost;
+
+ // 33 Angle modes once
+ ALIGN_VAR_32(pixel, bufScale[32 * 32]);
+ ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
+ ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
+ int scaleTuSize = tuSize;
+ int scaleStride = stride;
+ int costShift = 0;
+ int sizeIdx = log2TrSize - 2;
+
+ if (tuSize > 32)
+ {
+ // origin is 64x64, we scale to 32x32 and setup required parameters
+ primitives.scale2D_64to32(bufScale, fenc, stride);
+ fenc = bufScale;
+
+ pixel nScale[129];
+ intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
+ primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1, 0);
+
+ //TO DO: primitive
+ for (int x = 1; x < 65; x++)
+ {
+ intraNeighbourBuf[0][x] = nScale[x]; // Top pixel
+ intraNeighbourBuf[0][x + 64] = nScale[x + 64]; // Left pixel
+ intraNeighbourBuf[1][x] = nScale[x]; // Top pixel
+ intraNeighbourBuf[1][x + 64] = nScale[x + 64]; // Left pixel
+ }
+
+ scaleTuSize = 32;
+ scaleStride = 32;
+ costShift = 2;
+ sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
+ }
+
+ pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
+ int predsize = scaleTuSize * scaleTuSize;
+
+ m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+
+ /* there are three cost tiers for intra modes:
+ * pred[0] - mode probable, least cost
+ * pred[1], pred[2] - less probable, slightly more cost
+ * non-mpm modes - all cost the same (rbits) */
+ uint64_t mpms;
+ uint32_t preds[3];
+ uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+
+ // DC
+ primitives.cu[sizeIdx].intra_pred[DC_IDX](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
+ bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ bmode = mode = DC_IDX;
+ bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+ // PLANAR
+ pixel* planar = intraNeighbourBuf[0];
+ if (tuSize & (8 | 16 | 32))
+ planar = intraNeighbourBuf[1];
+
+ primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](tmp, scaleStride, planar, 0, 0);
+ sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ mode = PLANAR_IDX;
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ cost = m_rdCost.calcRdSADCost(sad, bits);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+ bool allangs = true;
+ if (primitives.cu[sizeIdx].intra_pred_allangs)
+ {
+ primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
+ primitives.cu[sizeIdx].intra_pred_allangs(tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
+ }
+ else
+ allangs = false;
+
+#define TRY_ANGLE(angle) \
+ if (allangs) { \
+ if (angle < 18) \
+ sad = sa8d(bufTrans, scaleTuSize, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+ else \
+ sad = sa8d(fenc, scaleStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits); \
+ } else { \
+ int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
+ primitives.cu[sizeIdx].intra_pred[angle](tmp, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
+ sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits); \
+ }
+
+ if (m_param->bEnableFastIntra)
+ {
+ int asad = 0;
+ uint32_t lowmode, highmode, amode = 5, abits = 0;
+ uint64_t acost = MAX_INT64;
+
+ /* pick the best angle, sampling at distance of 5 */
+ for (mode = 5; mode < 35; mode += 5)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+ }
+
+ /* refine best angle at distance 2, then distance 1 */
+ for (uint32_t dist = 2; dist >= 1; dist--)
+ {
+ lowmode = amode - dist;
+ highmode = amode + dist;
+
+ X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
+ TRY_ANGLE(lowmode);
+ COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+
+ X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
+ TRY_ANGLE(highmode);
+ COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ }
+
+ if (amode == 33)
+ {
+ TRY_ANGLE(34);
+ COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+ }
+
+ COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
+ }
+ else // calculate and search all intra prediction angles for lowest cost
+ {
+ for (mode = 2; mode < 35; mode++)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ }
+ }
+
+ cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
+ intraMode.initCosts();
+ intraMode.totalBits = bbits;
+ intraMode.distortion = bsad;
+ intraMode.sa8dCost = bcost;
+ intraMode.sa8dBits = bbits;
+}
+
+void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ Yuv* reconYuv = &intraMode.reconYuv;
+
+ X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
+ X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+ m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
+
+ Cost icosts;
+ codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
+ extractIntraResultQT(cu, *reconYuv, 0, 0);
+
+ intraMode.distortion = icosts.distortion;
+ intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
+
+ m_entropyCoder.resetBits();
+ if (m_slice->m_pps->bTransquantBypassEnabled)
+ m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
+ m_entropyCoder.codeSkipFlag(cu, 0);
+ m_entropyCoder.codePredMode(cu.m_predMode[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
+ m_entropyCoder.codePredInfo(cu, 0);
+ intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+
+ bool bCodeDQP = m_slice->m_pps->bUseDQP;
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
+
+ intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
+ intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
+ if (m_rdCost.m_psyRd)
+ {
+ const Yuv* fencYuv = intraMode.fencYuv;
+ intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+ }
+ m_entropyCoder.store(intraMode.contexts);
updateModeCost(intraMode);
}
-uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
+uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
Yuv* predYuv = &intraMode.predYuv;
const Yuv* fencYuv = intraMode.fencYuv;
- uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
- uint32_t numPU = 1 << (2 * initTrDepth);
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t depth = cuGeom.depth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+ uint32_t numPU = 1 << (2 * initTuDepth);
+ uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
uint32_t qNumParts = cuGeom.numPartitions >> 2;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
uint32_t totalDistortion = 0;
- int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
+ int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
// loop over partitions
- for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
+ for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
if (sharedModes)
- bmode = sharedModes[pu];
+ bmode = sharedModes[puIdx];
else
{
// Reference sample smoothing
- initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
+ initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
// determine set of modes to be tested (using prediction signal only)
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
- pixel *above = m_refAbove + tuSize - 1;
- pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel *left = m_refLeft + tuSize - 1;
- pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
-
// 33 Angle modes once
- ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
+ ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
- ALIGN_VAR_32(pixel, bufScale[32 * 32]);
- pixel _above[4 * 32 + 1];
- pixel _left[4 * 32 + 1];
+
int scaleTuSize = tuSize;
int scaleStride = stride;
int costShift = 0;
if (tuSize > 32)
{
- pixel *aboveScale = _above + 2 * 32;
- pixel *leftScale = _left + 2 * 32;
-
// origin is 64x64, we scale to 32x32 and setup required parameters
+ ALIGN_VAR_32(pixel, bufScale[32 * 32]);
primitives.scale2D_64to32(bufScale, fenc, stride);
fenc = bufScale;
- // reserve space in case primitives need to store data in above
- // or left buffers
- aboveScale[0] = leftScale[0] = above[0];
- primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
- primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+ pixel nScale[129];
+ intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
+ primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1, 0);
+
+ // TO DO: primitive
+ for (int x = 1; x < 65; x++)
+ {
+ intraNeighbourBuf[0][x] = nScale[x]; // Top pixel
+ intraNeighbourBuf[0][x + 64] = nScale[x + 64]; // Left pixel
+ intraNeighbourBuf[1][x] = nScale[x]; // Top pixel
+ intraNeighbourBuf[1][x + 64] = nScale[x + 64]; // Left pixel
+ }
scaleTuSize = 32;
scaleStride = 32;
costShift = 2;
sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
- // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
- above = aboveScale;
- left = leftScale;
- aboveFiltered = aboveScale;
- leftFiltered = leftScale;
}
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
@@ -1285,43 +1472,54 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
uint32_t preds[3];
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
- pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+ pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
uint64_t modeCosts[35];
uint64_t bcost;
// DC
- primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+ primitives.cu[sizeIdx].intra_pred[DC_IDX](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR
- pixel *abovePlanar = above;
- pixel *leftPlanar = left;
+ pixel* planar = intraNeighbourBuf[0];
if (tuSize >= 8 && tuSize <= 32)
- {
- abovePlanar = aboveFiltered;
- leftPlanar = leftFiltered;
- }
- primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ planar = intraNeighbourBuf[1];
+
+ primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](tmp, scaleStride, planar, 0, 0);
bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
// angular predictions
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
- primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
- for (int mode = 2; mode < 35; mode++)
+ if (primitives.cu[sizeIdx].intra_pred_allangs)
{
- bool modeHor = (mode < 18);
- pixel *cmp = (modeHor ? buf_trans : fenc);
- intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
- bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
- sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
- modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
- COPY1_IF_LT(bcost, modeCosts[mode]);
+ primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
+ primitives.cu[sizeIdx].intra_pred_allangs(tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
+ for (int mode = 2; mode < 35; mode++)
+ {
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ if (mode < 18)
+ sad = sa8d(bufTrans, scaleTuSize, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+ else
+ sad = sa8d(fenc, scaleStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[mode]);
+ }
+ }
+ else
+ {
+ for (int mode = 2; mode < 35; mode++)
+ {
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
+ primitives.cu[sizeIdx].intra_pred[mode](tmp, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
+ sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift;
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[mode]);
+ }
}
/* Find the top maxCandCount candidate modes with cost within 25% of best
@@ -1330,7 +1528,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
* levels and at higher depths */
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
- int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
+ int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
@@ -1346,51 +1544,50 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t
if (candCostList[i] == MAX_INT64)
break;
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
}
/* remeasure best mode, allowing TU splits */
- cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
- extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
+ extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// set reconstruction for next intra prediction blocks
- if (pu != numPU - 1)
+ if (puIdx != numPU - 1)
{
/* This has important implications for parallelism and RDO. It is writing intermediate results into the
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
* that the contexts should be tracked through each PU */
- pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- uint32_t dststride = m_frame->m_reconPicYuv->m_stride;
- pixel* src = reconYuv->getLumaAddr(absPartIdx);
+ pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ uint32_t dststride = m_frame->m_reconPic->m_stride;
+ const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
- primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+ primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
}
}
if (numPU > 1)
{
uint32_t combCbfY = 0;
- uint32_t partIdx = 0;
- for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
- combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+ combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
cu.m_cbf[0][offs] |= combCbfY;
@@ -1414,18 +1611,18 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
uint32_t tuSize = 1 << log2TrSizeC;
- int32_t scaleTuSize = tuSize;
+ uint32_t tuDepth = 0;
int32_t costShift = 0;
if (tuSize > 32)
{
- scaleTuSize = 32;
+ tuDepth = 1;
costShift = 2;
log2TrSizeC = 5;
}
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
+ IntraNeighbors intraNeighbors;
+ initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
cu.getAllowedChromaDir(0, modeList);
// check chroma modes
@@ -1440,13 +1637,12 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
uint64_t cost = 0;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- pixel* fenc = fencYuv->m_buf[chromaId];
+ const pixel* fenc = fencYuv->m_buf[chromaId];
pixel* pred = predYuv->m_buf[chromaId];
- pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
-
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
// get prediction signal
- predIntraChromaAng(chromaPred, chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
- cost += primitives.sa8d[log2TrSizeC - 2](fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
+ predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
+ cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
}
if (cost < bestCost)
@@ -1456,7 +1652,7 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
}
}
- cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
+ cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
}
uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
@@ -1464,20 +1660,19 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
CUData& cu = intraMode.cu;
Yuv& reconYuv = intraMode.reconYuv;
- uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
- uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
+ uint32_t depth = cuGeom.depth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
+ uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
+ uint32_t absPartStep = cuGeom.numPartitions;
uint32_t totalDistortion = 0;
- int part = partitionFromLog2Size(log2TrSize);
+ int size = partitionFromLog2Size(log2TrSize);
- TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
+ TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
uint32_t bestMode = 0;
uint32_t bestDist = 0;
@@ -1496,9 +1691,9 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
// restore context models
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
uint32_t psyEnergy = 0;
- uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
+ uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
if (m_slice->m_pps->bTransformSkipEnabled)
m_entropyCoder.load(m_rqt[depth].cur);
@@ -1512,14 +1707,14 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!(absPartIdxC & (qtNumParts - 1)))
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!(absPartIdxC & (qNumParts - 1)))
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
- codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
+ codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
@@ -1528,7 +1723,7 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
bestCost = cost;
bestDist = dist;
bestMode = modeList[mode];
- extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
@@ -1539,39 +1734,40 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
if (!tuIterator.isLastSection())
{
uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
- uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
- pixel *src, *dst;
+ uint32_t dststride = m_frame->m_reconPic->m_strideC;
+ const pixel* src;
+ pixel* dst;
- dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCbAddr(absPartIdxC);
- primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
+ primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
- dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCrAddr(absPartIdxC);
- primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
+ primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
}
memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
- cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
totalDistortion += bestDist;
}
while (tuIterator.isNextSection());
- if (initTrDepth != 0)
+ if (initTuDepth != 0)
{
uint32_t combCbfU = 0;
uint32_t combCbfV = 0;
- uint32_t partIdx = 0;
- for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
+ uint32_t qNumParts = tuIterator.absPartIdxStep;
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
- combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
+ combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
+ combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
}
- for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
cu.m_cbf[1][offs] |= combCbfU;
cu.m_cbf[2][offs] |= combCbfV;
@@ -1615,13 +1811,17 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me
continue;
cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
- cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
+ cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
- cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
+ cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(tempYuv, true, false);
+ motionCompensation(tempYuv, true, m_me.bChromaSATD);
+
uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+ if (m_me.bChromaSATD)
+ costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
+
uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
costCand = costCand + m_rdCost.getCost(bitsCand);
if (costCand < outCost)
@@ -1642,41 +1842,45 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me
/* this function assumes the caller has configured its MotionEstimation engine with the
* correct source plane and source PU, and has called prepMotionCompensation() to set
* m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
{
uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
- MV amvpCand[AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
- int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
+ int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ MotionData* bestME = interMode.bestME[part];
+
+ if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
{
- MV mvCand = amvpCand[i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[list][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
+ interMode.cu.clipMv(mvCand);
- Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
- predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
+ predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
- setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
+ setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
@@ -1685,34 +1889,32 @@ void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGe
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
/* tie goes to the smallest ref ID, just like --no-pme */
- ScopedLock _lock(master.m_outputLock);
- if (cost < master.m_bestME[list].cost ||
- (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
+ ScopedLock _lock(master.m_meLock);
+ if (cost < bestME[list].cost ||
+ (cost == bestME[list].cost && ref < bestME[list].ref))
{
- master.m_bestME[list].mv = outmv;
- master.m_bestME[list].mvp = mvp;
- master.m_bestME[list].mvpIdx = mvpIdx;
- master.m_bestME[list].ref = ref;
- master.m_bestME[list].cost = cost;
- master.m_bestME[list].bits = bits;
+ bestME[list].mv = outmv;
+ bestME[list].mvp = mvp;
+ bestME[list].mvpIdx = mvpIdx;
+ bestME[list].ref = ref;
+ bestME[list].cost = cost;
+ bestME[list].bits = bits;
}
}
/* search of the best candidate for inter prediction
* returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
{
CUData& cu = interMode.cu;
Yuv* predYuv = &interMode.predYuv;
- MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
const Slice *slice = m_slice;
- PicYuv* fencPic = m_frame->m_origPicYuv;
int numPart = cu.getNumPartInter();
int numPredDir = slice->isInterP() ? 1 : 2;
const int* numRefIdx = slice->m_numRefIdx;
@@ -1727,23 +1929,24 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
+ MotionData* bestME = interMode.bestME[puIdx];
+
/* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
initMotionCompensation(cu, cuGeom, puIdx);
- pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
- m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
+ m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
uint32_t mrgCost = MAX_UINT;
- /* find best cost merge candidate */
- if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
+ /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
+ if (cu.m_partSize[0] != SIZE_2Nx2N)
{
merge.absPartIdx = m_puAbsPartIdx;
merge.width = m_puWidth;
merge.height = m_puHeight;
mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
- if (bMergeOnly && cu.m_log2CUSize[0] > 3)
+ if (bMergeOnly)
{
if (mrgCost == MAX_UINT)
{
@@ -1762,33 +1965,88 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
totalmebits += merge.bits;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
continue;
}
}
- MotionData bidir[2];
- uint32_t bidirCost = MAX_UINT;
- int bidirBits = 0;
-
- m_bestME[0].cost = MAX_UINT;
- m_bestME[1].cost = MAX_UINT;
+ bestME[0].cost = MAX_UINT;
+ bestME[1].cost = MAX_UINT;
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
- if (bDistributed)
+ /* Uni-directional prediction */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD)
{
- m_curMECu = &cu;
- m_curGeom = &cuGeom;
+ for (int l = 0; l < numPredDir; l++)
+ {
+ int ref = bestME[l].ref;
+ uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
+ bits += getTUBits(ref, numRefIdx[l]);
+
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+
+ // Pick the best possible MVP from AMVP candidates based on least residual
+ int mvpIdx = 0;
+ int merange = m_param->searchRange;
+
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
+ {
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
+
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
+
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- /* this worker might already be enqueued for pmode, so other threads
- * might be looking at the ME job counts at any time, do these sets
- * in a safe order */
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
+ }
+ }
+
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
+
+ int satdCost;
+ setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
+
+ /* Get total cost of partition, but only include MV bit cost once */
+ bits += m_me.bitcost(outmv);
+ uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
+
+ /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+
+ if (cost < bestME[l].cost)
+ {
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
+ }
+ }
+ }
+ else if (bDistributed)
+ {
+ m_meLock.acquire();
+ m_curInterMode = &interMode;
+ m_curGeom = &cuGeom;
m_curPart = puIdx;
m_totalNumME = 0;
m_numAcquiredME = 1;
m_numCompletedME = 0;
m_totalNumME = numRefIdx[0] + numRefIdx[1];
+ m_meLock.release();
if (!m_bJobsQueued)
JobProvider::enqueue();
@@ -1796,34 +2054,43 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
for (int i = 1; i < m_totalNumME; i++)
m_pool->pokeIdleThread();
- while (m_totalNumME > m_numAcquiredME)
+ do
{
- int id = ATOMIC_INC(&m_numAcquiredME);
- if (m_totalNumME >= id)
+ m_meLock.acquire();
+ if (m_totalNumME > m_numAcquiredME)
{
- id -= 1;
+ int id = m_numAcquiredME++;
+ m_meLock.release();
+
if (id < numRefIdx[0])
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
else
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
- m_meCompletionEvent.trigger();
+ m_meLock.acquire();
+ m_numCompletedME++;
+ m_meLock.release();
}
+ else
+ m_meLock.release();
}
+ while (m_totalNumME > m_numAcquiredME);
+
if (!m_bJobsQueued)
JobProvider::dequeue();
/* we saved L0-0 for ourselves */
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
+
+ m_meLock.acquire();
+ if (++m_numCompletedME == m_totalNumME)
m_meCompletionEvent.trigger();
+ m_meLock.release();
m_meCompletionEvent.wait();
}
else
{
- // Uni-directional prediction
for (int l = 0; l < numPredDir; l++)
{
for (int ref = 0; ref < numRefIdx[l]; ref++)
@@ -1831,33 +2098,36 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[l]);
- int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
// Pick the best possible MVP from AMVP candidates based on least residual
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
{
- MV mvCand = amvpCand[l][ref][i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
- predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
setSearchRange(cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
@@ -1867,45 +2137,67 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
- if (cost < m_bestME[l].cost)
+ if (cost < bestME[l].cost)
{
- m_bestME[l].mv = outmv;
- m_bestME[l].mvp = mvp;
- m_bestME[l].mvpIdx = mvpIdx;
- m_bestME[l].ref = ref;
- m_bestME[l].cost = cost;
- m_bestME[l].bits = bits;
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].ref = ref;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
}
}
}
}
/* Bi-directional prediction */
- if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
+ MotionData bidir[2];
+ uint32_t bidirCost = MAX_UINT;
+ int bidirBits = 0;
+
+ if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
+ cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
+ bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{
- bidir[0] = m_bestME[0];
- bidir[1] = m_bestME[1];
+ bidir[0] = bestME[0];
+ bidir[1] = bestME[1];
+
+ int satdCost;
+
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
+
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- /* Generate reference subpels */
- PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
- PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
- Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
- predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
+ PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
+ Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
- pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
+ /* Generate reference subpels */
+ predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
+ predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
- int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
- int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
+ bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
- bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
- bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
+ bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{
/* Do not try zero MV if unidir motion predictors are beyond
@@ -1917,38 +2209,48 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
mvmin <<= 2;
mvmax <<= 2;
- bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
- bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
}
if (bTryZero)
{
- // coincident blocks of the two reference pictures
- pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- intptr_t refStride = slice->m_mref[0][0].lumaStride;
+ /* coincident blocks of the two reference pictures */
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
- satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
+
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ intptr_t refStride = slice->m_mref[0][0].lumaStride;
- MV mvp0 = m_bestME[0].mvp;
- int mvpIdx0 = m_bestME[0].mvpIdx;
- uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+ primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
- MV mvp1 = m_bestME[1].mvp;
- int mvpIdx1 = m_bestME[1].mvpIdx;
- uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+ MV mvp0 = bestME[0].mvp;
+ int mvpIdx0 = bestME[0].mvpIdx;
+ uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
- uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
+ MV mvp1 = bestME[1].mvp;
+ int mvpIdx1 = bestME[1].mvpIdx;
+ uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
- if (bDistributed)
- {
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
- }
+ uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
- checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
- checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
+ checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
+ checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
if (cost < bidirCost)
{
@@ -1965,7 +2267,7 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
}
/* select best option and store into CU */
- if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
+ if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
{
cu.m_mergeFlag[m_puAbsPartIdx] = true;
cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
@@ -1977,39 +2279,39 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
totalmebits += merge.bits;
}
- else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
+ else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
lastMode = 2;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
}
- else if (m_bestME[0].cost <= m_bestME[1].cost)
+ else if (bestME[0].cost <= bestME[1].cost)
{
lastMode = 0;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
- cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
- cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
+ cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+ cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[0].bits;
+ totalmebits += bestME[0].bits;
}
else
{
@@ -2017,19 +2319,19 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
- cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
- cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
+ cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+ cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[1].bits;
+ totalmebits += bestME[1].bits;
}
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
}
interMode.sa8dBits += totalmebits;
@@ -2142,12 +2444,11 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
- uint32_t cuSize = 1 << cu.m_log2CUSize[0];
uint32_t depth = cu.m_cuDepth[0];
// No residual coding : SKIP mode
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
@@ -2155,11 +2456,10 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
// Luma
int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
- interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+ interMode.distortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
// Chroma
- part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
- interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
@@ -2185,18 +2485,16 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
CUData& cu = interMode.cu;
Yuv* reconYuv = &interMode.reconYuv;
Yuv* predYuv = &interMode.predYuv;
- ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
+ uint32_t depth = cuGeom.depth;
+ ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
const Yuv* fencYuv = interMode.fencYuv;
X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
- uint32_t log2CUSize = cu.m_log2CUSize[0];
- uint32_t cuSize = 1 << log2CUSize;
- uint32_t depth = cu.m_cuDepth[0];
-
- int part = partitionFromLog2Size(log2CUSize);
- int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+ uint32_t log2CUSize = cuGeom.log2CUSize;
+ int sizeIdx = log2CUSize - 2;
+ uint32_t tqBypass = cu.m_tqBypass[0];
m_quant.setQPforQuant(interMode.cu);
resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
@@ -2207,13 +2505,13 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
m_entropyCoder.load(m_rqt[depth].cur);
Cost costs;
- estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
+ estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
- if (!cu.m_tqBypass[0])
+ if (!tqBypass)
{
- uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
- cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
- cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+ uint32_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
+ cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
/* Consider the RD cost of not signaling any residual */
m_entropyCoder.load(m_rqt[depth].cur);
@@ -2239,20 +2537,21 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
}
if (cu.getQtRootCbf(0))
- saveResidualQTData(cu, *resiYuv, 0, depth);
+ saveResidualQTData(cu, *resiYuv, 0, 0);
/* calculate signal bits for inter/merge/skip coded CU */
m_entropyCoder.load(m_rqt[depth].cur);
+ m_entropyCoder.resetBits();
+ if (m_slice->m_pps->bTransquantBypassEnabled)
+ m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
+
uint32_t coeffBits, bits;
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
{
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
/* Merge/Skip */
- m_entropyCoder.resetBits();
- if (m_slice->m_pps->bTransquantBypassEnabled)
- m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);
m_entropyCoder.codeMergeIndex(cu, 0);
coeffBits = 0;
@@ -2260,17 +2559,14 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
}
else
{
- m_entropyCoder.resetBits();
- if (m_slice->m_pps->bTransquantBypassEnabled)
- m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);
m_entropyCoder.codePredMode(cu.m_predMode[0]);
- m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
m_entropyCoder.codePredInfo(cu, 0);
uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
bits = m_entropyCoder.getNumberOfWrittenBits();
coeffBits = bits - mvBits;
@@ -2284,9 +2580,9 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
reconYuv->copyFromYuv(*predYuv);
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
- uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
- bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ uint32_t bestDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+ bestDist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
if (m_rdCost.m_psyRd)
interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
@@ -2297,50 +2593,14 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
updateModeCost(interMode);
}
-void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
-{
- CUData& cu = mode.cu;
-
- m_quant.setQPforQuant(mode.cu);
-
- if (cu.m_predMode[0] == MODE_INTER)
- {
- uint32_t tuDepthRange[2];
- cu.getInterTUQtDepthRange(tuDepthRange, 0);
-
- residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
- if (cu.getQtRootCbf(0))
- mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
- else
- {
- mode.reconYuv.copyFromYuv(mode.predYuv);
- if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
- cu.setSkipFlagSubParts(true);
- }
- }
- else if (cu.m_predMode[0] == MODE_INTRA)
- {
- uint32_t tuDepthRange[2];
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
- residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
- getBestIntraModeChroma(mode, cuGeom);
- residualQTIntraChroma(mode, cuGeom, 0, 0);
- mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
- }
-}
-
-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
+ uint32_t depth = cuGeom.depth + tuDepth;
CUData& cu = mode.cu;
- X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
-
- uint32_t log2TrSize = g_maxLog2CUSize - depth;
- uint32_t tuDepth = depth - cu.m_cuDepth[0];
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bCheckFull = log2TrSize <= depthRange[1];
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+ if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
bCheckFull = false;
if (bCheckFull)
@@ -2349,43 +2609,42 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+ uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
uint32_t setCbf = 1 << tuDepth;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
- coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
+ coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
uint32_t sizeIdx = log2TrSize - 2;
- cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
const Yuv* fencYuv = mode.fencYuv;
- int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = resiYuv.m_size;
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSigY)
{
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
+ m_quant.invtransformNxN(curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
}
else
{
- primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
+ primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
}
@@ -2395,8 +2654,8 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
uint32_t strideResiC = resiYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
- coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
+ coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
+ coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
@@ -2409,30 +2668,30 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
- pixel* fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
+ const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
if (numSigU)
{
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
+ m_quant.invtransformNxN(curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
- primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
+ primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
- pixel* fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
+ const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
if (numSigV)
{
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
+ m_quant.invtransformNxN(curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
- primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
+ primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
}
@@ -2449,48 +2708,58 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
{
X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; i++)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
+ for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
- cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
- cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
- cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
+ cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
+ cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
+ cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
}
}
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
+uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
+{
+ uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
+
+ if (m_rdCost.m_psyRd)
+ return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
+ else
+ return m_rdCost.calcRdCost(dist, nullBits);
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
- uint32_t log2TrSize = g_maxLog2CUSize - depth;
+ uint32_t depth = cuGeom.depth + tuDepth;
+ uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bCheckSplit = log2TrSize > depthRange[0];
bool bCheckFull = log2TrSize <= depthRange[1];
+ bool bSplitPresentFlag = bCheckSplit && bCheckFull;
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
+ if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
bCheckFull = false;
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
- X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
- uint32_t tuDepth = depth - cu.m_cuDepth[0];
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSizeC < 2)
{
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
// code full block
@@ -2499,9 +2768,9 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
- uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
@@ -2509,7 +2778,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
uint32_t trSize = 1 << log2TrSize;
const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+ uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
const Yuv* fencYuv = mode.fencYuv;
// code full block
@@ -2526,221 +2795,201 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
- cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
- int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
+ int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
-
- uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
- if (bCodeChroma)
- {
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
-
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-
- cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
-
- if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
- m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
-
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
- resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
- cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
-
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
- if (cbfFlag[chromaId][tuIterator.section])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-
- uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ if (bSplitPresentFlag && log2TrSize > depthRange[0])
+ m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
+ fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsPrev = newBits;
- }
- while (tuIterator.isNextSection());
- }
- }
+ // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
+ // So it is valid if we encode coefficients and then cbfs at least for analysis.
+// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
+ if (cbfFlag[TEXT_LUMA][0])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- const uint32_t numCoeffY = 1 << (log2TrSize * 2);
- const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
+ uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
- uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
+ uint32_t distY = primitives.cu[partSize].ssd_s(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
uint32_t psyEnergyY = 0;
if (m_rdCost.m_psyRd)
psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
- int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
if (cbfFlag[TEXT_LUMA][0])
{
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
+ m_quant.invtransformNxN(curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
- const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
- uint32_t nonZeroPsyEnergyY = 0;
+ // non-zero cost calculation for luma - This is an approximation
+ // finally we have to encode correct cbf after comparing with null cost
+ const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+ uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+ uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
+ {
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+ singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+ }
+ else
+ singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
if (cu.m_tqBypass[0])
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
else
{
- uint64_t singleCostY = 0;
- if (m_rdCost.m_psyRd)
- singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
- else
- singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostY = 0;
- if (m_rdCost.m_psyRd)
- nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
+ // zero-cost calculation for luma. This is an approximation
+ // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
+ // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
+ uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+
if (nullCostY < singleCostY)
{
cbfFlag[TEXT_LUMA][0] = 0;
+ singleBits[TEXT_LUMA][0] = 0;
+ primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
#endif
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = nullCostY;
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
else
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = singleCostY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
}
}
- else if (checkTransformSkipY)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
+ if (checkTransformSkipY)
+ minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+ primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
- singleDistComp[TEXT_LUMA][0] = distY;
- singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
- if (!cbfFlag[TEXT_LUMA][0])
- primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t distC = 0, psyEnergyC = 0;
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ do
+ {
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+ if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+ m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- if (cbfFlag[chromaId][tuIterator.section])
- {
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
- log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
- uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
- const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
- uint32_t nonZeroPsyEnergyC = 0;
- if (m_rdCost.m_psyRd)
- nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-
- if (cu.m_tqBypass[0])
- {
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
- }
- else
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
+ resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+ numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+ cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+
+ //Coding cbf flags has been removed from here
+// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
+ if (cbfFlag[chromaId][tuIterator.section])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+ uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ singleBitsPrev = newBits;
+
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ distC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].ssd_s(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+
+ if (cbfFlag[chromaId][tuIterator.section])
{
- uint64_t singleCostC = 0;
+ m_quant.invtransformNxN(curResiC, strideResiC, coeffCurC + subTUOffset,
+ log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
+
+ // non-zero cost calculation for luma, same as luma - This is an approximation
+ // finally we have to encode correct cbf after comparing with null cost
+ uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+ uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+ uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ {
+ nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ }
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostC = 0;
- if (m_rdCost.m_psyRd)
- nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+
+ if (cu.m_tqBypass[0])
+ {
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ }
else
- nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
- if (nullCostC < singleCostC)
{
- cbfFlag[chromaId][tuIterator.section] = 0;
+ //zero-cost calculation for chroma. This is an approximation
+ uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
+
+ if (nullCostC < singleCostC)
+ {
+ cbfFlag[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
+ primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = nullCostC;
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
else
{
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = singleCostC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
}
}
}
- else if (checkTransformSkipC)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
- else
- minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
+ if (checkTransformSkipC)
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
+ primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
- singleDistComp[chromaId][tuIterator.section] = distC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
-
- if (!cbfFlag[chromaId][tuIterator.section])
- primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
-
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
while (tuIterator.isNextSection());
@@ -2763,20 +3012,20 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
if (numSigTSkipY)
{
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
+ m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
+ m_quant.invtransformNxN(tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
- nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
+ nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
if (m_rdCost.m_psyRd)
{
@@ -2791,12 +3040,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
else
{
- singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
- singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
bestTransformMode[TEXT_LUMA][0] = 1;
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
- primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
+ primitives.cu[partSize].copy_ss(curResiY, strideResiY, tsResiY, trSize);
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
@@ -2821,7 +3071,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
@@ -2831,42 +3081,43 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
m_entropyCoder.resetBits();
- singleBitsComp[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
if (numSigTSkipC)
{
- m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
- singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
+ m_quant.invtransformNxN(tsResiC, trSizeC, tsCoeffC,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
- uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
+ uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
+ nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
}
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
else
{
- singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
bestTransformMode[chromaId][tuIterator.section] = 1;
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
- primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
+ primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, tsResiC, trSizeC);
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
@@ -2875,66 +3126,58 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
}
}
+ // Here we were encoding cbfs and coefficients, after calculating distortion above.
+ // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
+ // bits required for coefficients and added with number of cbf bits. As I tested the order does not
+ // make any difference. But bit confused whether I should load the original context as below.
m_entropyCoder.load(m_rqt[depth].rqtRoot);
-
m_entropyCoder.resetBits();
- if (log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
-
+ //Encode cbf flags
if (bCodeChroma)
{
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (!splitIntoSubTUs)
{
- if (!splitIntoSubTUs)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
- else
- {
- offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
- }
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
+ }
+ else
+ {
+ offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+ offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
}
}
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
- if (bCodeChroma)
- {
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- if (!splitIntoSubTUs)
- {
- if (cbfFlag[chromaId][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
- }
- else
- {
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- if (cbfFlag[chromaId][subTU])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
- }
- }
- }
+ uint32_t coeffBits = 0;
+ coeffBits = singleBits[TEXT_LUMA][0];
+ for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
+ {
+ coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
+ coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.distortion += singleDistComp[TEXT_LUMA][0];
- fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
+ // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
+ // In case of chroma, if any one of the splitted block's cbf is 1, then we need to encode cbf 1, and then for
+ // four splitted block's individual cbf value. This is not known before analysis of four splitted blocks.
+ // For that reason, I am collecting individual coefficient bits only.
+ fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+
+ fullCost.distortion += singleDist[TEXT_LUMA][0];
+ fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
- fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
- fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
if (m_rdCost.m_psyRd)
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else
@@ -2951,31 +3194,40 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
}
Cost splitCost;
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+ {
+ // Subdiv flag can be encoded at the start of anlysis of splitted blocks.
+ m_entropyCoder.resetBits();
+ m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+ splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ }
+
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; ++i)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
+ for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
+ // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
+ // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
+ // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
+ // at depth 0 (for example).
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
- encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
-
- splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
+ uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+ splitCost.bits += splitCbfBits;
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
@@ -2999,15 +3251,18 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
-
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
+ if (!splitIntoSubTUs)
{
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
-
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
X265_CHECK(bCheckFull, "check-full must be set\n");
@@ -3019,23 +3274,21 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
if (bCodeChroma)
{
- uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
-
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (!splitIntoSubTUs)
{
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
- {
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
- if (splitIntoSubTUs)
- {
- uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
- cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
- else
- cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
+ offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
+ offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
@@ -3045,147 +3298,61 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
outCosts.energy += fullCost.energy;
}
-void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
+void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
- X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
- X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
-
- const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
- const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
- const bool bSubdiv = curTuDepth != tuDepth;
- const uint32_t log2TrSize = g_maxLog2CUSize - depth;
+ X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ const bool bSubdiv = tuDepth < cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
- if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
-
- bool mCodeAll = true;
- uint32_t trWidthC = 1 << log2TrSizeC;
- uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
-
- const uint32_t numPels = trWidthC * trHeightC;
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
-
- if (bSubdivAndCbf)
+ if (!(log2TrSize - m_hChromaShift < 2))
{
- const bool bFirstCbfOfCU = curTuDepth == 0;
- if (bFirstCbfOfCU || mCodeAll)
- {
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
- }
- else
- {
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
- }
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
+ }
+ else
+ {
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
}
if (!bSubdiv)
{
- // Luma
- const uint32_t qtLayer = log2TrSize - 2;
- uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
- coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
-
- // Chroma
- bool bCodeChroma = true;
- uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
- {
- log2TrSizeC++;
- tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
- }
-
- if (bSubdivAndCbf)
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
- else
- {
- if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
-
- if (bCodeChroma)
- {
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
- coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
-
- if (!splitIntoSubTUs)
- {
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
- }
- else
- {
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
- }
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
- }
- }
- }
- }
+ m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
}
else
{
- if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
- {
- const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; ++i)
- encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
- }
+ uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange);
}
}
-void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
+void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
- X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
- const uint32_t curTrMode = depth - cu.m_cuDepth[0];
- const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
- if (curTrMode < tuDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
- saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
return;
}
- const uint32_t log2TrSize = g_maxLog2CUSize - depth;
const uint32_t qtLayer = log2TrSize - 2;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
diff --git a/source/encoder/search.h b/source/encoder/search.h
index 79ed94a..081954b 100644
--- a/source/encoder/search.h
+++ b/source/encoder/search.h
@@ -35,9 +35,6 @@
#include "entropy.h"
#include "motion.h"
-#define MVP_IDX_BITS 1
-#define NUM_LAYERS 4
-
namespace x265 {
// private namespace
@@ -68,6 +65,63 @@ struct RQTData
Yuv bidirPredYuv[2];
};
+struct MotionData
+{
+ MV mv;
+ MV mvp;
+ int mvpIdx;
+ int ref;
+ uint32_t cost;
+ int bits;
+};
+
+struct Mode
+{
+ CUData cu;
+ const Yuv* fencYuv;
+ Yuv predYuv;
+ Yuv reconYuv;
+ Entropy contexts;
+
+ enum { MAX_INTER_PARTS = 2 };
+
+ MotionData bestME[MAX_INTER_PARTS][2];
+ MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
+
+ uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits)
+ uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits)
+ uint32_t sa8dBits; // signal bits used in sa8dCost calculation
+ uint32_t psyEnergy; // sum of partition psycho-visual energy difference
+ uint32_t distortion; // sum of partition SSE distortion
+ uint32_t totalBits; // sum of partition bits (mv + coeff)
+ uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
+ uint32_t coeffBits; // Texture bits (DCT Coeffs)
+
+ void initCosts()
+ {
+ rdCost = 0;
+ sa8dCost = 0;
+ sa8dBits = 0;
+ psyEnergy = 0;
+ distortion = 0;
+ totalBits = 0;
+ mvBits = 0;
+ coeffBits = 0;
+ }
+
+ void addSubCosts(const Mode& subMode)
+ {
+ rdCost += subMode.rdCost;
+ sa8dCost += subMode.sa8dCost;
+ sa8dBits += subMode.sa8dBits;
+ psyEnergy += subMode.psyEnergy;
+ distortion += subMode.distortion;
+ totalBits += subMode.totalBits;
+ mvBits += subMode.mvBits;
+ coeffBits += subMode.coeffBits;
+ }
+};
+
inline int getTUBits(int idx, int numIdx)
{
return idx + (idx < numIdx - 1);
@@ -98,58 +152,6 @@ public:
uint32_t m_numLayers;
uint32_t m_refLagPixels;
- struct Mode
- {
- CUData cu;
- const Yuv* fencYuv;
- Yuv predYuv;
- Yuv reconYuv;
- Entropy contexts;
-
- uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits)
- uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits)
- uint32_t sa8dBits; // signal bits used in sa8dCost calculation
- uint32_t psyEnergy; // sum of partition psycho-visual energy difference
- uint32_t distortion; // sum of partition SSE distortion
- uint32_t totalBits; // sum of partition bits (mv + coeff)
- uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
- uint32_t coeffBits; // Texture bits (DCT Coeffs)
-
- void initCosts()
- {
- rdCost = 0;
- sa8dCost = 0;
- sa8dBits = 0;
- psyEnergy = 0;
- distortion = 0;
- totalBits = 0;
- mvBits = 0;
- coeffBits = 0;
- }
-
- void addSubCosts(const Mode& subMode)
- {
- rdCost += subMode.rdCost;
- sa8dCost += subMode.sa8dCost;
- sa8dBits += subMode.sa8dBits;
- psyEnergy += subMode.psyEnergy;
- distortion += subMode.distortion;
- totalBits += subMode.totalBits;
- mvBits += subMode.mvBits;
- coeffBits += subMode.coeffBits;
- }
- };
-
- struct MotionData
- {
- MV mv;
- MV mvp;
- int mvpIdx;
- int ref;
- uint32_t cost;
- int bits;
- };
-
Search();
~Search();
@@ -162,6 +164,11 @@ public:
// full RD search of intra modes. if sharedModes is not NULL, it directly uses them
void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes);
+ // select best intra mode using only sa8d costs, cannot measure NxN intra
+ void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+ // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode
+ void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+
// estimation inter prediction (non-skip)
bool predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma);
@@ -169,38 +176,41 @@ public:
void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
void encodeResAndCalcRdSkipCU(Mode& interMode);
- void generateCoeffRecon(Mode& mode, const CUGeom& cuGeom);
- void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]);
+ // encode residual without rd-cost
+ void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+ void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+ void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth);
- uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
+ // pick be chroma mode from available using just sa8d costs
+ void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
protected:
/* motion estimation distribution */
ThreadLocalData* m_tld;
- CUData* m_curMECu;
+ Mode* m_curInterMode;
const CUGeom* m_curGeom;
int m_curPart;
- MotionData m_bestME[2];
uint32_t m_listSelBits[3];
int m_totalNumME;
volatile int m_numAcquiredME;
volatile int m_numCompletedME;
Event m_meCompletionEvent;
- Lock m_outputLock;
+ Lock m_meLock;
bool m_bJobsQueued;
- void singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref);
+ void singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
- void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth);
+ void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
// RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
- uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes);
+ uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
// RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
- void codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height);
- void codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype);
+ void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
+ void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
+ void codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
struct Cost
{
@@ -211,24 +221,21 @@ protected:
Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
};
- void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]);
-
- void encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]);
+ uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+ void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
// generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
- void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, uint32_t depthRange[2]);
- void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& costs);
- void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx);
+ void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
+ void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
+ void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
// generate chroma prediction, generate residual and recon
- uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
- uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
- void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad);
-
- void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]);
- void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx);
+ uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
+ uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+ void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
- void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx);
+ // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
+ void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
struct MergeData
{
@@ -258,7 +265,9 @@ protected:
/* intra helper functions */
enum { MAX_RD_INTRA_MODES = 16 };
static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
- void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
+
+ // get most probable luma modes for CU part, and bit cost of all non mpm modes
+ uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
};
diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp
index cc70c20..8350be7 100644
--- a/source/encoder/slicetype.cpp
+++ b/source/encoder/slicetype.cpp
@@ -59,12 +59,14 @@ Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
: JobProvider(pool)
, m_est(pool)
{
- m_bReady = 0;
+ m_bReady = false;
+ m_bBusy = false;
m_param = param;
m_lastKeyframe = -m_param->keyframeMax;
m_lastNonB = NULL;
- m_bFilling = true;
+ m_bFilled = false;
m_bFlushed = false;
+ m_bFlush = false;
m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
@@ -79,17 +81,27 @@ void Lookahead::init()
((m_param->bFrameAdaptive && m_param->bframes) ||
m_param->rc.cuTree || m_param->scenecutThreshold ||
(m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
- m_pool = m_pool; /* allow use of worker thread */
+ {
+ JobProvider::enqueue();
+ }
else
m_pool = NULL; /* disable use of worker thread */
}
-void Lookahead::destroy()
+void Lookahead::stop()
{
+ /* do not allow slicetypeDecide() to get started again */
+ m_bReady = false;
+ m_bFlushed = false;
+ m_bFlush = false;
+ m_bBusy = false;
+
if (m_pool)
- // flush will dequeue, if it is necessary
- JobProvider::flush();
+ JobProvider::flush(); // flush will dequeue, if it is necessary
+}
+void Lookahead::destroy()
+{
// these two queues will be empty unless the encode was aborted
while (!m_inputQueue.empty())
{
@@ -111,56 +123,64 @@ void Lookahead::destroy()
/* Called by API thread */
void Lookahead::addPicture(Frame *curFrame, int sliceType)
{
- PicYuv *orig = curFrame->m_origPicYuv;
-
- curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
+ {
+ ProfileScopeEvent(prelookahead);
+ PicYuv *orig = curFrame->m_fencPic;
+ curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
+ }
m_inputQueueLock.acquire();
m_inputQueue.pushBack(*curFrame);
if (m_inputQueue.size() >= m_param->lookaheadDepth)
{
- /* when queue fills the first time, run slicetypeDecide synchronously,
- * since the encoder will always be blocked here */
- if (m_pool && !m_bFilling)
+ if (m_pool)
{
+ m_bReady = !m_bBusy;
m_inputQueueLock.release();
- m_bReady = 1;
m_pool->pokeIdleThread();
}
else
slicetypeDecide();
-
- if (m_bFilling && m_pool)
- JobProvider::enqueue();
- m_bFilling = false;
}
else
m_inputQueueLock.release();
+
+ /* determine if the lookahead is (over) filled enough for frames to begin to
+ * be consumed by frame encoders */
+ if (!m_bFilled)
+ {
+ if (!m_param->bframes & !m_param->lookaheadDepth)
+ m_bFilled = true; /* zero-latency */
+ else if (curFrame->m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
+ m_bFilled = true; /* full capacity plus mini-gop lag */
+ }
}
/* Called by API thread */
void Lookahead::flush()
{
- /* just in case the input queue is never allowed to fill */
- m_bFilling = false;
+ m_bFlush = true;
+ m_bFilled = true;
- /* flush synchronously */
+ /* just in case the input queue is never allowed to fill */
m_inputQueueLock.acquire();
- if (!m_inputQueue.empty())
+ if (m_inputQueue.empty())
{
- slicetypeDecide();
+ m_bFlushed = true;
+ m_inputQueueLock.release();
}
else
- m_inputQueueLock.release();
-
- m_inputQueueLock.acquire();
-
- /* bFlushed indicates that an empty output queue actually means all frames
- * have been decided (no more inputs for the encoder) */
- if (m_inputQueue.empty())
- m_bFlushed = true;
- m_inputQueueLock.release();
+ {
+ if (m_pool)
+ {
+ m_bReady = !m_bBusy;
+ m_inputQueueLock.release();
+ m_pool->pokeIdleThread();
+ }
+ else
+ slicetypeDecide();
+ }
}
/* Called by API thread. If the lookahead queue has not yet been filled the
@@ -169,37 +189,60 @@ void Lookahead::flush()
* flush() has been called and the output queue is empty, NULL is returned. */
Frame* Lookahead::getDecidedPicture()
{
+ if (!m_bFilled)
+ return NULL;
+
m_outputQueueLock.acquire();
+ Frame *fenc = m_outputQueue.popFront();
+ m_outputQueueLock.release();
- if (m_bFilling)
- {
- m_outputQueueLock.release();
- return NULL;
- }
+ if (fenc || m_bFlushed)
+ return fenc;
- while (m_outputQueue.empty() && !m_bFlushed)
+ do
{
- m_outputQueueLock.release();
m_outputAvailable.wait();
+
m_outputQueueLock.acquire();
+ fenc = m_outputQueue.popFront();
+ m_outputQueueLock.release();
}
+ while (!fenc);
- Frame *fenc = m_outputQueue.popFront();
- m_outputQueueLock.release();
return fenc;
}
/* Called by pool worker threads */
bool Lookahead::findJob(int)
{
- if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
+ if (!m_bReady)
+ return false;
+
+ m_inputQueueLock.acquire();
+ if (!m_bReady)
{
+ m_inputQueueLock.release();
+ return false;
+ }
+
+ m_bReady = false;
+ m_bBusy = true;
+
+ do
+ {
+ slicetypeDecide(); // releases input queue lock
+
m_inputQueueLock.acquire();
- slicetypeDecide();
- return true;
+
+ if (!m_bBusy)
+ break;
}
- else
- return false;
+ while (m_inputQueue.size() >= m_param->lookaheadDepth ||
+ (m_bFlush && m_inputQueue.size()));
+
+ m_bBusy = false;
+ m_inputQueueLock.release();
+ return true;
}
/* Called by rate-control to calculate the estimated SATD cost for a given
@@ -290,7 +333,7 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame)
/* called by API thread or worker thread with inputQueueLock acquired */
void Lookahead::slicetypeDecide()
{
- ScopedLock lock(m_decideLock);
+ ProfileScopeEvent(slicetypeDecideEV);
Lowres *frames[X265_LOOKAHEAD_MAX];
Frame *list[X265_LOOKAHEAD_MAX];
@@ -417,7 +460,6 @@ void Lookahead::slicetypeDecide()
list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
brefs++;
}
-
/* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
if (m_param->rc.rateControlMode != X265_RC_CQP)
{
@@ -524,14 +566,12 @@ void Lookahead::slicetypeDecide()
void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
{
int prevNonB = 0, curNonB = 1, idx = 0;
- bool isNextNonB = false;
-
while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
curNonB++;
-
int nextNonB = keyframe ? prevNonB : curNonB;
- int nextB = keyframe ? prevNonB + 1 : curNonB + 1;
-
+ int nextB = prevNonB + 1;
+ int nextBRef = 0;
+ int miniGopEnd = keyframe ? prevNonB : curNonB;
while (curNonB < numFrames + !keyframe)
{
/* P/I cost: This shouldn't include the cost of nextNonB */
@@ -540,38 +580,53 @@ void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+ /* Save the nextNonB Cost in each B frame of the current miniGop */
+ if (curNonB > miniGopEnd)
+ {
+ for (int j = nextB; j < miniGopEnd; j++)
+ {
+ frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
+ frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
+
+ }
+ }
idx++;
}
/* Handle the B-frames: coded order */
- for (int i = prevNonB + 1; i < curNonB; i++, idx++)
- {
- frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i);
- frames[nextNonB]->plannedType[idx] = X265_TYPE_B;
- }
+ if (m_param->bBPyramid && curNonB - prevNonB > 1)
+ nextBRef = (prevNonB + curNonB + 1) / 2;
- for (int i = nextB; i <= curNonB; i++)
+ for (int i = prevNonB + 1; i < curNonB; i++, idx++)
{
- for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++)
+ int64_t satdCost = 0; int type = X265_TYPE_B;
+ if (nextBRef)
{
- if (j == curNonB)
+ if (i == nextBRef)
{
- if (isNextNonB)
- {
- int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
- frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB);
- frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType;
- }
+ satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
+ type = X265_TYPE_BREF;
}
+ else if (i < nextBRef)
+ satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i);
else
- {
- frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j);
- frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B;
- }
+ satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
}
- if (i == curNonB && !isNextNonB)
- isNextNonB = true;
- }
+ else
+ satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
+ frames[nextNonB]->plannedSatd[idx] = satdCost;
+ frames[nextNonB]->plannedType[idx] = type;
+ /* Save the nextB Cost in each B frame of the current miniGop */
+ for (int j = nextB; j < miniGopEnd; j++)
+ {
+ if (nextBRef && i == nextBRef)
+ break;
+ if (j >= i && j !=nextBRef)
+ continue;
+ frames[j]->plannedSatd[frames[j]->indB] = satdCost;
+ frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
+ }
+ }
prevNonB = curNonB;
curNonB++;
while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
@@ -1238,7 +1293,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame)
if (m_param->bEnableWeightedPred)
{
- PicYuv *orig = curFrame->m_origPicYuv;
+ PicYuv *orig = curFrame->m_fencPic;
m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
@@ -1249,7 +1304,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame)
m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
}
- m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0];
+ m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
m_weightedRef.isLowres = true;
m_weightedRef.isWeighted = false;
@@ -1290,7 +1345,6 @@ int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b,
for (int i = 0; i < m_heightInCU; i++)
{
m_rows[i].init();
- m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
if (!fenc->bIntraCalculated)
fenc->rowSatds[0][0][i] = 0;
fenc->rowSatds[b - p0][p1 - b][i] = 0;
@@ -1351,7 +1405,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
{
Lowres *fenc = frames[b];
Lowres *ref = frames[p0];
- pixel *src = ref->fpelPlane;
+ pixel *src = ref->fpelPlane[0];
intptr_t stride = fenc->lumaStride;
if (wp)
@@ -1365,7 +1419,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
scale, round << correction, denom + correction, offset);
- src = m_weightedRef.fpelPlane;
+ src = m_weightedRef.fpelPlane[0];
}
uint32_t cost = 0;
@@ -1376,7 +1430,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
{
for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
{
- int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride);
+ int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
cost += X265_MIN(satd, fenc->intraCost[mb]);
}
}
@@ -1428,9 +1482,9 @@ void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
/* Rescale considering the constraints on curOffset. We do it in this order
* because scale has a much wider range than offset (because of denom), so
* it should almost never need to be clamped. */
- curOffset = Clip3(-128, 127, curOffset);
+ curOffset = x265_clip3(-128, 127, curOffset);
curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
- curScale = Clip3(0, 127, curScale);
+ curScale = x265_clip3(0, 127, curScale);
}
SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset);
s = weightCostLuma(frames, b, p0, &m_w);
@@ -1469,6 +1523,8 @@ void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
void CostEstimate::processRow(int row, int /*threadId*/)
{
+ ProfileScopeEvent(costEstimateRow);
+
int realrow = m_heightInCU - 1 - row;
Lowres **frames = m_curframes;
ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
@@ -1531,7 +1587,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
- m_me.setSourcePU(pelOffset, cuSize, cuSize);
+ m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
int lowresPenalty = 4;
@@ -1592,115 +1648,106 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
}
if (bBidir)
{
- pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+ ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+ ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
- pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
- primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
- int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
+ ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+ primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
+ int bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
// Try 0,0 candidates
src0 = wfref0->lowresPlane[0] + pelOffset;
src1 = fref1->lowresPlane[0] + pelOffset;
- primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32);
- bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
+ primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32);
+ bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
}
}
+
if (!fenc->bIntraCalculated)
{
+ ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+ pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
+ const int cuSize2 = cuSize << 1;
- pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
- pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
- pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE;
- pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE;
-
- pixel *pix_cur = fenc->lowresPlane[0] + pelOffset;
+ pixel *pixCur = fenc->lowresPlane[0] + pelOffset;
// Copy Above
- memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
+ memcpy(neighbours[0], pixCur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
// Copy Left
- for (int i = 0; i < cuSize + 1; i++)
- {
- left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
- }
+ for (int i = 1; i < cuSize + 1; i++)
+ neighbours[0][i + cuSize2] = pixCur[-1 - fenc->lumaStride + i * fenc->lumaStride];
for (int i = 0; i < cuSize; i++)
{
- above0[cuSize + i + 1] = above0[cuSize];
- left0[cuSize + i + 1] = left0[cuSize];
+ // Copy above-last pixel
+ neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; //neighbours[0][i + 9] = neighbours[0][8]
+ // Copy left-last pixel
+ neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; //neighbours[0][i + 25] = neighbours[0][24]
}
- // filtering with [1 2 1]
- // assume getUseStrongIntraSmoothing() is disabled
- above1[0] = above0[0];
- above1[2 * cuSize] = above0[2 * cuSize];
- left1[0] = left0[0];
- left1[2 * cuSize] = left0[2 * cuSize];
- for (int i = 1; i < 2 * cuSize; i++)
+ // Filter neighbour pixels with [1-2-1]
+ neighbours[1][0] = neighbours[0][0]; // Copy top-left pixel
+ neighbours[1][cuSize2] = neighbours[0][cuSize2]; //Copy top-right pixel
+ neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
+
+ neighbours[1][1] = (neighbours[0][0] + (neighbours[0][1] << 1) + neighbours[0][2] + 2) >> 2;
+ neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
+ for (int i = 2; i < cuSize2; i++)
{
- above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2;
- left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2;
+ neighbours[1][i] = (neighbours[0][i - 1] + (neighbours[0][i] << 1) + neighbours[0][i + 1] + 2) >> 2;
+ neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
}
- int predsize = cuSize * cuSize;
-
- // generate 35 intra predictions into m_predictions
- pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
- int icost = m_me.COST_MAX, cost;
- primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
- cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
- if (cost < icost)
- icost = cost;
- pixel *above = (cuSize >= 8) ? above1 : above0;
- pixel *left = (cuSize >= 8) ? left1 : left0;
- primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
- cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
- if (cost < icost)
- icost = cost;
- primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
-
- // calculate satd costs, keep least cost
- ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
- primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
-
- int acost = m_me.COST_MAX;
+ int icost = m_me.COST_MAX, ilowmode;
+ primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
+ int cost = m_me.bufSATD(prediction, cuSize);
+ COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
+
+ pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
+ primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
+ cost = m_me.bufSATD(prediction, cuSize);
+ COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
+
uint32_t mode, lowmode = 4;
+ int acost = m_me.COST_MAX, filter;
for (mode = 5; mode < 35; mode += 5)
{
- if (mode < 18)
- cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
- else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ filter = !!(g_intraFilterFlags[mode] & cuSize);
+ primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+ cost = m_me.bufSATD(prediction, cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
}
for (uint32_t dist = 2; dist >= 1; dist--)
{
- mode = lowmode - dist;
- if (mode < 18)
- cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
- else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ int minusmode = lowmode - dist;
+ int plusmode = lowmode + dist;
+
+ mode = minusmode;
+ filter = !!(g_intraFilterFlags[mode] & cuSize);
+ primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+ cost = m_me.bufSATD(prediction, cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
- mode = lowmode + dist;
- if (mode < 18)
- cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
- else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ mode = plusmode;
+ filter = !!(g_intraFilterFlags[mode] & cuSize);
+ primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+ cost = m_me.bufSATD(prediction, cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
}
- if (acost < icost)
- icost = acost;
+ COPY2_IF_LT(icost, acost, ilowmode, lowmode);
const int intraPenalty = 5 * m_lookAheadLambda;
icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
fenc->intraCost[cuXY] = icost;
+ fenc->intraMode[cuXY] = (uint8_t)ilowmode;
+
int icostAq = icost;
if (bFrameScoreCU)
{
diff --git a/source/encoder/slicetype.h b/source/encoder/slicetype.h
index 8805e90..123bf25 100644
--- a/source/encoder/slicetype.h
+++ b/source/encoder/slicetype.h
@@ -53,7 +53,6 @@ public:
x265_param* m_param;
MotionEstimate m_me;
Lock m_lock;
- pixel* m_predictions; // buffer for 35 intra predictions
volatile uint32_t m_completed; // Number of CUs in this row for which cost estimation is completed
volatile bool m_active;
@@ -73,18 +72,11 @@ public:
EstimateRow()
{
m_me.setQP(X265_LOOKAHEAD_QP);
- m_me.setSearchMethod(X265_HEX_SEARCH);
- m_me.setSubpelRefine(1);
- m_predictions = X265_MALLOC(pixel, 35 * 8 * 8);
+ m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
m_merange = 16;
m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
}
- ~EstimateRow()
- {
- X265_FREE(m_predictions);
- }
-
void init();
void estimateCUCost(Lowres * *frames, ReferencePlanes * wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
@@ -148,20 +140,25 @@ public:
void addPicture(Frame*, int sliceType);
void flush();
+ void stop();
Frame* getDecidedPicture();
void getEstimatedPictureCost(Frame *pic);
protected:
+
Lock m_inputQueueLock;
Lock m_outputQueueLock;
- Lock m_decideLock;
Event m_outputAvailable;
- volatile int m_bReady;
- volatile bool m_bFilling;
- volatile bool m_bFlushed;
- bool findJob(int);
+
+ bool m_bReady; /* input lock - slicetypeDecide() can be started */
+ bool m_bBusy; /* input lock - slicetypeDecide() is running */
+ bool m_bFilled; /* enough frames in lookahead for output to be available */
+ bool m_bFlushed; /* all frames have been decided, lookahead is finished */
+ bool m_bFlush; /* no more frames will be received, empty the input queue */
+
+ bool findJob(int);
/* called by addPicture() or flush() to trigger slice decisions */
void slicetypeDecide();
diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp
index 3bf5a45..31efdc7 100644
--- a/source/encoder/weightPrediction.cpp
+++ b/source/encoder/weightPrediction.cpp
@@ -80,7 +80,7 @@ void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
MV mv = mvs[cu];
mv = mv.clipped(mvmin, mvmax);
pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
- primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
+ primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
}
}
}
@@ -133,26 +133,26 @@ void mcChroma(pixel * mcout,
int yFrac = mv.y & 0x7;
if ((yFrac | xFrac) == 0)
{
- primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
+ primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
}
else if (yFrac == 0)
{
- primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
+ primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
}
else if (xFrac == 0)
{
- primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
+ primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
}
else
{
ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
- primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
- primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
+ primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
+ primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
}
}
else
{
- primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
+ primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
}
}
}
@@ -193,23 +193,23 @@ uint32_t weightCost(pixel * fenc,
if (bLuma)
{
int cu = 0;
- for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
+ for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
{
- for (int x = 8; x < width; x += 8, cu++)
+ for (int x = 0; x < width; x += 8, cu++)
{
- int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
+ int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
cost += X265_MIN(cmp, cache.intraCost[cu]);
}
}
}
else if (cache.csp == X265_CSP_I444)
- for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
- for (int x = 16; x < width; x += 16)
- cost += primitives.satd[LUMA_16x16](r + x, stride, f + x, stride);
+ for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
+ for (int x = 0; x < width; x += 16)
+ cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
else
- for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
- for (int x = 8; x < width; x += 8)
- cost += primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
+ for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
+ for (int x = 0; x < width; x += 8)
+ cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
return cost;
}
@@ -219,7 +219,7 @@ namespace x265 {
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
{
WeightParam wp[2][MAX_NUM_REF][3];
- PicYuv *fencPic = frame.m_origPicYuv;
+ PicYuv *fencPic = frame.m_fencPic;
Lowres& fenc = frame.m_lowres;
Cache cache;
@@ -303,7 +303,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
if (plane)
{
- int scale = Clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
+ int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
if (scale > 127)
continue;
weights[plane].inputWeight = scale;
@@ -329,7 +329,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
if (!refFrame->m_bChromaExtended)
{
refFrame->m_bChromaExtended = true;
- PicYuv *refPic = refFrame->m_origPicYuv;
+ PicYuv *refPic = refFrame->m_fencPic;
int width = refPic->m_picWidth >> cache.hshift;
int height = refPic->m_picHeight >> cache.vshift;
extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
@@ -363,7 +363,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
case 1:
orig = fencPic->m_picOrg[1];
stride = fencPic->m_strideC;
- fref = refFrame->m_origPicYuv->m_picOrg[1];
+ fref = refFrame->m_fencPic->m_picOrg[1];
/* Clamp the chroma dimensions to the nearest multiple of
* 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
@@ -381,9 +381,9 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
break;
case 2:
- fref = refFrame->m_origPicYuv->m_picOrg[2];
orig = fencPic->m_picOrg[2];
stride = fencPic->m_strideC;
+ fref = refFrame->m_fencPic->m_picOrg[2];
width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
if (mvs)
@@ -413,8 +413,8 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
static const int scaleDist = 4;
static const int offsetDist = 2;
- int startScale = Clip3(0, 127, minscale - scaleDist);
- int endScale = Clip3(0, 127, minscale + scaleDist);
+ int startScale = x265_clip3(0, 127, minscale - scaleDist);
+ int endScale = x265_clip3(0, 127, minscale + scaleDist);
for (int scale = startScale; scale <= endScale; scale++)
{
int deltaWeight = scale - (1 << mindenom);
@@ -429,13 +429,13 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
/* Rescale considering the constraints on curOffset. We do it in this order
* because scale has a much wider range than offset (because of denom), so
* it should almost never need to be clamped. */
- curOffset = Clip3(-128, 127, curOffset);
+ curOffset = x265_clip3(-128, 127, curOffset);
curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
- curScale = Clip3(0, 127, curScale);
+ curScale = x265_clip3(0, 127, curScale);
}
- int startOffset = Clip3(-128, 127, curOffset - offsetDist);
- int endOffset = Clip3(-128, 127, curOffset + offsetDist);
+ int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
+ int endOffset = x265_clip3(-128, 127, curOffset + offsetDist);
for (int off = startOffset; off <= endOffset; off++)
{
WeightParam wsp;
diff --git a/source/filters/filters.cpp b/source/filters/filters.cpp
index 26a26ac..f602c7c 100644
--- a/source/filters/filters.cpp
+++ b/source/filters/filters.cpp
@@ -41,7 +41,7 @@ void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride,
for (int x = 0; x < width; x++)
{
err = err * 2 + errors[x] + errors[x + 1];
- dst[x * pitch] = (pixel)Clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
+ dst[x * pitch] = (pixel)x265_clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift);
}
}
diff --git a/source/input/y4m.cpp b/source/input/y4m.cpp
index e026eeb..cda4dd4 100644
--- a/source/input/y4m.cpp
+++ b/source/input/y4m.cpp
@@ -375,6 +375,7 @@ void Y4MInput::startReader()
void Y4MInput::threadMain()
{
+ THREAD_NAME("Y4MRead", 0);
do
{
if (!populateFrameQueue())
@@ -419,6 +420,7 @@ bool Y4MInput::populateFrameQueue()
return false;
}
+ ProfileScopeEvent(frameRead);
ifs->read(buf[written % QUEUE_SIZE], framesize);
if (ifs->good())
{
diff --git a/source/input/yuv.cpp b/source/input/yuv.cpp
index c13f471..8fad226 100644
--- a/source/input/yuv.cpp
+++ b/source/input/yuv.cpp
@@ -167,6 +167,7 @@ void YUVInput::startReader()
void YUVInput::threadMain()
{
+ THREAD_NAME("YUVRead", 0);
while (threadActive)
{
if (!populateFrameQueue())
@@ -193,6 +194,7 @@ bool YUVInput::populateFrameQueue()
return false;
}
+ ProfileScopeEvent(frameRead);
ifs->read(buf[written % QUEUE_SIZE], framesize);
if (ifs->good())
{
diff --git a/source/profile/CMakeLists.txt b/source/profile/CMakeLists.txt
new file mode 100644
index 0000000..0a6c79d
--- /dev/null
+++ b/source/profile/CMakeLists.txt
@@ -0,0 +1,25 @@
+# vim: syntax=cmake
+
+option(ENABLE_PPA "Enable PPA profiling instrumentation" OFF)
+if(ENABLE_PPA)
+ add_definitions(-DENABLE_PPA)
+ add_subdirectory(PPA)
+ list(APPEND PLATFORM_LIBS PPA)
+ if(UNIX)
+ list(APPEND PLATFORM_LIBS dl)
+ endif(UNIX)
+endif(ENABLE_PPA)
+
+option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
+if(ENABLE_VTUNE)
+ add_definitions(-DENABLE_VTUNE)
+ add_subdirectory(vtune)
+ list(APPEND PLATFORM_LIBS vtune)
+ include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
+ link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
+ if(WIN32)
+ list(APPEND PLATFORM_LIBS libittnotify.lib)
+ else()
+ list(APPEND PLATFORM_LIBS libittnotify.a dl)
+ endif()
+endif(ENABLE_VTUNE)
diff --git a/source/profile/PPA/CMakeLists.txt b/source/profile/PPA/CMakeLists.txt
new file mode 100644
index 0000000..7207a71
--- /dev/null
+++ b/source/profile/PPA/CMakeLists.txt
@@ -0,0 +1 @@
+add_library(PPA ppa.h ppaApi.h ppa.cpp ../cpuEvents.h)
diff --git a/source/PPA/ppa.cpp b/source/profile/PPA/ppa.cpp
similarity index 96%
rename from source/PPA/ppa.cpp
rename to source/profile/PPA/ppa.cpp
index 607a946..cf20871 100644
--- a/source/PPA/ppa.cpp
+++ b/source/profile/PPA/ppa.cpp
@@ -27,13 +27,13 @@
#include <stdlib.h>
#define PPA_REGISTER_CPU_EVENT2GROUP(x, y) # x, # y,
-#define PPA_REGISTER_CPU_EVENT(x) PPA_REGISTER_CPU_EVENT2GROUP(x, NoGroup)
+#define CPU_EVENT(x) PPA_REGISTER_CPU_EVENT2GROUP(x, NoGroup)
const char *PPACpuAndGroup[] =
{
-#include "ppaCPUEvents.h"
+#include "../cpuEvents.h"
""
};
-#undef PPA_REGISTER_CPU_EVENT
+#undef CPU_EVENT
#undef PPA_REGISTER_CPU_EVENT2GROUP
extern "C" {
@@ -41,8 +41,10 @@ typedef ppa::Base *(FUNC_PPALibInit)(const char **, int);
typedef void (FUNC_PPALibRelease)(ppa::Base* &);
}
+using namespace ppa;
+
static FUNC_PPALibRelease *_pfuncPpaRelease;
-ppa::Base *ppabase;
+ppa::Base *ppa::ppabase;
static void _ppaReleaseAtExit()
{
diff --git a/source/input/input.cpp b/source/profile/PPA/ppa.h
similarity index 75%
copy from source/input/input.cpp
copy to source/profile/PPA/ppa.h
index 096638c..c8c67c0 100644
--- a/source/input/input.cpp
+++ b/source/profile/PPA/ppa.h
@@ -21,18 +21,23 @@
* For more information, contact us at license @ x265.com.
*****************************************************************************/
-#include "input.h"
-#include "yuv.h"
-#include "y4m.h"
+#ifndef PPA_H
+#define PPA_H
-using namespace x265;
-
-Input* Input::open(InputFileInfo& info, bool bForceY4m)
+/* declare enum list of users CPU events */
+#define CPU_EVENT(x) x,
+enum PPACpuEventEnum
{
- const char * s = strrchr(info.filename, '.');
+#include "../cpuEvents.h"
+ PPACpuGroupNums
+};
+#undef CPU_EVENT
+
+#include "ppaApi.h"
+
+void initializePPA();
+
+#define PPA_INIT() initializePPA()
+#define PPAScopeEvent(e) ppa::ProfileScope ppaScope_(e)
- if (bForceY4m || (s && !strcmp(s, ".y4m")))
- return new Y4MInput(info);
- else
- return new YUVInput(info);
-}
+#endif /* PPA_H */
diff --git a/source/PPA/ppaApi.h b/source/profile/PPA/ppaApi.h
similarity index 89%
rename from source/PPA/ppaApi.h
rename to source/profile/PPA/ppaApi.h
index 149de6d..15fa76b 100644
--- a/source/PPA/ppaApi.h
+++ b/source/profile/PPA/ppaApi.h
@@ -54,6 +54,17 @@ protected:
virtual void init(const char **pNames, int eventCount) = 0;
};
+
+extern ppa::Base *ppabase;
+
+struct ProfileScope
+{
+ ppa::EventID id;
+
+ ProfileScope(int e) { if (ppabase) { id = ppabase->getEventId(e); ppabase->triggerStartEvent(id); } else id = 0; }
+ ~ProfileScope() { if (ppabase) ppabase->triggerEndEvent(id); }
+};
+
}
#endif //_PPA_API_H_
diff --git a/source/profile/cpuEvents.h b/source/profile/cpuEvents.h
new file mode 100644
index 0000000..d2548c7
--- /dev/null
+++ b/source/profile/cpuEvents.h
@@ -0,0 +1,10 @@
+CPU_EVENT(frameRead)
+CPU_EVENT(bitstreamWrite)
+CPU_EVENT(frameThread)
+CPU_EVENT(encodeCTU)
+CPU_EVENT(filterCTURow)
+CPU_EVENT(slicetypeDecideEV)
+CPU_EVENT(prelookahead)
+CPU_EVENT(costEstimateRow)
+CPU_EVENT(pmode)
+CPU_EVENT(pme)
diff --git a/source/profile/vtune/CMakeLists.txt b/source/profile/vtune/CMakeLists.txt
new file mode 100644
index 0000000..cd7d3b6
--- /dev/null
+++ b/source/profile/vtune/CMakeLists.txt
@@ -0,0 +1,2 @@
+include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
+add_library(vtune vtune.h vtune.cpp ../cpuEvents.h)
diff --git a/source/output/output.h b/source/profile/vtune/vtune.cpp
similarity index 63%
copy from source/output/output.h
copy to source/profile/vtune/vtune.cpp
index a754846..7117b9b 100644
--- a/source/output/output.h
+++ b/source/profile/vtune/vtune.cpp
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2013 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -21,35 +21,38 @@
* For more information, contact us at license @ x265.com.
*****************************************************************************/
-#ifndef X265_OUTPUT_H
-#define X265_OUTPUT_H
+#include "vtune.h"
+#include <stdio.h>
-#include "x265.h"
+namespace {
-namespace x265 {
-// private x265 namespace
-
-class Output
+#define CPU_EVENT(x) #x,
+const char *stringNames[] =
{
-protected:
-
- virtual ~Output() {}
-
-public:
-
- Output() {}
+#include "../cpuEvents.h"
+ ""
+};
+#undef CPU_EVENT
- static Output* open(const char *fname, int width, int height, uint32_t bitdepth,
- uint32_t fpsNum, uint32_t fpsDenom, int csp);
+}
- virtual bool isFail() const = 0;
+namespace x265 {
- virtual void release() = 0;
+__itt_domain* domain;
+__itt_string_handle* taskHandle[NUM_VTUNE_TASKS];
- virtual bool writePicture(const x265_picture& pic) = 0;
+void vtuneInit()
+{
+ domain = __itt_domain_create("x265");
+ for (size_t i = 0; i < sizeof(stringNames) / sizeof(const char *); i++)
+ taskHandle[i] = __itt_string_handle_create(stringNames[i]);
+}
- virtual const char *getName() const = 0;
-};
+void vtuneSetThreadName(const char *name, int id)
+{
+ char threadname[128];
+ sprintf(threadname, "%s %d", name, id);
+ __itt_thread_set_name(threadname);
}
-#endif // ifndef X265_OUTPUT_H
+}
diff --git a/source/encoder/level.h b/source/profile/vtune/vtune.h
similarity index 66%
copy from source/encoder/level.h
copy to source/profile/vtune/vtune.h
index 03ca40d..a0239a7 100644
--- a/source/encoder/level.h
+++ b/source/profile/vtune/vtune.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * Copyright (C) 2013 x265 project
+ * Copyright (C) 2015 x265 project
*
* Authors: Steve Borho <steve at borho.org>
*
@@ -21,19 +21,33 @@
* For more information, contact us at license @ x265.com.
*****************************************************************************/
-#ifndef X265_LEVEL_H
-#define X265_LEVEL_H 1
+#ifndef VTUNE_H
+#define VTUNE_H
-#include "common.h"
-#include "x265.h"
+#include "ittnotify.h"
namespace x265 {
-// encoder private namespace
-struct VPS;
-void determineLevel(const x265_param &param, VPS& vps);
-bool enforceLevel(x265_param& param, VPS& vps);
+#define CPU_EVENT(x) x,
+enum VTuneTasksEnum
+{
+#include "../cpuEvents.h"
+ NUM_VTUNE_TASKS
+};
+#undef CPU_EVENT
+
+extern __itt_domain* domain;
+extern __itt_string_handle* taskHandle[NUM_VTUNE_TASKS];
+
+struct VTuneScopeEvent
+{
+ VTuneScopeEvent(int e) { __itt_task_begin(domain, __itt_null, __itt_null, taskHandle[e]); }
+ ~VTuneScopeEvent() { __itt_task_end(domain); }
+};
+
+void vtuneInit();
+void vtuneSetThreadName(const char *name, int id);
}
-#endif // ifndef X265_LEVEL_H
+#endif
diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt
index ff3312f..ff1e141 100644
--- a/source/test/CMakeLists.txt
+++ b/source/test/CMakeLists.txt
@@ -23,6 +23,3 @@ add_executable(TestBench ${YASM_SRC}
ipfilterharness.cpp ipfilterharness.h
intrapredharness.cpp intrapredharness.h)
target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
-
-add_executable(PoolTest testpool.cpp)
-target_link_libraries(PoolTest x265-static ${PLATFORM_LIBS})
diff --git a/source/test/intrapredharness.cpp b/source/test/intrapredharness.cpp
index 97eff94..e69c6b5 100644
--- a/source/test/intrapredharness.cpp
+++ b/source/test/intrapredharness.cpp
@@ -31,8 +31,6 @@ IntraPredHarness::IntraPredHarness()
{
for (int i = 0; i < INPUT_SIZE; i++)
pixel_buff[i] = rand() % PIXEL_MAX;
-
- initROM();
}
bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
@@ -51,14 +49,8 @@ bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, in
if (width > 16)
rand_filter = 0;
- pixel left[MAX_CU_SIZE * 2 + 1];
- for (int k = 0; k < width * 2 + 1; k++)
- {
- left[k] = pixel_buff[j - 1 + k * Predict::ADI_BUF_STRIDE];
- }
-
- ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, rand_filter);
- checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, rand_filter);
+ ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
+ checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
for (int k = 0; k < width; k++)
{
@@ -85,14 +77,8 @@ bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt
for (int i = 0; i <= 100; i++)
{
- pixel left[MAX_CU_SIZE * 2 + 1];
- for (int k = 0; k < width * 2 + 1; k++)
- {
- left[k] = pixel_buff[j - 1 + k * Predict::ADI_BUF_STRIDE];
- }
-
- ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, 0);
- checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, 0);
+ ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, 0);
+ checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, 0);
for (int k = 0; k < width; k++)
{
@@ -107,7 +93,7 @@ bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt
return true;
}
-bool IntraPredHarness::check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE])
+bool IntraPredHarness::check_angular_primitive(const intra_pred_t ref[], const intra_pred_t opt[], int sizeIdx)
{
int j = Predict::ADI_BUF_STRIDE;
intptr_t stride = FENC_STRIDE;
@@ -117,41 +103,37 @@ bool IntraPredHarness::check_angular_primitive(const intra_pred_t ref[][NUM_TR_S
memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
#endif
- for (int size = 2; size <= 5; size++)
+ int width = 1 << (sizeIdx + 2);
+ for (int i = 0; i <= 100; i++)
{
- int width = (1 << size);
- for (int i = 0; i <= 100; i++)
+ int bFilter = (width <= 16) && (rand() % 2);
+ for (int pmode = 2; pmode <= 34; pmode++)
{
- int bFilter = (width <= 16) && (rand() % 2);
- for (int pmode = 2; pmode <= 34; pmode++)
- {
- if (!opt[pmode][size - 2])
- continue;
-
- pixel * refAbove = pixel_buff + j;
- pixel * refLeft = refAbove + 3 * width;
- refLeft[0] = refAbove[0];
+ if (!opt[pmode])
+ continue;
- checked(opt[pmode][size - 2], pixel_out_vec, stride, refLeft, refAbove, pmode, bFilter);
- ref[pmode][size - 2](pixel_out_c, stride, refLeft, refAbove, pmode, bFilter);
+ checked(opt[pmode], pixel_out_vec, stride, pixel_buff + j, pmode, bFilter);
+ ref[pmode](pixel_out_c, stride, pixel_buff + j, pmode, bFilter);
- for (int k = 0; k < width; k++)
+ for (int k = 0; k < width; k++)
+ {
+ if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
{
- if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
- return false;
+ printf("ang_%dx%d, Mode = %d, Row = %d failed !!\n", width, width, pmode, k);
+ return false;
}
-
- reportfail();
}
- j += FENC_STRIDE;
+ reportfail();
}
+
+ j += FENC_STRIDE;
}
return true;
}
-bool IntraPredHarness::check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[])
+bool IntraPredHarness::check_allangs_primitive(const intra_allangs_t ref, const intra_allangs_t opt, int sizeIdx)
{
int j = Predict::ADI_BUF_STRIDE;
int isLuma;
@@ -161,42 +143,35 @@ bool IntraPredHarness::check_allangs_primitive(const intra_allangs_t ref[], cons
memset(pixel_out_33_c, 0xCD, OUTPUT_SIZE_33);
#endif
- for (int size = 2; size <= 5; size++)
- {
- if (opt[size - 2] == NULL) continue;
-
- const int width = (1 << size);
+ const int width = 1 << (sizeIdx + 2);
- for (int i = 0; i <= 100; i++)
- {
- isLuma = (width <= 16) ? true : false; // bFilter is true for 4x4, 8x8, 16x16 and false for 32x32
+ for (int i = 0; i <= 100; i++)
+ {
+ isLuma = (width <= 16) ? true : false; // bFilter is true for 4x4, 8x8, 16x16 and false for 32x32
- pixel * refAbove0 = pixel_buff + j;
- pixel * refLeft0 = refAbove0 + 3 * width;
+ pixel * refAbove0 = pixel_buff + j + 3 * FENC_STRIDE; // keep this offset, since vector code may broken input buffer range [-(width-1), 0];
+ pixel * refLeft0 = refAbove0 + 3 * width + FENC_STRIDE;
- pixel * refAbove1 = pixel_buff + j + 3 * FENC_STRIDE; // keep this offset, since vector code may broken input buffer range [-(width-1), 0]
- pixel * refLeft1 = refAbove1 + 3 * width + FENC_STRIDE;
- refLeft0[0] = refAbove0[0] = refLeft1[0] = refAbove1[0];
+ refLeft0[0] = refAbove0[0];
- ref[size - 2](pixel_out_33_c, refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
- checked(opt[size - 2], pixel_out_33_vec, refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
+ ref(pixel_out_33_c, refAbove0, refLeft0, isLuma);
+ checked(opt, pixel_out_33_vec, refAbove0, refLeft0, isLuma);
- for (int p = 2 - 2; p <= 34 - 2; p++)
+ for (int p = 2 - 2; p <= 34 - 2; p++)
+ {
+ for (int k = 0; k < width; k++)
{
- for (int k = 0; k < width; k++)
+ if (memcmp(pixel_out_33_c + p * (width * width) + k * width, pixel_out_33_vec + p * (width * width) + k * width, width * sizeof(pixel)))
{
- if (memcmp(pixel_out_33_c + p * (width * width) + k * width, pixel_out_33_vec + p * (width * width) + k * width, width * sizeof(pixel)))
- {
- printf("\nFailed: (%dx%d) Mode(%2d), Line[%2d], bfilter=%d\n", width, width, p + 2, k, isLuma);
- opt[size - 2](pixel_out_33_vec, refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
- return false;
- }
+ printf("\nFailed: (%dx%d) Mode(%2d), Line[%2d], bfilter=%d\n", width, width, p + 2, k, isLuma);
+ opt(pixel_out_33_vec, refAbove0, refLeft0, isLuma);
+ return false;
}
}
-
- reportfail();
- j += FENC_STRIDE;
}
+
+ reportfail();
+ j += FENC_STRIDE;
}
return true;
@@ -206,40 +181,38 @@ bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const Encod
{
for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++)
{
- if (opt.intra_pred[1][i])
+ const int size = (1 << (i + 2));
+ if (opt.cu[i].intra_pred[PLANAR_IDX])
{
- const int size = (1 << (i + 2));
- if (!check_dc_primitive(ref.intra_pred[1][i], opt.intra_pred[1][i], size))
+ if (!check_planar_primitive(ref.cu[i].intra_pred[PLANAR_IDX], opt.cu[i].intra_pred[PLANAR_IDX], size))
{
- printf("intra_dc %dx%d failed\n", size, size);
+ printf("intra_planar %dx%d failed\n", size, size);
return false;
}
}
- if (opt.intra_pred[0][i])
+ if (opt.cu[i].intra_pred[DC_IDX])
{
- const int size = (1 << (i + 2));
- if (!check_planar_primitive(ref.intra_pred[0][i], opt.intra_pred[0][i], size))
+ if (!check_dc_primitive(ref.cu[i].intra_pred[DC_IDX], opt.cu[i].intra_pred[DC_IDX], size))
{
- printf("intra_planar %dx%d failed\n", size, size);
+ printf("intra_dc %dx%d failed\n", size, size);
return false;
}
}
- }
-
- // NOTE: always call since this function have check pointer in loop
- if (!check_angular_primitive(ref.intra_pred, opt.intra_pred))
- {
- printf("intra_angular failed\n");
- return false;
- }
- if (opt.intra_pred_allangs[0])
- {
- if (!check_allangs_primitive(ref.intra_pred_allangs, opt.intra_pred_allangs))
+ if (!check_angular_primitive(ref.cu[i].intra_pred, opt.cu[i].intra_pred, i))
{
- printf("intra_allangs failed\n");
+ printf("intra_angular failed\n");
return false;
}
+
+ if (opt.cu[i].intra_pred_allangs)
+ {
+ if (!check_allangs_primitive(ref.cu[i].intra_pred_allangs, opt.cu[i].intra_pred_allangs, i))
+ {
+ printf("intra_allangs failed\n");
+ return false;
+ }
+ }
}
return true;
@@ -253,51 +226,46 @@ void IntraPredHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderP
for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++)
{
const int size = (1 << (i + 2));
- if (opt.intra_pred[1][i])
+ if (opt.cu[i].intra_pred[PLANAR_IDX])
+ {
+ printf("intra_planar_%dx%d", size, size);
+ REPORT_SPEEDUP(opt.cu[i].intra_pred[PLANAR_IDX], ref.cu[i].intra_pred[PLANAR_IDX],
+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0);
+ }
+ if (opt.cu[i].intra_pred[DC_IDX])
{
printf("intra_dc_%dx%d[f=0]", size, size);
- REPORT_SPEEDUP(opt.intra_pred[1][i], ref.intra_pred[1][i],
- pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, pixel_buff, 0, 0);
+ REPORT_SPEEDUP(opt.cu[i].intra_pred[DC_IDX], ref.cu[i].intra_pred[DC_IDX],
+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0);
if (size <= 16)
{
printf("intra_dc_%dx%d[f=1]", size, size);
- REPORT_SPEEDUP(opt.intra_pred[1][i], ref.intra_pred[1][i],
- pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, pixel_buff, 0, 1);
+ REPORT_SPEEDUP(opt.cu[i].intra_pred[DC_IDX], ref.cu[i].intra_pred[DC_IDX],
+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1);
}
}
- if (opt.intra_pred[0][i])
- {
- printf("intra_planar %2dx%d", size, size);
- REPORT_SPEEDUP(opt.intra_pred[0][i], ref.intra_pred[0][i],
- pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, pixel_buff, 0, 0);
- }
- if (opt.intra_pred_allangs[i])
+ if (opt.cu[i].intra_pred_allangs)
{
bool bFilter = (size <= 16);
pixel * refAbove = pixel_buff + srcStride;
pixel * refLeft = refAbove + 3 * size;
refLeft[0] = refAbove[0];
printf("intra_allangs%dx%d", size, size);
- REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],
- pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);
+ REPORT_SPEEDUP(opt.cu[i].intra_pred_allangs, ref.cu[i].intra_pred_allangs,
+ pixel_out_33_vec, refAbove, refLeft, bFilter);
}
- }
-
- for (int ii = 2; ii <= 5; ii++)
- {
- for (int p = 2; p <= 34; p += 1)
+ for (int mode = 2; mode <= 34; mode += 1)
{
- int pmode = p; //(rand()%33)+2;
- if (opt.intra_pred[pmode][ii - 2])
+ if (opt.cu[i].intra_pred[mode])
{
- width = (1 << ii);
+ width = 1 << (i + 2);
bool bFilter = (width <= 16);
pixel * refAbove = pixel_buff + srcStride;
pixel * refLeft = refAbove + 3 * width;
refLeft[0] = refAbove[0];
- printf("intra_ang%dx%d[%2d]", width, width, pmode);
- REPORT_SPEEDUP(opt.intra_pred[pmode][ii - 2], ref.intra_pred[pmode][ii - 2],
- pixel_out_vec, FENC_STRIDE, refAbove, refLeft, pmode, bFilter);
+ printf("intra_ang_%dx%d[%2d]", width, width, mode);
+ REPORT_SPEEDUP(opt.cu[i].intra_pred[mode], ref.cu[i].intra_pred[mode],
+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, mode, bFilter);
}
}
}
diff --git a/source/test/intrapredharness.h b/source/test/intrapredharness.h
index 622880d..2a6e0c2 100644
--- a/source/test/intrapredharness.h
+++ b/source/test/intrapredharness.h
@@ -43,8 +43,8 @@ protected:
bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);
bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
- bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
- bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
+ bool check_angular_primitive(const intra_pred_t ref[], const intra_pred_t opt[], int size);
+ bool check_allangs_primitive(const intra_allangs_t ref, const intra_allangs_t opt, int size);
public:
diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp
index f23e84b..cb375f0 100644
--- a/source/test/ipfilterharness.cpp
+++ b/source/test/ipfilterharness.cpp
@@ -84,10 +84,10 @@ bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t op
rand_srcStride = rand_width;
rand_width &= ~(min_size - 1);
- rand_width = Clip3(min_size, max_size, rand_width);
+ rand_width = x265_clip3(min_size, max_size, rand_width);
rand_height &= ~(min_size - 1);
- rand_height = Clip3(min_size, max_size, rand_height);
+ rand_height = x265_clip3(min_size, max_size, rand_height);
ref(pixel_test_buff[index],
rand_srcStride,
@@ -487,14 +487,14 @@ bool IPFilterHarness::check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_
rand_srcStride = rand() % 100;
rand_dstStride = rand() % 100 + 64;
- ref(pixel_test_buff[index] + 3 * rand_srcStride,
+ ref(pixel_test_buff[index] + 3 * rand_srcStride + 3,
rand_srcStride,
IPF_C_output_p,
rand_dstStride,
coeffIdxX,
coeffIdxY);
- checked(opt, pixel_test_buff[index] + 3 * rand_srcStride,
+ checked(opt, pixel_test_buff[index] + 3 * rand_srcStride + 3,
rand_srcStride,
IPF_vec_output_p,
rand_dstStride,
@@ -524,59 +524,59 @@ bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const Encode
}
}
- for (int value = 0; value < NUM_LUMA_PARTITIONS; value++)
+ for (int value = 0; value < NUM_PU_SIZES; value++)
{
- if (opt.luma_hpp[value])
+ if (opt.pu[value].luma_hpp)
{
- if (!check_IPFilterLuma_primitive(ref.luma_hpp[value], opt.luma_hpp[value]))
+ if (!check_IPFilterLuma_primitive(ref.pu[value].luma_hpp, opt.pu[value].luma_hpp))
{
printf("luma_hpp[%s]", lumaPartStr[value]);
return false;
}
}
- if (opt.luma_hps[value])
+ if (opt.pu[value].luma_hps)
{
- if (!check_IPFilterLuma_hps_primitive(ref.luma_hps[value], opt.luma_hps[value]))
+ if (!check_IPFilterLuma_hps_primitive(ref.pu[value].luma_hps, opt.pu[value].luma_hps))
{
printf("luma_hps[%s]", lumaPartStr[value]);
return false;
}
}
- if (opt.luma_vpp[value])
+ if (opt.pu[value].luma_vpp)
{
- if (!check_IPFilterLuma_primitive(ref.luma_vpp[value], opt.luma_vpp[value]))
+ if (!check_IPFilterLuma_primitive(ref.pu[value].luma_vpp, opt.pu[value].luma_vpp))
{
printf("luma_vpp[%s]", lumaPartStr[value]);
return false;
}
}
- if (opt.luma_vps[value])
+ if (opt.pu[value].luma_vps)
{
- if (!check_IPFilterLuma_ps_primitive(ref.luma_vps[value], opt.luma_vps[value]))
+ if (!check_IPFilterLuma_ps_primitive(ref.pu[value].luma_vps, opt.pu[value].luma_vps))
{
printf("luma_vps[%s]", lumaPartStr[value]);
return false;
}
}
- if (opt.luma_vsp[value])
+ if (opt.pu[value].luma_vsp)
{
- if (!check_IPFilterLuma_sp_primitive(ref.luma_vsp[value], opt.luma_vsp[value]))
+ if (!check_IPFilterLuma_sp_primitive(ref.pu[value].luma_vsp, opt.pu[value].luma_vsp))
{
printf("luma_vsp[%s]", lumaPartStr[value]);
return false;
}
}
- if (opt.luma_vss[value])
+ if (opt.pu[value].luma_vss)
{
- if (!check_IPFilterLuma_ss_primitive(ref.luma_vss[value], opt.luma_vss[value]))
+ if (!check_IPFilterLuma_ss_primitive(ref.pu[value].luma_vss, opt.pu[value].luma_vss))
{
printf("luma_vss[%s]", lumaPartStr[value]);
return false;
}
}
- if (opt.luma_hvpp[value])
+ if (opt.pu[value].luma_hvpp)
{
- if (!check_IPFilterLumaHV_primitive(ref.luma_hvpp[value], opt.luma_hvpp[value]))
+ if (!check_IPFilterLumaHV_primitive(ref.pu[value].luma_hvpp, opt.pu[value].luma_hvpp))
{
printf("luma_hvpp[%s]", lumaPartStr[value]);
return false;
@@ -586,59 +586,59 @@ bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const Encode
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
{
- if (opt.chroma_p2s[csp])
+ if (opt.chroma[csp].p2s)
{
- if (!check_IPFilter_primitive(ref.chroma_p2s[csp], opt.chroma_p2s[csp], 1, csp))
+ if (!check_IPFilter_primitive(ref.chroma[csp].p2s, opt.chroma[csp].p2s, 1, csp))
{
printf("chroma_p2s[%s]", x265_source_csp_names[csp]);
return false;
}
}
- for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
+ for (int value = 0; value < NUM_PU_SIZES; value++)
{
- if (opt.chroma[csp].filter_hpp[value])
+ if (opt.chroma[csp].pu[value].filter_hpp)
{
- if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_hpp[value], opt.chroma[csp].filter_hpp[value]))
+ if (!check_IPFilterChroma_primitive(ref.chroma[csp].pu[value].filter_hpp, opt.chroma[csp].pu[value].filter_hpp))
{
printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
return false;
}
}
- if (opt.chroma[csp].filter_hps[value])
+ if (opt.chroma[csp].pu[value].filter_hps)
{
- if (!check_IPFilterChroma_hps_primitive(ref.chroma[csp].filter_hps[value], opt.chroma[csp].filter_hps[value]))
+ if (!check_IPFilterChroma_hps_primitive(ref.chroma[csp].pu[value].filter_hps, opt.chroma[csp].pu[value].filter_hps))
{
printf("chroma_hps[%s]", chromaPartStr[csp][value]);
return false;
}
}
- if (opt.chroma[csp].filter_vpp[value])
+ if (opt.chroma[csp].pu[value].filter_vpp)
{
- if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_vpp[value], opt.chroma[csp].filter_vpp[value]))
+ if (!check_IPFilterChroma_primitive(ref.chroma[csp].pu[value].filter_vpp, opt.chroma[csp].pu[value].filter_vpp))
{
printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
return false;
}
}
- if (opt.chroma[csp].filter_vps[value])
+ if (opt.chroma[csp].pu[value].filter_vps)
{
- if (!check_IPFilterChroma_ps_primitive(ref.chroma[csp].filter_vps[value], opt.chroma[csp].filter_vps[value]))
+ if (!check_IPFilterChroma_ps_primitive(ref.chroma[csp].pu[value].filter_vps, opt.chroma[csp].pu[value].filter_vps))
{
printf("chroma_vps[%s]", chromaPartStr[csp][value]);
return false;
}
}
- if (opt.chroma[csp].filter_vsp[value])
+ if (opt.chroma[csp].pu[value].filter_vsp)
{
- if (!check_IPFilterChroma_sp_primitive(ref.chroma[csp].filter_vsp[value], opt.chroma[csp].filter_vsp[value]))
+ if (!check_IPFilterChroma_sp_primitive(ref.chroma[csp].pu[value].filter_vsp, opt.chroma[csp].pu[value].filter_vsp))
{
printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
return false;
}
}
- if (opt.chroma[csp].filter_vss[value])
+ if (opt.chroma[csp].pu[value].filter_vss)
{
- if (!check_IPFilterChroma_ss_primitive(ref.chroma[csp].filter_vss[value], opt.chroma[csp].filter_vss[value]))
+ if (!check_IPFilterChroma_ss_primitive(ref.chroma[csp].pu[value].filter_vss, opt.chroma[csp].pu[value].filter_vss))
{
printf("chroma_vss[%s]", chromaPartStr[csp][value]);
return false;
@@ -665,59 +665,59 @@ void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPr
pixel_buff, srcStride, IPF_vec_output_s, width, height);
}
- for (int value = 0; value < NUM_LUMA_PARTITIONS; value++)
+ for (int value = 0; value < NUM_PU_SIZES; value++)
{
- if (opt.luma_hpp[value])
+ if (opt.pu[value].luma_hpp)
{
printf("luma_hpp[%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_hpp[value], ref.luma_hpp[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_hpp, ref.pu[value].luma_hpp,
pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
- if (opt.luma_hps[value])
+ if (opt.pu[value].luma_hps)
{
printf("luma_hps[%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_hps[value], ref.luma_hps[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_hps, ref.pu[value].luma_hps,
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1, 1);
}
- if (opt.luma_vpp[value])
+ if (opt.pu[value].luma_vpp)
{
printf("luma_vpp[%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_vpp[value], ref.luma_vpp[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_vpp, ref.pu[value].luma_vpp,
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
- if (opt.luma_vps[value])
+ if (opt.pu[value].luma_vps)
{
printf("luma_vps[%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_vps[value], ref.luma_vps[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_vps, ref.pu[value].luma_vps,
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1);
}
- if (opt.luma_vsp[value])
+ if (opt.pu[value].luma_vsp)
{
printf("luma_vsp[%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_vsp[value], ref.luma_vsp[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_vsp, ref.pu[value].luma_vsp,
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
- if (opt.luma_vss[value])
+ if (opt.pu[value].luma_vss)
{
printf("luma_vss[%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_vss[value], ref.luma_vss[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_vss, ref.pu[value].luma_vss,
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1);
}
- if (opt.luma_hvpp[value])
+ if (opt.pu[value].luma_hvpp)
{
printf("luma_hv [%s]\t", lumaPartStr[value]);
- REPORT_SPEEDUP(opt.luma_hvpp[value], ref.luma_hvpp[value],
+ REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
}
}
@@ -725,51 +725,51 @@ void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPr
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
{
printf("= Color Space %s =\n", x265_source_csp_names[csp]);
- if (opt.chroma_p2s[csp])
+ if (opt.chroma[csp].p2s)
{
printf("chroma_p2s\t");
- REPORT_SPEEDUP(opt.chroma_p2s[csp], ref.chroma_p2s[csp],
+ REPORT_SPEEDUP(opt.chroma[csp].p2s, ref.chroma[csp].p2s,
pixel_buff, srcStride, IPF_vec_output_s, width, height);
}
- for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
+ for (int value = 0; value < NUM_PU_SIZES; value++)
{
- if (opt.chroma[csp].filter_hpp[value])
+ if (opt.chroma[csp].pu[value].filter_hpp)
{
printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
- REPORT_SPEEDUP(opt.chroma[csp].filter_hpp[value], ref.chroma[csp].filter_hpp[value],
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hpp, ref.chroma[csp].pu[value].filter_hpp,
pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
- if (opt.chroma[csp].filter_hps[value])
+ if (opt.chroma[csp].pu[value].filter_hps)
{
printf("chroma_hps[%s]", chromaPartStr[csp][value]);
- REPORT_SPEEDUP(opt.chroma[csp].filter_hps[value], ref.chroma[csp].filter_hps[value],
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hps, ref.chroma[csp].pu[value].filter_hps,
pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
}
- if (opt.chroma[csp].filter_vpp[value])
+ if (opt.chroma[csp].pu[value].filter_vpp)
{
printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
- REPORT_SPEEDUP(opt.chroma[csp].filter_vpp[value], ref.chroma[csp].filter_vpp[value],
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vpp, ref.chroma[csp].pu[value].filter_vpp,
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
- if (opt.chroma[csp].filter_vps[value])
+ if (opt.chroma[csp].pu[value].filter_vps)
{
printf("chroma_vps[%s]", chromaPartStr[csp][value]);
- REPORT_SPEEDUP(opt.chroma[csp].filter_vps[value], ref.chroma[csp].filter_vps[value],
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vps, ref.chroma[csp].pu[value].filter_vps,
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1);
}
- if (opt.chroma[csp].filter_vsp[value])
+ if (opt.chroma[csp].pu[value].filter_vsp)
{
printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
- REPORT_SPEEDUP(opt.chroma[csp].filter_vsp[value], ref.chroma[csp].filter_vsp[value],
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vsp, ref.chroma[csp].pu[value].filter_vsp,
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
- if (opt.chroma[csp].filter_vss[value])
+ if (opt.chroma[csp].pu[value].filter_vss)
{
printf("chroma_vss[%s]", chromaPartStr[csp][value]);
- REPORT_SPEEDUP(opt.chroma[csp].filter_vss[value], ref.chroma[csp].filter_vss[value],
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vss, ref.chroma[csp].pu[value].filter_vss,
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1);
}
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index 88e4676..afea4da 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -37,7 +37,6 @@ struct DctConf
const DctConf dctInfo[] =
{
- { "dst4x4\t", 4 },
{ "dct4x4\t", 4 },
{ "dct8x8\t", 8 },
{ "dct16x16", 16 },
@@ -46,7 +45,6 @@ const DctConf dctInfo[] =
const DctConf idctInfo[] =
{
- { "idst4x4\t", 4 },
{ "idct4x4\t", 4 },
{ "idct8x8\t", 8 },
{ "idct16x16", 16 },
@@ -65,17 +63,17 @@ MBDstHarness::MBDstHarness()
short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
int_test_buff[0][i] = rand() % PIXEL_MAX;
int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
- int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX);
+ short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
short_test_buff[1][i] = -PIXEL_MAX;
int_test_buff[1][i] = -PIXEL_MAX;
int_idct_test_buff[1][i] = SHORT_MIN;
- int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX;
+ short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
short_test_buff[2][i] = PIXEL_MAX;
int_test_buff[2][i] = PIXEL_MAX;
int_idct_test_buff[2][i] = SHORT_MAX;
- int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX;
+ short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
mbuf1[i] = rand() & PIXEL_MAX;
mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
@@ -96,16 +94,16 @@ MBDstHarness::MBDstHarness()
bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width)
{
int j = 0;
- intptr_t cmp_size = sizeof(int) * width * width;
+ intptr_t cmp_size = sizeof(short) * width * width;
for (int i = 0; i < ITERS; i++)
{
int index = rand() % TEST_CASES;
- ref(short_test_buff[index] + j, mintbuf3, width);
- checked(opt, short_test_buff[index] + j, mintbuf4, width);
+ ref(short_test_buff[index] + j, mshortbuf2, width);
+ checked(opt, short_test_buff[index] + j, mshortbuf3, width);
- if (memcmp(mintbuf3, mintbuf4, cmp_size))
+ if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
reportfail();
@@ -124,8 +122,8 @@ bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, intptr_t width)
{
int index = rand() % TEST_CASES;
- ref(int_idct_test_buff[index] + j, mshortbuf2, width);
- checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width);
+ ref(short_test_buff[index] + j, mshortbuf2, width);
+ checked(opt, short_test_buff[index] + j, mshortbuf3, width);
if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
@@ -156,10 +154,10 @@ bool MBDstHarness::check_dequant_primitive(dequant_normal_t ref, dequant_normal_
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
- ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift);
- checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift);
+ ref(short_test_buff[index] + j, mshortbuf2, width * height, scale, shift);
+ checked(opt, short_test_buff[index] + j, mshortbuf3, width * height, scale, shift);
- if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width))
+ if (memcmp(mshortbuf2, mshortbuf3, sizeof(int16_t) * height * width))
return false;
reportfail();
@@ -175,6 +173,10 @@ bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scalin
for (int i = 0; i < ITERS; i++)
{
+
+ memset(mshortbuf2, 0, MAX_TU_SIZE * sizeof(int16_t));
+ memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t));
+
int log2TrSize = (rand() % 4) + 2;
int width = (1 << log2TrSize);
@@ -185,13 +187,13 @@ bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scalin
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
- int cmp_size = sizeof(int) * height * width;
+ int cmp_size = sizeof(int16_t) * height * width;
int index1 = rand() % TEST_CASES;
- ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
- checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
+ ref(short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf2, width * height, per, shift);
+ checked(opt, short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf3, width * height, per, shift);
- if (memcmp(mintbuf1, mintbuf2, cmp_size))
+ if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
reportfail();
@@ -222,8 +224,8 @@ bool MBDstHarness::check_quant_primitive(quant_t ref, quant_t opt)
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
- optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
+ refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
+ optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
if (memcmp(mintbuf1, mintbuf3, cmp_size))
return false;
@@ -261,8 +263,8 @@ bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
- optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
+ refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
+ optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
return false;
@@ -324,6 +326,7 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op
int log2TrSize = s + 2;
int num = 1 << (log2TrSize * 2);
int cmp_size = sizeof(int) * num;
+ int cmp_short = sizeof(short) * num;
for (int i = 0; i < ITERS; i++)
{
@@ -336,10 +339,10 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op
int index = rand() % TEST_CASES;
- ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
- checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
+ ref(short_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
+ checked(opt, short_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
- if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, cmp_size))
+ if (memcmp(short_denoise_test_buff1[index] + j, short_denoise_test_buff2[index] + j, cmp_short))
return false;
if (memcmp(mubuf1, mubuf2, cmp_size))
@@ -357,11 +360,11 @@ bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t op
bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
- for (int i = 0; i < NUM_DCTS; i++)
+ for (int i = 0; i < NUM_TR_SIZE; i++)
{
- if (opt.dct[i])
+ if (opt.cu[i].dct)
{
- if (!check_dct_primitive(ref.dct[i], opt.dct[i], dctInfo[i].width))
+ if (!check_dct_primitive(ref.cu[i].dct, opt.cu[i].dct, dctInfo[i].width))
{
printf("\n%s failed\n", dctInfo[i].name);
return false;
@@ -369,11 +372,11 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
}
}
- for (int i = 0; i < NUM_IDCTS; i++)
+ for (int i = 0; i < NUM_TR_SIZE; i++)
{
- if (opt.idct[i])
+ if (opt.cu[i].idct)
{
- if (!check_idct_primitive(ref.idct[i], opt.idct[i], idctInfo[i].width))
+ if (!check_idct_primitive(ref.cu[i].idct, opt.cu[i].idct, idctInfo[i].width))
{
printf("%s failed\n", idctInfo[i].name);
return false;
@@ -381,6 +384,24 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
}
}
+ if (opt.dst4x4)
+ {
+ if (!check_dct_primitive(ref.dst4x4, opt.dst4x4, 4))
+ {
+ printf("dst4x4: Failed\n");
+ return false;
+ }
+ }
+
+ if (opt.idst4x4)
+ {
+ if (!check_idct_primitive(ref.idst4x4, opt.idst4x4, 4))
+ {
+ printf("idst4x4: Failed\n");
+ return false;
+ }
+ }
+
if (opt.dequant_normal)
{
if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal))
@@ -449,46 +470,58 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
- for (int value = 0; value < NUM_DCTS; value++)
+ if (opt.dst4x4)
+ {
+ printf("dst4x4\t");
+ REPORT_SPEEDUP(opt.dst4x4, ref.dst4x4, mbuf1, mshortbuf2, 4);
+ }
+
+ for (int value = 0; value < NUM_TR_SIZE; value++)
{
- if (opt.dct[value])
+ if (opt.cu[value].dct)
{
printf("%s\t", dctInfo[value].name);
- REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width);
+ REPORT_SPEEDUP(opt.cu[value].dct, ref.cu[value].dct, mbuf1, mshortbuf2, dctInfo[value].width);
}
}
- for (int value = 0; value < NUM_IDCTS; value++)
+ if (opt.idst4x4)
+ {
+ printf("idst4x4\t");
+ REPORT_SPEEDUP(opt.idst4x4, ref.idst4x4, mbuf1, mshortbuf2, 4);
+ }
+
+ for (int value = 0; value < NUM_TR_SIZE; value++)
{
- if (opt.idct[value])
+ if (opt.cu[value].idct)
{
printf("%s\t", idctInfo[value].name);
- REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width);
+ REPORT_SPEEDUP(opt.cu[value].idct, ref.cu[value].idct, mshortbuf3, mshortbuf2, idctInfo[value].width);
}
}
if (opt.dequant_normal)
{
printf("dequant_normal\t");
- REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1);
+ REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
}
if (opt.dequant_scaling)
{
printf("dequant_scaling\t");
- REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
+ REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mshortbuf2, 32 * 32, 5, 1);
}
if (opt.quant)
{
printf("quant\t\t");
- REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
+ REPORT_SPEEDUP(opt.quant, ref.quant, short_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
}
if (opt.nquant)
{
printf("nquant\t\t");
- REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
+ REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
}
if (opt.count_nonzero)
@@ -503,7 +536,6 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
if (opt.denoiseDct)
{
printf("denoiseDct\t");
- REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
+ REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, short_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
}
-
}
diff --git a/source/test/mbdstharness.h b/source/test/mbdstharness.h
index a8b4de2..284892a 100644
--- a/source/test/mbdstharness.h
+++ b/source/test/mbdstharness.h
@@ -60,8 +60,8 @@ protected:
uint32_t mubuf2[MAX_TU_SIZE];
uint16_t mushortbuf1[MAX_TU_SIZE];
- int int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
- int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
+ int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
+ int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index bb6e0e6..4ee7923 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -65,7 +65,9 @@ PixelHarness::PixelHarness()
sbuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
- psbuf1[i] = (rand() % 65) - 32; // range is between -32 to 32
+ psbuf1[i] = psbuf4[i] = (rand() % 65) - 32; // range is between -32 to 32
+ psbuf2[i] = psbuf5[i] = (rand() % 3) - 1; // possible values {-1,0,1}
+ psbuf3[i] = (rand() % 129) - 128;
sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
}
}
@@ -91,27 +93,6 @@ bool PixelHarness::check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt)
return true;
}
-bool PixelHarness::check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt)
-{
- int j = 0;
- intptr_t stride = STRIDE;
-
- for (int i = 0; i < ITERS; i++)
- {
- int index1 = rand() % TEST_CASES;
- int index2 = rand() % TEST_CASES;
- int vres = (int)checked(opt, short_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
- int cres = ref(short_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
- if (vres != cres)
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
bool PixelHarness::check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
{
int j = 0;
@@ -228,9 +209,7 @@ bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
if (cres != vres)
- {
return false;
- }
reportfail();
j += INCR;
@@ -241,8 +220,8 @@ bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * (64 + 1)]);
memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
@@ -250,18 +229,23 @@ bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
int width = 2 * (rand() % 32 + 1);
int height = 8;
int w0 = rand() % 128;
- int shift = rand() % 15;
+ int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
int round = shift ? (1 << (shift - 1)) : 0;
int offset = (rand() % 256) - 128;
intptr_t stride = 64;
+ const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
- checked(opt, short_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round, shift, offset);
- ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round, shift, offset);
+ checked(opt, short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
+ ref(short_test_buff[index] + j, ref_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ {
+ opt(short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
return false;
+ }
reportfail();
j += INCR;
@@ -281,18 +265,22 @@ bool PixelHarness::check_weightp(weightp_pp_t ref, weightp_pp_t opt)
int width = 16 * (rand() % 4 + 1);
int height = 8;
int w0 = rand() % 128;
- int shift = rand() % 15;
+ int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
int round = shift ? (1 << (shift - 1)) : 0;
int offset = (rand() % 256) - 128;
intptr_t stride = 64;
+ const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
- checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round, shift, offset);
- ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round, shift, offset);
+ checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round << correction, shift + correction, offset);
+ ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round << correction, shift + correction, offset);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ {
+ checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round << correction, shift + correction, offset);
return false;
+ }
reportfail();
j += INCR;
@@ -344,7 +332,7 @@ bool PixelHarness::check_downscale_t(downscale_t ref, downscale_t opt)
return true;
}
-bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)
+bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -359,8 +347,8 @@ bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t op
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
- ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
@@ -372,60 +360,7 @@ bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t op
return true;
}
-bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt)
-{
- ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
- int j = 0;
- intptr_t stride = STRIDE;
- for (int i = 0; i < ITERS; i++)
- {
- int shift = (rand() % 7 + 1);
-
- int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
-bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
-{
- ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
- memset(ref_dest, 0xCD, sizeof(ref_dest));
- memset(opt_dest, 0xCD, sizeof(opt_dest));
-
- int j = 0;
- intptr_t stride = STRIDE;
- for (int i = 0; i < ITERS; i++)
- {
- int shift = (rand() % 7 + 1);
-
- int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
-bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
+bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -440,8 +375,8 @@ bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t op
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
- ref(ref_dest, int_test_buff[index] + j, stride, shift);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
@@ -479,7 +414,7 @@ bool PixelHarness::check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt)
return true;
}
-bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
+bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -494,8 +429,8 @@ bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
@@ -507,7 +442,7 @@ bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
return true;
}
-bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
+bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -853,7 +788,6 @@ bool PixelHarness::check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_
return true;
}
-/* TODO: This function causes crashes when checked. Is this a real bug? */
bool PixelHarness::check_ssim_end(ssim_end4_t ref, ssim_end4_t opt)
{
ALIGN_VAR_32(int, sum0[5][4]);
@@ -909,6 +843,33 @@ bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
return true;
}
+bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
+{
+ ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int width = 16 * (rand() % 4 + 1);
+
+ ref(ref_dest, pbuf2 + j, pbuf3 + j, width);
+ checked(opt, opt_dest, pbuf2 + j, pbuf3 + j, width);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int8_t)))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -924,9 +885,7 @@ bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt)
int width = 16 * (rand() % 4 + 1);
int8_t sign = rand() % 3;
if (sign == 2)
- {
sign = -1;
- }
ref(ref_dest, psbuf1 + j, width, sign);
checked(opt, opt_dest, psbuf1 + j, width, sign);
@@ -941,6 +900,94 @@ bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt)
return true;
}
+bool PixelHarness::check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int width = 16 * (rand() % 4 + 1);
+ int stride = width + 1;
+
+ ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, width);
+ checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, width);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int width = 16 * (rand() % 4 + 1);
+ int stride = width + 1;
+
+ ref(ref_dest, psbuf1 + j, psbuf2 + j, psbuf3 + j, width, stride);
+ checked(opt, opt_dest, psbuf4 + j, psbuf2 + j, psbuf3 + j, width, stride);
+
+ if (memcmp(psbuf1 + j, psbuf4 + j, width * sizeof(int8_t)))
+ return false;
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int stride = 16 * (rand() % 4 + 1);
+ int start = rand() % 2;
+ int end = (16 * (rand() % 4 + 1)) - rand() % 2;
+
+ ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end);
+ checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1001,215 +1048,299 @@ bool PixelHarness::check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt)
return true;
}
-bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
+bool PixelHarness::check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt)
{
- if (opt.satd[part])
+ ALIGN_VAR_16(int, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ double fps = 1.0;
+ int width = 16 + rand() % 64;
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
{
- if (!check_pixelcmp(ref.satd[part], opt.satd[part]))
- {
- printf("satd[%s]: failed!\n", lumaPartStr[part]);
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
+ ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
+
+ if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
return false;
- }
+
+ reportfail();
+ j += INCR;
}
- if (opt.sa8d_inter[part])
+ return true;
+}
+
+bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
+{
+ int j = 0, index1, index2, optres, refres;
+ intptr_t stride = STRIDE;
+
+ for (int i = 0; i < ITERS; i++)
{
- if (!check_pixelcmp(ref.sa8d_inter[part], opt.sa8d_inter[part]))
- {
- printf("sa8d_inter[%s]: failed!\n", lumaPartStr[part]);
+ index1 = rand() % TEST_CASES;
+ index2 = rand() % TEST_CASES;
+ optres = (int)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+ refres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+
+ if (optres != refres)
return false;
- }
+
+ reportfail();
+ j += INCR;
}
- if (opt.sad[part])
+ return true;
+}
+
+bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
+{
+ int j = 0, index1, index2, optres, refres;
+ intptr_t stride = STRIDE;
+
+ for (int i = 0; i < ITERS; i++)
{
- if (!check_pixelcmp(ref.sad[part], opt.sad[part]))
- {
- printf("sad[%s]: failed!\n", lumaPartStr[part]);
+ index1 = rand() % TEST_CASES;
+ index2 = rand() % TEST_CASES;
+ optres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+ refres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+
+ if (optres != refres)
return false;
- }
+
+ reportfail();
+ j += INCR;
}
- if (opt.sse_pp[part])
+ return true;
+}
+
+bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
{
- if (!check_pixelcmp(ref.sse_pp[part], opt.sse_pp[part]))
- {
- printf("sse_pp[%s]: failed!\n", lumaPartStr[part]);
+ int width = 16 * (rand() % 4 + 1);
+ int height = rand() % 64 +1;
+ int stride = rand() % 65;
+
+ ref(ref_dest, psbuf1 + j, width, height, stride);
+ checked(opt, opt_dest, psbuf1 + j, width, height, stride);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
return false;
- }
+
+ reportfail();
+ j += INCR;
}
- if (opt.sse_sp[part])
+ return true;
+}
+
+
+bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
+{
+ if (opt.pu[part].satd)
{
- if (!check_pixelcmp_sp(ref.sse_sp[part], opt.sse_sp[part]))
+ if (!check_pixelcmp(ref.pu[part].satd, opt.pu[part].satd))
{
- printf("sse_sp[%s]: failed!\n", lumaPartStr[part]);
+ printf("satd[%s]: failed!\n", lumaPartStr[part]);
return false;
}
}
- if (opt.sse_ss[part])
+ if (opt.pu[part].sad)
{
- if (!check_pixelcmp_ss(ref.sse_ss[part], opt.sse_ss[part]))
+ if (!check_pixelcmp(ref.pu[part].sad, opt.pu[part].sad))
{
- printf("sse_ss[%s]: failed!\n", lumaPartStr[part]);
+ printf("sad[%s]: failed!\n", lumaPartStr[part]);
return false;
}
}
- if (opt.sad_x3[part])
+ if (opt.pu[part].sad_x3)
{
- if (!check_pixelcmp_x3(ref.sad_x3[part], opt.sad_x3[part]))
+ if (!check_pixelcmp_x3(ref.pu[part].sad_x3, opt.pu[part].sad_x3))
{
printf("sad_x3[%s]: failed!\n", lumaPartStr[part]);
return false;
}
}
- if (opt.sad_x4[part])
+ if (opt.pu[part].sad_x4)
{
- if (!check_pixelcmp_x4(ref.sad_x4[part], opt.sad_x4[part]))
+ if (!check_pixelcmp_x4(ref.pu[part].sad_x4, opt.pu[part].sad_x4))
{
printf("sad_x4[%s]: failed!\n", lumaPartStr[part]);
return false;
}
}
- if (opt.pixelavg_pp[part])
+ if (opt.pu[part].pixelavg_pp)
{
- if (!check_pixelavg_pp(ref.pixelavg_pp[part], opt.pixelavg_pp[part]))
+ if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp, opt.pu[part].pixelavg_pp))
{
printf("pixelavg_pp[%s]: failed!\n", lumaPartStr[part]);
return false;
}
}
- if (opt.luma_copy_pp[part])
- {
- if (!check_copy_pp(ref.luma_copy_pp[part], opt.luma_copy_pp[part]))
- {
- printf("luma_copy_pp[%s] failed\n", lumaPartStr[part]);
- return false;
- }
- }
-
- if (opt.luma_copy_sp[part])
+ if (opt.pu[part].copy_pp)
{
- if (!check_copy_sp(ref.luma_copy_sp[part], opt.luma_copy_sp[part]))
+ if (!check_copy_pp(ref.pu[part].copy_pp, opt.pu[part].copy_pp))
{
- printf("luma_copy_sp[%s] failed\n", lumaPartStr[part]);
+ printf("copy_pp[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- if (opt.luma_copy_ps[part])
+ if (opt.pu[part].addAvg)
{
- if (!check_copy_ps(ref.luma_copy_ps[part], opt.luma_copy_ps[part]))
+ if (!check_addAvg(ref.pu[part].addAvg, opt.pu[part].addAvg))
{
- printf("luma_copy_ps[%s] failed\n", lumaPartStr[part]);
+ printf("addAvg[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- if (opt.luma_copy_ss[part])
+ if (part < NUM_CU_SIZES)
{
- if (!check_copy_ss(ref.luma_copy_ss[part], opt.luma_copy_ss[part]))
+ if (opt.cu[part].sse_pp)
{
- printf("luma_copy_ss[%s] failed\n", lumaPartStr[part]);
- return false;
+ if (!check_pixelcmp(ref.cu[part].sse_pp, opt.cu[part].sse_pp))
+ {
+ printf("sse_pp[%s]: failed!\n", lumaPartStr[part]);
+ return false;
+ }
}
- }
- if (opt.luma_addAvg[part])
- {
- if (!check_addAvg(ref.luma_addAvg[part], opt.luma_addAvg[part]))
+ if (opt.cu[part].sse_ss)
{
- printf("luma_addAvg[%s] failed\n", lumaPartStr[part]);
- return false;
+ if (!check_pixelcmp_ss(ref.cu[part].sse_ss, opt.cu[part].sse_ss))
+ {
+ printf("sse_ss[%s]: failed!\n", lumaPartStr[part]);
+ return false;
+ }
}
- }
- if (part < NUM_SQUARE_BLOCKS)
- {
- if (opt.luma_sub_ps[part])
+ if (opt.cu[part].sub_ps)
{
- if (!check_pixel_sub_ps(ref.luma_sub_ps[part], opt.luma_sub_ps[part]))
+ if (!check_pixel_sub_ps(ref.cu[part].sub_ps, opt.cu[part].sub_ps))
{
- printf("luma_sub_ps[%s] failed\n", lumaPartStr[part]);
+ printf("sub_ps[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- if (opt.luma_add_ps[part])
+ if (opt.cu[part].add_ps)
{
- if (!check_pixel_add_ps(ref.luma_add_ps[part], opt.luma_add_ps[part]))
+ if (!check_pixel_add_ps(ref.cu[part].add_ps, opt.cu[part].add_ps))
{
- printf("luma_add_ps[%s] failed\n", lumaPartStr[part]);
+ printf("add_ps[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- }
- for (int i = 0; i < X265_CSP_COUNT; i++)
- {
- if (opt.chroma[i].copy_pp[part])
+ if (opt.cu[part].copy_ss)
{
- if (!check_copy_pp(ref.chroma[i].copy_pp[part], opt.chroma[i].copy_pp[part]))
+ if (!check_copy_ss(ref.cu[part].copy_ss, opt.cu[part].copy_ss))
{
- printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ printf("copy_ss[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- if (opt.chroma[i].copy_sp[part])
+
+ if (opt.cu[part].copy_sp)
{
- if (!check_copy_sp(ref.chroma[i].copy_sp[part], opt.chroma[i].copy_sp[part]))
+ if (!check_copy_sp(ref.cu[part].copy_sp, opt.cu[part].copy_sp))
{
- printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ printf("copy_sp[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- if (opt.chroma[i].copy_ps[part])
+
+ if (opt.cu[part].copy_ps)
{
- if (!check_copy_ps(ref.chroma[i].copy_ps[part], opt.chroma[i].copy_ps[part]))
+ if (!check_copy_ps(ref.cu[part].copy_ps, opt.cu[part].copy_ps))
{
- printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ printf("copy_ps[%s] failed\n", lumaPartStr[part]);
return false;
}
}
- if (opt.chroma[i].copy_ss[part])
+ }
+
+ for (int i = 0; i < X265_CSP_COUNT; i++)
+ {
+ if (opt.chroma[i].pu[part].copy_pp)
{
- if (!check_copy_ss(ref.chroma[i].copy_ss[part], opt.chroma[i].copy_ss[part]))
+ if (!check_copy_pp(ref.chroma[i].pu[part].copy_pp, opt.chroma[i].pu[part].copy_pp))
{
- printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
- if (opt.chroma[i].addAvg[part])
+ if (opt.chroma[i].pu[part].addAvg)
{
- if (!check_addAvg(ref.chroma[i].addAvg[part], opt.chroma[i].addAvg[part]))
+ if (!check_addAvg(ref.chroma[i].pu[part].addAvg, opt.chroma[i].pu[part].addAvg))
{
printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
- if (part < NUM_SQUARE_BLOCKS)
+ if (part < NUM_CU_SIZES)
{
- if (opt.chroma[i].sub_ps[part])
+ if (opt.chroma[i].cu[part].sub_ps)
{
- if (!check_pixel_sub_ps(ref.chroma[i].sub_ps[part], opt.chroma[i].sub_ps[part]))
+ if (!check_pixel_sub_ps(ref.chroma[i].cu[part].sub_ps, opt.chroma[i].cu[part].sub_ps))
{
printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
- if (opt.chroma[i].add_ps[part])
+ if (opt.chroma[i].cu[part].add_ps)
{
- if (!check_pixel_add_ps(ref.chroma[i].add_ps[part], opt.chroma[i].add_ps[part]))
+ if (!check_pixel_add_ps(ref.chroma[i].cu[part].add_ps, opt.chroma[i].cu[part].add_ps))
{
printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
+ if (opt.chroma[i].cu[part].copy_sp)
+ {
+ if (!check_copy_sp(ref.chroma[i].cu[part].copy_sp, opt.chroma[i].cu[part].copy_sp))
+ {
+ printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
+ if (opt.chroma[i].cu[part].copy_ps)
+ {
+ if (!check_copy_ps(ref.chroma[i].cu[part].copy_ps, opt.chroma[i].cu[part].copy_ps))
+ {
+ printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
+ if (opt.chroma[i].cu[part].copy_ss)
+ {
+ if (!check_copy_ss(ref.chroma[i].cu[part].copy_ss, opt.chroma[i].cu[part].copy_ss))
+ {
+ printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
}
}
@@ -1221,137 +1352,152 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
for (int size = 4; size <= 64; size *= 2)
{
int part = partitionFromSizes(size, size); // 2Nx2N
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
if (size > 4)
{
part = partitionFromSizes(size, size >> 1); // 2NxN
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
part = partitionFromSizes(size >> 1, size); // Nx2N
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
}
if (size > 8)
{
// 4 AMP modes
part = partitionFromSizes(size, size >> 2);
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
part = partitionFromSizes(size, 3 * (size >> 2));
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
part = partitionFromSizes(size >> 2, size);
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
part = partitionFromSizes(3 * (size >> 2), size);
- if (!testPartition(part, ref, opt)) return false;
+ if (!testPU(part, ref, opt)) return false;
}
}
- for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
+ for (int i = 0; i < NUM_CU_SIZES; i++)
{
- if (opt.calcresidual[i])
+ if (opt.cu[i].sa8d)
{
- if (!check_calresidual(ref.calcresidual[i], opt.calcresidual[i]))
+ if (!check_pixelcmp(ref.cu[i].sa8d, opt.cu[i].sa8d))
{
- printf("calcresidual width: %d failed!\n", 4 << i);
+ printf("sa8d[%dx%d]: failed!\n", 4 << i, 4 << i);
return false;
}
}
- if (opt.sa8d[i])
+
+ if (opt.cu[i].blockfill_s)
{
- if (!check_pixelcmp(ref.sa8d[i], opt.sa8d[i]))
+ if (!check_blockfill_s(ref.cu[i].blockfill_s, opt.cu[i].blockfill_s))
{
- printf("sa8d[%dx%d]: failed!\n", 4 << i, 4 << i);
+ printf("blockfill_s[%dx%d]: failed!\n", 4 << i, 4 << i);
return false;
}
}
- if ((i <= BLOCK_32x32) && opt.ssd_s[i])
+ if (opt.cu[i].var)
{
- if (!check_ssd_s(ref.ssd_s[i], opt.ssd_s[i]))
+ if (!check_pixel_var(ref.cu[i].var, opt.cu[i].var))
{
- printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
+ printf("var[%dx%d] failed\n", 4 << i, 4 << i);
return false;
}
}
- if (opt.blockfill_s[i])
+ if (opt.cu[i].psy_cost_pp)
{
- if (!check_blockfill_s(ref.blockfill_s[i], opt.blockfill_s[i]))
+ if (!check_psyCost_pp(ref.cu[i].psy_cost_pp, opt.cu[i].psy_cost_pp))
{
- printf("blockfill_s[%dx%d]: failed!\n", 4 << i, 4 << i);
+ printf("\npsy_cost_pp[%dx%d] failed!\n", 4 << i, 4 << i);
return false;
}
}
- if (opt.transpose[i])
+
+ if (opt.cu[i].psy_cost_ss)
{
- if (!check_transpose(ref.transpose[i], opt.transpose[i]))
+ if (!check_psyCost_ss(ref.cu[i].psy_cost_ss, opt.cu[i].psy_cost_ss))
{
- printf("transpose[%dx%d] failed\n", 4 << i, 4 << i);
+ printf("\npsy_cost_ss[%dx%d] failed!\n", 4 << i, 4 << i);
return false;
}
}
- if (opt.var[i])
+ if (i < BLOCK_64x64)
{
- if (!check_pixel_var(ref.var[i], opt.var[i]))
+ /* TU only primitives */
+
+ if (opt.cu[i].calcresidual)
{
- printf("var[%dx%d] failed\n", 4 << i, 4 << i);
- return false;
+ if (!check_calresidual(ref.cu[i].calcresidual, opt.cu[i].calcresidual))
+ {
+ printf("calcresidual width: %d failed!\n", 4 << i);
+ return false;
+ }
}
- }
- if ((i < BLOCK_64x64) && opt.copy_cnt[i])
- {
- if (!check_copy_cnt_t(ref.copy_cnt[i], opt.copy_cnt[i]))
+ if (opt.cu[i].transpose)
{
- printf("copy_cnt[%dx%d] failed!\n", 4 << i, 4 << i);
- return false;
+ if (!check_transpose(ref.cu[i].transpose, opt.cu[i].transpose))
+ {
+ printf("transpose[%dx%d] failed\n", 4 << i, 4 << i);
+ return false;
+ }
}
- }
- if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
- {
- if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
+ if (opt.cu[i].ssd_s)
{
- printf("cvt16to32_shr failed!\n");
- return false;
+ if (!check_ssd_s(ref.cu[i].ssd_s, opt.cu[i].ssd_s))
+ {
+ printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
+ return false;
+ }
}
- }
- if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
- {
- if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
+ if (opt.cu[i].copy_cnt)
{
- printf("cvt32to16_shl failed!\n");
- return false;
+ if (!check_copy_cnt_t(ref.cu[i].copy_cnt, opt.cu[i].copy_cnt))
+ {
+ printf("copy_cnt[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
}
- }
- if ((i < BLOCK_64x64) && opt.copy_shl[i])
- {
- if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
+ if (opt.cu[i].cpy2Dto1D_shl)
{
- printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
- return false;
+ if (!check_cpy2Dto1D_shl_t(ref.cu[i].cpy2Dto1D_shl, opt.cu[i].cpy2Dto1D_shl))
+ {
+ printf("cpy2Dto1D_shl failed!\n");
+ return false;
+ }
}
- }
- }
+ if (opt.cu[i].cpy2Dto1D_shr)
+ {
+ if (!check_cpy2Dto1D_shr_t(ref.cu[i].cpy2Dto1D_shr, opt.cu[i].cpy2Dto1D_shr))
+ {
+ printf("cpy2Dto1D_shr failed!\n");
+ return false;
+ }
+ }
- if (opt.cvt32to16_shr)
- {
- if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))
- {
- printf("cvt32to16 failed!\n");
- return false;
- }
- }
+ if (opt.cu[i].cpy1Dto2D_shl)
+ {
+ if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl, opt.cu[i].cpy1Dto2D_shl))
+ {
+ printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
- if (opt.cvt16to32_shl)
- {
- if (!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl))
- {
- printf("cvt16to32_shl failed!\n");
- return false;
+ if (opt.cu[i].cpy1Dto2D_shr)
+ {
+ if (!check_cpy1Dto2D_shr_t(ref.cu[i].cpy1Dto2D_shr, opt.cu[i].cpy1Dto2D_shr))
+ {
+ printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
}
}
@@ -1373,9 +1519,9 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
}
}
- if (opt.frame_init_lowres_core)
+ if (opt.frameInitLowres)
{
- if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core))
+ if (!check_downscale_t(ref.frameInitLowres, opt.frameInitLowres))
{
printf("downscale failed!\n");
return false;
@@ -1418,6 +1564,15 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
}
}
+ if (opt.sign)
+ {
+ if (!check_calSign(ref.sign, opt.sign))
+ {
+ printf("calSign failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuOrgE0)
{
if (!check_saoCuOrgE0_t(ref.saoCuOrgE0, opt.saoCuOrgE0))
@@ -1427,6 +1582,42 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
}
}
+ if (opt.saoCuOrgE1)
+ {
+ if (!check_saoCuOrgE1_t(ref.saoCuOrgE1, opt.saoCuOrgE1))
+ {
+ printf("SAO_EO_1 failed\n");
+ return false;
+ }
+ }
+
+ if (opt.saoCuOrgE2)
+ {
+ if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
+ {
+ printf("SAO_EO_2 failed\n");
+ return false;
+ }
+ }
+
+ if (opt.saoCuOrgE3)
+ {
+ if (!check_saoCuOrgE3_t(ref.saoCuOrgE3, opt.saoCuOrgE3))
+ {
+ printf("SAO_EO_3 failed\n");
+ return false;
+ }
+ }
+
+ if (opt.saoCuOrgB0)
+ {
+ if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
+ {
+ printf("SAO_BO_0 failed\n");
+ return false;
+ }
+ }
+
if (opt.planecopy_sp)
{
if (!check_planecopy_sp(ref.planecopy_sp, opt.planecopy_sp))
@@ -1445,11 +1636,11 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
}
}
- if (opt.copy_shr)
+ if (opt.propagateCost)
{
- if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
+ if (!check_cutree_propagate_cost(ref.propagateCost, opt.propagateCost))
{
- printf("copy_shr failed!\n");
+ printf("propagateCost failed\n");
return false;
}
}
@@ -1464,139 +1655,126 @@ void PixelHarness::measurePartition(int part, const EncoderPrimitives& ref, cons
char header[128];
#define HEADER(str, ...) sprintf(header, str, __VA_ARGS__); printf("%22s", header);
- if (opt.satd[part])
+ if (opt.pu[part].satd)
{
HEADER("satd[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.satd[part], ref.satd[part], pbuf1, STRIDE, fref, STRIDE);
+ REPORT_SPEEDUP(opt.pu[part].satd, ref.pu[part].satd, pbuf1, STRIDE, fref, STRIDE);
}
- if (opt.pixelavg_pp[part])
+ if (opt.pu[part].pixelavg_pp)
{
HEADER("avg_pp[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.pixelavg_pp[part], ref.pixelavg_pp[part], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
+ REPORT_SPEEDUP(opt.pu[part].pixelavg_pp, ref.pu[part].pixelavg_pp, pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
}
- if (opt.sa8d_inter[part])
- {
- HEADER("sa8d[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sa8d_inter[part], ref.sa8d_inter[part], pbuf1, STRIDE, fref, STRIDE);
- }
-
- if (opt.sad[part])
+ if (opt.pu[part].sad)
{
HEADER("sad[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sad[part], ref.sad[part], pbuf1, STRIDE, fref, STRIDE);
+ REPORT_SPEEDUP(opt.pu[part].sad, ref.pu[part].sad, pbuf1, STRIDE, fref, STRIDE);
}
- if (opt.sad_x3[part])
+ if (opt.pu[part].sad_x3)
{
HEADER("sad_x3[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sad_x3[part], ref.sad_x3[part], pbuf1, fref, fref + 1, fref - 1, FENC_STRIDE + 5, &cres[0]);
+ REPORT_SPEEDUP(opt.pu[part].sad_x3, ref.pu[part].sad_x3, pbuf1, fref, fref + 1, fref - 1, FENC_STRIDE + 5, &cres[0]);
}
- if (opt.sad_x4[part])
+ if (opt.pu[part].sad_x4)
{
HEADER("sad_x4[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sad_x4[part], ref.sad_x4[part], pbuf1, fref, fref + 1, fref - 1, fref - INCR, FENC_STRIDE + 5, &cres[0]);
+ REPORT_SPEEDUP(opt.pu[part].sad_x4, ref.pu[part].sad_x4, pbuf1, fref, fref + 1, fref - 1, fref - INCR, FENC_STRIDE + 5, &cres[0]);
}
- if (opt.sse_pp[part])
+ if (opt.pu[part].copy_pp)
{
- HEADER("sse_pp[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sse_pp[part], ref.sse_pp[part], pbuf1, STRIDE, fref, STRIDE);
+ HEADER("copy_pp[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 128);
}
- if (opt.sse_sp[part])
+ if (opt.pu[part].addAvg)
{
- HEADER("sse_sp[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sse_sp[part], ref.sse_sp[part], (int16_t*)pbuf1, STRIDE, fref, STRIDE);
+ HEADER("addAvg[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.pu[part].addAvg, ref.pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
- if (opt.sse_ss[part])
+ if (part < NUM_CU_SIZES)
{
- HEADER("sse_ss[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.sse_ss[part], ref.sse_ss[part], (int16_t*)pbuf1, STRIDE, (int16_t*)fref, STRIDE);
- }
-
- if (opt.luma_copy_pp[part])
- {
- HEADER("luma_copy_pp[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_copy_pp[part], ref.luma_copy_pp[part], pbuf1, 64, pbuf2, 128);
- }
-
- if (opt.luma_copy_sp[part])
- {
- HEADER("luma_copy_sp[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_copy_sp[part], ref.luma_copy_sp[part], pbuf1, 64, sbuf3, 128);
- }
+ if (opt.cu[part].sse_pp)
+ {
+ HEADER("sse_pp[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].sse_pp, ref.cu[part].sse_pp, pbuf1, STRIDE, fref, STRIDE);
+ }
- if (opt.luma_copy_ps[part])
- {
- HEADER("luma_copy_ps[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_copy_ps[part], ref.luma_copy_ps[part], sbuf1, 64, pbuf1, 128);
- }
- if (opt.luma_copy_ss[part])
- {
- HEADER("luma_copy_ss[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_copy_ss[part], ref.luma_copy_ss[part], sbuf1, 64, sbuf2, 128);
- }
- if (opt.luma_addAvg[part])
- {
- HEADER("luma_addAvg[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
- }
- if (part < NUM_SQUARE_BLOCKS)
- {
- if (opt.luma_sub_ps[part])
+ if (opt.cu[part].sse_ss)
{
- HEADER("luma_sub_ps[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ HEADER("sse_ss[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].sse_ss, ref.cu[part].sse_ss, (int16_t*)pbuf1, STRIDE, (int16_t*)fref, STRIDE);
}
- if (opt.luma_add_ps[part])
+ if (opt.cu[part].sub_ps)
{
- HEADER("luma_add_ps[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_add_ps[part], ref.luma_add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+ HEADER("sub_ps[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].sub_ps, ref.cu[part].sub_ps, (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
}
- }
-
- for (int i = 0; i < X265_CSP_COUNT; i++)
- {
- if (opt.chroma[i].copy_pp[part])
+ if (opt.cu[part].add_ps)
{
- HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].copy_pp[part], ref.chroma[i].copy_pp[part], pbuf1, 64, pbuf2, 128);
+ HEADER("add_ps[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].add_ps, ref.cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
}
- if (opt.chroma[i].copy_sp[part])
+ if (opt.cu[part].copy_ss)
{
- HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].copy_sp[part], ref.chroma[i].copy_sp[part], pbuf1, 64, sbuf3, 128);
+ HEADER("copy_ss[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 64, sbuf2, 128);
}
- if (opt.chroma[i].copy_ps[part])
+ if (opt.cu[part].copy_sp)
{
- HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].copy_ps[part], ref.chroma[i].copy_ps[part], sbuf1, 64, pbuf1, 128);
+ HEADER("copy_sp[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].copy_sp, ref.cu[part].copy_sp, pbuf1, 64, sbuf3, 128);
}
- if (opt.chroma[i].copy_ss[part])
+ if (opt.cu[part].copy_ps)
{
- HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].copy_ss[part], ref.chroma[i].copy_ss[part], sbuf1, 64, sbuf2, 128);
+ HEADER("copy_ps[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 64, pbuf1, 128);
}
- if (opt.chroma[i].addAvg[part])
+ }
+
+ for (int i = 0; i < X265_CSP_COUNT; i++)
+ {
+ if (opt.chroma[i].pu[part].copy_pp)
+ {
+ HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].pu[part].copy_pp, ref.chroma[i].pu[part].copy_pp, pbuf1, 64, pbuf2, 128);
+ }
+ if (opt.chroma[i].pu[part].addAvg)
{
HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
+ REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
- if (part < NUM_SQUARE_BLOCKS)
+ if (part < NUM_CU_SIZES)
{
- if (opt.chroma[i].sub_ps[part])
+ if (opt.chroma[i].cu[part].copy_ss)
+ {
+ HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].copy_ss, ref.chroma[i].cu[part].copy_ss, sbuf1, 64, sbuf2, 128);
+ }
+ if (opt.chroma[i].cu[part].copy_ps)
+ {
+ HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].copy_ps, ref.chroma[i].cu[part].copy_ps, sbuf1, 64, pbuf1, 128);
+ }
+ if (opt.chroma[i].cu[part].copy_sp)
+ {
+ HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].copy_sp, ref.chroma[i].cu[part].copy_sp, pbuf1, 64, sbuf3, 128);
+ }
+ if (opt.chroma[i].cu[part].sub_ps)
{
HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].sub_ps[part], ref.chroma[i].sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].sub_ps, ref.chroma[i].cu[part].sub_ps, (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
}
- if (opt.chroma[i].add_ps[part])
+ if (opt.chroma[i].cu[part].add_ps)
{
HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].add_ps[part], ref.chroma[i].add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps, ref.chroma[i].cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
}
}
}
@@ -1638,78 +1816,83 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
}
}
- for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
+ for (int i = 0; i < NUM_CU_SIZES; i++)
{
- if ((i <= BLOCK_32x32) && opt.ssd_s[i])
+ if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s)
{
HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.ssd_s[i], ref.ssd_s[i], sbuf1, STRIDE);
+ REPORT_SPEEDUP(opt.cu[i].ssd_s, ref.cu[i].ssd_s, sbuf1, STRIDE);
}
- if (opt.sa8d[i])
+ if (opt.cu[i].sa8d)
{
HEADER("sa8d[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.sa8d[i], ref.sa8d[i], pbuf1, STRIDE, pbuf2, STRIDE);
+ REPORT_SPEEDUP(opt.cu[i].sa8d, ref.cu[i].sa8d, pbuf1, STRIDE, pbuf2, STRIDE);
}
- if (opt.calcresidual[i])
+ if (opt.cu[i].calcresidual)
{
HEADER("residual[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.calcresidual[i], ref.calcresidual[i], pbuf1, pbuf2, sbuf1, 64);
+ REPORT_SPEEDUP(opt.cu[i].calcresidual, ref.cu[i].calcresidual, pbuf1, pbuf2, sbuf1, 64);
}
- if (opt.blockfill_s[i])
+ if (opt.cu[i].blockfill_s)
{
HEADER("blkfill[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.blockfill_s[i], ref.blockfill_s[i], sbuf1, 64, SHORT_MAX);
+ REPORT_SPEEDUP(opt.cu[i].blockfill_s, ref.cu[i].blockfill_s, sbuf1, 64, SHORT_MAX);
}
- if (opt.transpose[i])
+ if (opt.cu[i].transpose)
{
HEADER("transpose[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.transpose[i], ref.transpose[i], pbuf1, pbuf2, STRIDE);
+ REPORT_SPEEDUP(opt.cu[i].transpose, ref.cu[i].transpose, pbuf1, pbuf2, STRIDE);
}
- if (opt.var[i])
+ if (opt.cu[i].var)
{
HEADER("var[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
+ REPORT_SPEEDUP(opt.cu[i].var, ref.cu[i].var, pbuf1, STRIDE);
}
- if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy2Dto1D_shl)
{
- HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
+ HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shl, ref.cu[i].cpy2Dto1D_shl, sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
}
- if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy2Dto1D_shr)
{
- HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
+ HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shr, ref.cu[i].cpy2Dto1D_shr, sbuf1, sbuf2, STRIDE, 3);
}
- if ((i < BLOCK_64x64) && opt.copy_cnt[i])
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl)
{
- HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
+ HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl, ref.cu[i].cpy1Dto2D_shl, sbuf1, sbuf2, STRIDE, 64);
}
- if ((i < BLOCK_64x64) && opt.copy_shl[i])
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shr)
{
- HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
+ HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shr, ref.cu[i].cpy1Dto2D_shr, sbuf1, sbuf2, STRIDE, 64);
}
- }
+ if ((i < BLOCK_64x64) && opt.cu[i].copy_cnt)
+ {
+ HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].copy_cnt, ref.cu[i].copy_cnt, sbuf1, sbuf2, STRIDE);
+ }
- if (opt.cvt32to16_shr)
- {
- HEADER0("cvt32to16_shr");
- REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64);
- }
+ if (opt.cu[i].psy_cost_pp)
+ {
+ HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].psy_cost_pp, ref.cu[i].psy_cost_pp, pbuf1, STRIDE, pbuf2, STRIDE);
+ }
- if (opt.cvt16to32_shl)
- {
- HEADER0("cvt16to32_shl");
- REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64);
+ if (opt.cu[i].psy_cost_ss)
+ {
+ HEADER("psy_cost_ss[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].psy_cost_ss, ref.cu[i].psy_cost_ss, sbuf1, STRIDE, sbuf2, STRIDE);
+ }
}
if (opt.weight_pp)
@@ -1724,10 +1907,10 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
REPORT_SPEEDUP(opt.weight_sp, ref.weight_sp, (int16_t*)sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100);
}
- if (opt.frame_init_lowres_core)
+ if (opt.frameInitLowres)
{
HEADER0("downscale");
- REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
+ REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
}
if (opt.scale1D_128to64)
@@ -1754,12 +1937,42 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4);
}
+ if (opt.sign)
+ {
+ HEADER0("calSign");
+ REPORT_SPEEDUP(opt.sign, ref.sign, psbuf1, pbuf1, pbuf2, 64);
+ }
+
if (opt.saoCuOrgE0)
{
HEADER0("SAO_EO_0");
REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
}
+ if (opt.saoCuOrgE1)
+ {
+ HEADER0("SAO_EO_1");
+ REPORT_SPEEDUP(opt.saoCuOrgE1, ref.saoCuOrgE1, pbuf1, psbuf2, psbuf1, 64, 64);
+ }
+
+ if (opt.saoCuOrgE2)
+ {
+ HEADER0("SAO_EO_2");
+ REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
+ }
+
+ if (opt.saoCuOrgE3)
+ {
+ HEADER0("SAO_EO_3");
+ REPORT_SPEEDUP(opt.saoCuOrgE3, ref.saoCuOrgE3, pbuf1, psbuf2, psbuf1, 64, 0, 64);
+ }
+
+ if (opt.saoCuOrgB0)
+ {
+ HEADER0("SAO_BO_0");
+ REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
+ }
+
if (opt.planecopy_sp)
{
HEADER0("planecopy_sp");
@@ -1772,10 +1985,9 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
}
- if (opt.copy_shr)
+ if (opt.propagateCost)
{
- HEADER0("copy_shr");
- REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
+ HEADER0("propagateCost");
+ REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
}
-
}
diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h
index 1255d99..7f5db0f 100644
--- a/source/test/pixelharness.h
+++ b/source/test/pixelharness.h
@@ -47,6 +47,10 @@ protected:
pixel pbuf4[BUFFSIZE];
int ibuf1[BUFFSIZE];
int8_t psbuf1[BUFFSIZE];
+ int8_t psbuf2[BUFFSIZE];
+ int8_t psbuf3[BUFFSIZE];
+ int8_t psbuf4[BUFFSIZE];
+ int8_t psbuf5[BUFFSIZE];
int16_t sbuf1[BUFFSIZE];
int16_t sbuf2[BUFFSIZE];
@@ -59,9 +63,9 @@ protected:
int int_test_buff[TEST_CASES][BUFFSIZE];
uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
uint8_t uchar_test_buff[TEST_CASES][BUFFSIZE];
+ double double_test_buff[TEST_CASES][BUFFSIZE];
bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
- bool check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt);
bool check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
bool check_pixelcmp_x3(pixelcmp_x3_t ref, pixelcmp_x3_t opt);
bool check_pixelcmp_x4(pixelcmp_x4_t ref, pixelcmp_x4_t opt);
@@ -80,20 +84,26 @@ protected:
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
bool check_downscale_t(downscale_t ref, downscale_t opt);
- bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
- bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt);
- bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
- bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
+ bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
+ bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
+ bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+ bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
- bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
- bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
bool check_pixel_var(var_t ref, var_t opt);
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
bool check_addAvg(addAvg_t, addAvg_t);
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
+ bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
+ bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
+ bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
+ bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
+ bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
+ bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
+ bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
+ bool check_calSign(sign_t ref, sign_t opt);
public:
@@ -102,7 +112,7 @@ public:
const char *getName() const { return "pixel"; }
bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt);
- bool testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt);
+ bool testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt);
void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt);
void measurePartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt);
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index ef2d9a1..cdb11c1 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -34,7 +34,7 @@
using namespace x265;
-const char* lumaPartStr[NUM_LUMA_PARTITIONS] =
+const char* lumaPartStr[NUM_PU_SIZES] =
{
" 4x4", " 8x8", "16x16", "32x32", "64x64",
" 8x4", " 4x8",
@@ -46,7 +46,7 @@ const char* lumaPartStr[NUM_LUMA_PARTITIONS] =
"64x48", "48x64", "64x16", "16x64",
};
-const char* chromaPartStr420[NUM_CHROMA_PARTITIONS] =
+const char* chromaPartStr420[NUM_PU_SIZES] =
{
" 2x2", " 4x4", " 8x8", "16x16", "32x32",
" 4x2", " 2x4",
@@ -58,7 +58,7 @@ const char* chromaPartStr420[NUM_CHROMA_PARTITIONS] =
"32x24", "24x32", " 32x8", " 8x32",
};
-const char* chromaPartStr422[NUM_CHROMA_PARTITIONS] =
+const char* chromaPartStr422[NUM_PU_SIZES] =
{
" 2x4", " 4x8", " 8x16", "16x32", "32x64",
" 4x4", " 2x8",
@@ -152,8 +152,8 @@ int main(int argc, char *argv[])
EncoderPrimitives cprim;
memset(&cprim, 0, sizeof(EncoderPrimitives));
- Setup_C_Primitives(cprim);
- Setup_Alias_Primitives(cprim);
+ setupCPrimitives(cprim);
+ setupAliasPrimitives(cprim);
struct test_arch_t
{
@@ -180,7 +180,8 @@ int main(int argc, char *argv[])
EncoderPrimitives vecprim;
memset(&vecprim, 0, sizeof(vecprim));
- Setup_Instrinsic_Primitives(vecprim, test_arch[i].flag);
+ setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
+ setupAliasPrimitives(vecprim);
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
{
if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -194,7 +195,8 @@ int main(int argc, char *argv[])
EncoderPrimitives asmprim;
memset(&asmprim, 0, sizeof(asmprim));
- Setup_Assembly_Primitives(asmprim, test_arch[i].flag);
+ setupAssemblyPrimitives(asmprim, test_arch[i].flag);
+ setupAliasPrimitives(asmprim);
memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
{
@@ -212,9 +214,11 @@ int main(int argc, char *argv[])
EncoderPrimitives optprim;
memset(&optprim, 0, sizeof(optprim));
- Setup_Instrinsic_Primitives(optprim, cpuid);
- Setup_Assembly_Primitives(optprim, cpuid);
- Setup_Alias_Primitives(optprim);
+ setupInstrinsicPrimitives(optprim, cpuid);
+ setupAssemblyPrimitives(optprim, cpuid);
+
+ /* Note that we do not setup aliases for performance tests, that would be
+ * redundant. The testbench only verifies they are correctly aliased */
/* some hybrid primitives may rely on other primitives in the
* global primitive table, so set up those pointers. This is a
diff --git a/source/test/testharness.h b/source/test/testharness.h
index 1704f3e..70881e5 100644
--- a/source/test/testharness.h
+++ b/source/test/testharness.h
@@ -44,7 +44,7 @@
using namespace x265;
-extern const char* lumaPartStr[NUM_LUMA_PARTITIONS];
+extern const char* lumaPartStr[NUM_PU_SIZES];
extern const char* const* chromaPartStr[X265_CSP_COUNT];
class TestHarness
diff --git a/source/test/testpool.cpp b/source/test/testpool.cpp
deleted file mode 100644
index 01f037b..0000000
--- a/source/test/testpool.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Steve Borho <steve at borho.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com
- *****************************************************************************/
-
-#include "common.h"
-#include "threadpool.h"
-#include "wavefront.h"
-#include "threading.h"
-#include "md5.h"
-#include "PPA/ppa.h"
-
-#include <sstream>
-#include <iostream>
-
-using namespace x265;
-
-struct CUData
-{
- CUData()
- {
- memset(digest, 0, sizeof(digest));
- }
-
- unsigned char digest[16];
-};
-
-struct RowData
-{
- RowData() : active(false), curCol(0) {}
-
- Lock lock;
- volatile bool active;
- volatile int curCol;
-};
-
-// Create a fake frame class with manufactured data in each CU block. We
-// need to create an MD5 hash such that each CU's hash includes the hashes
-// of the blocks that would have HEVC data dependencies (left, top-left,
-// top, top-right). This will give us one deterministic output hash. We
-// then generate the same hash using the thread pool and wave-front parallelism
-// to verify the thread-pool behavior and the wave-front schedule data
-// structures.
-class MD5Frame : public WaveFront
-{
-private:
-
- CUData *cu;
- RowData *row;
- int numrows;
- int numcols;
- Event complete;
-
-public:
-
- MD5Frame(ThreadPool *pool) : WaveFront(pool), cu(0), row(0) {}
-
- virtual ~MD5Frame()
- {
- // ensure no threads are lingering on FindJob() before allowing
- // this object's vtable to be destroyed
- JobProvider::flush();
-
- delete[] this->cu;
- delete[] this->row;
- }
-
- void initialize(int cols, int rows);
-
- void encode();
-
- void processRow(int row, int threadid);
-};
-
-void MD5Frame::initialize(int cols, int rows)
-{
- this->cu = new CUData[rows * cols];
- this->row = new RowData[rows];
- this->numrows = rows;
- this->numcols = cols;
-
- if (!this->WaveFront::init(rows))
- {
- assert(!"Unable to initialize job queue");
- }
-}
-
-void MD5Frame::encode()
-{
- this->JobProvider::enqueue();
-
- this->WaveFront::enqueueRow(0);
-
- // NOTE: When EnableRow after enqueueRow at first row, we'd better call pokeIdleThread, it will release a thread to do job
- this->WaveFront::enableRow(0);
- this->m_pool->pokeIdleThread();
-
- this->complete.wait();
-
- this->JobProvider::dequeue();
-
- unsigned int *outdigest = (unsigned int*)this->cu[this->numrows * this->numcols - 1].digest;
-
- std::stringstream ss;
-
- for (int i = 0; i < 4; i++)
- {
- ss << std::hex << outdigest[i];
- }
-
- if (ss.str().compare("da667b741a7a9d0ee862158da2dd1882"))
- std::cout << "Bad hash: " << ss.str() << std::endl;
-}
-
-void MD5Frame::processRow(int rownum, int)
-{
- // Called by worker thread
- RowData &curRow = this->row[rownum];
-
- assert(rownum < this->numrows && rownum >= 0);
- assert(curRow.curCol < this->numcols);
-
- while (curRow.curCol < this->numcols)
- {
- int id = rownum * this->numcols + curRow.curCol;
- CUData &curCTU = this->cu[id];
- MD5 hash;
-
- // * Fake CTU processing *
- PPAStartCpuEventFunc(encode_block);
- memset(curCTU.digest, id, sizeof(curCTU.digest));
- hash.update(curCTU.digest, sizeof(curCTU.digest));
- if (curRow.curCol > 0)
- hash.update(this->cu[id - 1].digest, sizeof(curCTU.digest));
-
- if (rownum > 0)
- {
- if (curRow.curCol > 0)
- hash.update(this->cu[id - this->numcols - 1].digest, sizeof(curCTU.digest));
-
- hash.update(this->cu[id - this->numcols].digest, sizeof(curCTU.digest));
- if (curRow.curCol < this->numcols - 1)
- hash.update(this->cu[id - this->numcols + 1].digest, sizeof(curCTU.digest));
- }
-
- hash.finalize(curCTU.digest);
- PPAStopCpuEventFunc(encode_block);
-
- curRow.curCol++;
-
- if (curRow.curCol >= 2 && rownum < this->numrows - 1)
- {
- ScopedLock below(this->row[rownum + 1].lock);
-
- if (this->row[rownum + 1].active == false &&
- this->row[rownum + 1].curCol + 2 <= curRow.curCol)
- {
- // set active indicator so row is only enqueued once
- // row stays marked active until blocked or done
- this->row[rownum + 1].active = true;
- this->WaveFront::enqueueRow(rownum + 1);
- this->WaveFront::enableRow(rownum + 1);
- }
- }
-
- ScopedLock self(curRow.lock);
-
- if (rownum > 0 &&
- curRow.curCol < this->numcols - 1 &&
- this->row[rownum - 1].curCol < curRow.curCol + 2)
- {
- // row is blocked, quit job
- curRow.active = false;
- return;
- }
- }
-
- // * Row completed *
-
- if (rownum == this->numrows - 1)
- this->complete.trigger();
-}
-
-int main(int, char **)
-{
- ThreadPool *pool;
-
- PPA_INIT();
-
- pool = ThreadPool::allocThreadPool(1);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
- pool = ThreadPool::allocThreadPool(2);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
- pool = ThreadPool::allocThreadPool(4);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
- pool = ThreadPool::allocThreadPool(8);
- {
- MD5Frame frame(pool);
- frame.initialize(60, 40);
- frame.encode();
- }
- pool->release();
-
- return 0;
-}
diff --git a/source/x265.cpp b/source/x265.cpp
index 474cea9..9eae5c5 100644
--- a/source/x265.cpp
+++ b/source/x265.cpp
@@ -32,17 +32,16 @@
#include "param.h"
#include "cpu.h"
#include "x265.h"
+#include "x265cli.h"
#if HAVE_VLD
/* Visual Leak Detector */
#include <vld.h>
#endif
-#include "PPA/ppa.h"
#include <signal.h>
#include <errno.h>
#include <fcntl.h>
-#include <getopt.h>
#include <string>
#include <ostream>
@@ -57,162 +56,6 @@
using namespace x265;
-static const char short_options[] = "o:p:f:F:r:I:i:b:s:t:q:m:hwV?";
-static const struct option long_options[] =
-{
- { "help", no_argument, NULL, 'h' },
- { "version", no_argument, NULL, 'V' },
- { "asm", required_argument, NULL, 0 },
- { "no-asm", no_argument, NULL, 0 },
- { "threads", required_argument, NULL, 0 },
- { "preset", required_argument, NULL, 'p' },
- { "tune", required_argument, NULL, 't' },
- { "frame-threads", required_argument, NULL, 'F' },
- { "no-pmode", no_argument, NULL, 0 },
- { "pmode", no_argument, NULL, 0 },
- { "no-pme", no_argument, NULL, 0 },
- { "pme", no_argument, NULL, 0 },
- { "log-level", required_argument, NULL, 0 },
- { "profile", required_argument, NULL, 0 },
- { "level-idc", required_argument, NULL, 0 },
- { "high-tier", no_argument, NULL, 0 },
- { "no-high-tier", no_argument, NULL, 0 },
- { "csv", required_argument, NULL, 0 },
- { "no-cu-stats", no_argument, NULL, 0 },
- { "cu-stats", no_argument, NULL, 0 },
- { "y4m", no_argument, NULL, 0 },
- { "no-progress", no_argument, NULL, 0 },
- { "output", required_argument, NULL, 'o' },
- { "input", required_argument, NULL, 0 },
- { "input-depth", required_argument, NULL, 0 },
- { "input-res", required_argument, NULL, 0 },
- { "input-csp", required_argument, NULL, 0 },
- { "interlace", required_argument, NULL, 0 },
- { "no-interlace", no_argument, NULL, 0 },
- { "fps", required_argument, NULL, 0 },
- { "seek", required_argument, NULL, 0 },
- { "frame-skip", required_argument, NULL, 0 },
- { "frames", required_argument, NULL, 'f' },
- { "recon", required_argument, NULL, 'r' },
- { "recon-depth", required_argument, NULL, 0 },
- { "no-wpp", no_argument, NULL, 0 },
- { "wpp", no_argument, NULL, 0 },
- { "ctu", required_argument, NULL, 's' },
- { "tu-intra-depth", required_argument, NULL, 0 },
- { "tu-inter-depth", required_argument, NULL, 0 },
- { "me", required_argument, NULL, 0 },
- { "subme", required_argument, NULL, 'm' },
- { "merange", required_argument, NULL, 0 },
- { "max-merge", required_argument, NULL, 0 },
- { "no-temporal-mvp", no_argument, NULL, 0 },
- { "temporal-mvp", no_argument, NULL, 0 },
- { "rdpenalty", required_argument, NULL, 0 },
- { "no-rect", no_argument, NULL, 0 },
- { "rect", no_argument, NULL, 0 },
- { "no-amp", no_argument, NULL, 0 },
- { "amp", no_argument, NULL, 0 },
- { "no-early-skip", no_argument, NULL, 0 },
- { "early-skip", no_argument, NULL, 0 },
- { "no-fast-cbf", no_argument, NULL, 0 },
- { "fast-cbf", no_argument, NULL, 0 },
- { "no-tskip", no_argument, NULL, 0 },
- { "tskip", no_argument, NULL, 0 },
- { "no-tskip-fast", no_argument, NULL, 0 },
- { "tskip-fast", no_argument, NULL, 0 },
- { "cu-lossless", no_argument, NULL, 0 },
- { "no-cu-lossless", no_argument, NULL, 0 },
- { "no-constrained-intra", no_argument, NULL, 0 },
- { "constrained-intra", no_argument, NULL, 0 },
- { "fast-intra", no_argument, NULL, 0 },
- { "no-fast-intra", no_argument, NULL, 0 },
- { "no-open-gop", no_argument, NULL, 0 },
- { "open-gop", no_argument, NULL, 0 },
- { "keyint", required_argument, NULL, 'I' },
- { "min-keyint", required_argument, NULL, 'i' },
- { "scenecut", required_argument, NULL, 0 },
- { "no-scenecut", no_argument, NULL, 0 },
- { "rc-lookahead", required_argument, NULL, 0 },
- { "bframes", required_argument, NULL, 'b' },
- { "bframe-bias", required_argument, NULL, 0 },
- { "b-adapt", required_argument, NULL, 0 },
- { "no-b-adapt", no_argument, NULL, 0 },
- { "no-b-pyramid", no_argument, NULL, 0 },
- { "b-pyramid", no_argument, NULL, 0 },
- { "ref", required_argument, NULL, 0 },
- { "no-weightp", no_argument, NULL, 0 },
- { "weightp", no_argument, NULL, 'w' },
- { "no-weightb", no_argument, NULL, 0 },
- { "weightb", no_argument, NULL, 0 },
- { "crf", required_argument, NULL, 0 },
- { "crf-max", required_argument, NULL, 0 },
- { "crf-min", required_argument, NULL, 0 },
- { "vbv-maxrate", required_argument, NULL, 0 },
- { "vbv-bufsize", required_argument, NULL, 0 },
- { "vbv-init", required_argument, NULL, 0 },
- { "bitrate", required_argument, NULL, 0 },
- { "qp", required_argument, NULL, 'q' },
- { "aq-mode", required_argument, NULL, 0 },
- { "aq-strength", required_argument, NULL, 0 },
- { "ipratio", required_argument, NULL, 0 },
- { "pbratio", required_argument, NULL, 0 },
- { "cbqpoffs", required_argument, NULL, 0 },
- { "crqpoffs", required_argument, NULL, 0 },
- { "rd", required_argument, NULL, 0 },
- { "psy-rd", required_argument, NULL, 0 },
- { "psy-rdoq", required_argument, NULL, 0 },
- { "scaling-list", required_argument, NULL, 0 },
- { "lossless", no_argument, NULL, 0 },
- { "no-lossless", no_argument, NULL, 0 },
- { "no-signhide", no_argument, NULL, 0 },
- { "signhide", no_argument, NULL, 0 },
- { "no-lft", no_argument, NULL, 0 },
- { "lft", no_argument, NULL, 0 },
- { "no-sao", no_argument, NULL, 0 },
- { "sao", no_argument, NULL, 0 },
- { "no-sao-non-deblock", no_argument, NULL, 0 },
- { "sao-non-deblock", no_argument, NULL, 0 },
- { "no-ssim", no_argument, NULL, 0 },
- { "ssim", no_argument, NULL, 0 },
- { "no-psnr", no_argument, NULL, 0 },
- { "psnr", no_argument, NULL, 0 },
- { "hash", required_argument, NULL, 0 },
- { "no-strong-intra-smoothing", no_argument, NULL, 0 },
- { "strong-intra-smoothing", no_argument, NULL, 0 },
- { "no-cutree", no_argument, NULL, 0 },
- { "cutree", no_argument, NULL, 0 },
- { "no-hrd", no_argument, NULL, 0 },
- { "hrd", no_argument, NULL, 0 },
- { "sar", required_argument, NULL, 0 },
- { "overscan", required_argument, NULL, 0 },
- { "videoformat", required_argument, NULL, 0 },
- { "range", required_argument, NULL, 0 },
- { "colorprim", required_argument, NULL, 0 },
- { "transfer", required_argument, NULL, 0 },
- { "colormatrix", required_argument, NULL, 0 },
- { "chromaloc", required_argument, NULL, 0 },
- { "crop-rect", required_argument, NULL, 0 },
- { "no-dither", no_argument, NULL, 0 },
- { "dither", no_argument, NULL, 0 },
- { "no-repeat-headers", no_argument, NULL, 0 },
- { "repeat-headers", no_argument, NULL, 0 },
- { "aud", no_argument, NULL, 0 },
- { "no-aud", no_argument, NULL, 0 },
- { "info", no_argument, NULL, 0 },
- { "no-info", no_argument, NULL, 0 },
- { "qpfile", required_argument, NULL, 0 },
- { "lambda-file", required_argument, NULL, 0 },
- { "b-intra", no_argument, NULL, 0 },
- { "no-b-intra", no_argument, NULL, 0 },
- { "nr", required_argument, NULL, 0 },
- { "stats", required_argument, NULL, 0 },
- { "pass", required_argument, NULL, 0 },
- { "slow-firstpass", no_argument, NULL, 0 },
- { "no-slow-firstpass", no_argument, NULL, 0 },
- { "analysis-mode", required_argument, NULL, 0 },
- { "analysis-file", required_argument, NULL, 0 },
- { 0, 0, 0, 0 }
-};
-
/* Ctrl-C handler */
static volatile sig_atomic_t b_ctrl_c /* = 0 */;
static void sigint_handler(int)
@@ -246,6 +89,7 @@ struct CLIOptions
CLIOptions()
{
+ frameRate = 0.f;
input = NULL;
recon = NULL;
framesToBeEncoded = seek = 0;
@@ -264,12 +108,8 @@ struct CLIOptions
void destroy();
void writeNALs(const x265_nal* nal, uint32_t nalcount);
void printStatus(uint32_t frameNum, x265_param *param);
- void printVersion(x265_param *param);
- void showHelp(x265_param *param);
bool parse(int argc, char **argv, x265_param* param);
bool parseQPFile(x265_picture &pic_org);
- void readAnalysisFile(x265_picture* pic, x265_param*);
- void writeAnalysisFile(x265_picture* pic, x265_param*);
bool validateFanout(x265_param*);
};
@@ -291,7 +131,7 @@ void CLIOptions::destroy()
void CLIOptions::writeNALs(const x265_nal* nal, uint32_t nalcount)
{
- PPAScopeEvent(bitstream_write);
+ ProfileScopeEvent(bitstreamWrite);
for (uint32_t i = 0; i < nalcount; i++)
{
bitstreamFile.write((const char*)nal->payload, nal->sizeBytes);
@@ -327,177 +167,6 @@ void CLIOptions::printStatus(uint32_t frameNum, x265_param *param)
prevUpdateTime = time;
}
-void CLIOptions::printVersion(x265_param *param)
-{
- x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str);
- x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str);
-}
-
-void CLIOptions::showHelp(x265_param *param)
-{
- x265_param_default(param);
- printVersion(param);
-
-#define H0 printf
-#define OPT(value) (value ? "enabled" : "disabled")
- H0("\nSyntax: x265 [options] infile [-o] outfile\n");
- H0(" infile can be YUV or Y4M\n");
- H0(" outfile is raw HEVC bitstream\n");
- H0("\nExecutable Options:\n");
- H0("-h/--help Show this help text and exit\n");
- H0("-V/--version Show version info and exit\n");
- H0("\nOutput Options:\n");
- H0("-o/--output <filename> Bitstream output file name\n");
- H0(" --log-level <string> Logging level: none error warning info debug full. Default %s\n", logLevelNames[param->logLevel + 1]);
- H0(" --no-progress Disable CLI progress reports\n");
- H0(" --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats));
- H0(" --csv <filename> Comma separated log file, log level >= 3 frame log, else one line per run\n");
- H0("\nInput Options:\n");
- H0(" --input <filename> Raw YUV or Y4M input file name. `-` for stdin\n");
- H0(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
- H0(" --fps <float|rational> Source frame rate (float or num/denom), auto-detected if Y4M\n");
- H0(" --input-res WxH Source picture size [w x h], auto-detected if Y4M\n");
- H0(" --input-depth <integer> Bit-depth of input file. Default 8\n");
- H0(" --input-csp <string> Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
- H0("-f/--frames <integer> Maximum number of frames to encode. Default all\n");
- H0(" --seek <integer> First frame to encode\n");
- H0(" --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive\n");
- H0(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n");
- H0("\nQuality reporting metrics:\n");
- H0(" --[no-]ssim Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
- H0(" --[no-]psnr Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
- H0("\nProfile, Level, Tier:\n");
- H0(" --profile <string> Enforce an encode profile: main, main10, mainstillpicture\n");
- H0(" --level-idc <integer|float> Force a minumum required decoder level (as '5.0' or '50')\n");
- H0(" --[no-]high-tier If a decoder level is specified, this modifier selects High tier of that level\n");
- H0("\nThreading, performance:\n");
- H0(" --threads <integer> Number of threads for thread pool (0: detect CPU core count, default)\n");
- H0("-F/--frame-threads <integer> Number of concurrently encoded frames. 0: auto-determined by core count\n");
- H0(" --[no-]wpp Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
- H0(" --[no-]pmode Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
- H0(" --[no-]pme Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
- H0(" --[no-]asm <bool|int|string> Override CPU detection. Default: auto\n");
- H0("\nPresets:\n");
- H0("-p/--preset <string> Trade off performance for compression efficiency. Default medium\n");
- H0(" ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
- H0("-t/--tune <string> Tune the settings for a particular type of source or situation:\n");
- H0(" psnr, ssim, zerolatency, or fastdecode\n");
- H0("\nQuad-Tree size and depth:\n");
- H0("-s/--ctu <64|32|16> Maximum CU size (default: 64x64). Default %d\n", param->maxCUSize);
- H0(" --tu-intra-depth <integer> Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
- H0(" --tu-inter-depth <integer> Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
- H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
- H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
- H0("\nAnalysis:\n");
- H0(" --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
- H0(" --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %f\n", param->psyRd);
- H0(" --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %f\n", param->psyRdoq);
- H0(" --nr <integer> An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled\n");
- H0(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
- H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
- H0(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
- H0("\nCoding tools:\n");
- H0("-w/--[no-]weightp Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
- H0(" --[no-]weightb Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
- H0(" --[no-]cu-lossless Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
- H0(" --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
- H0(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
- H0("\nTemporal / motion search options:\n");
- H0(" --me <string> Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
- H0("-m/--subme <integer> Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
- H0(" --merange <integer> Motion search range. Default %d\n", param->searchRange);
- H0(" --max-merge <1..5> Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
- H0(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
- H0("\nSpatial / intra options:\n");
- H0(" --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
- H0(" --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
- H0(" --[no-]b-intra Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames));
- H0(" --[no-]fast-intra Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra));
- H0(" --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
- H0("\nSlice decision options:\n");
- H0(" --[no-]open-gop Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
- H0("-I/--keyint <integer> Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
- H0("-i/--min-keyint <integer> Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
- H0(" --no-scenecut Disable adaptive I-frame decision\n");
- H0(" --scenecut <integer> How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
- H0(" --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
- H0(" --bframes <integer> Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
- H0(" --bframe-bias <integer> Bias towards B frame decisions. Default %d\n", param->bFrameBias);
- H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
- H0(" --[no-]b-pyramid Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
- H0(" --ref <integer> max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
- H0(" --qpfile <string> Force frametypes and QPs for some or all frames\n");
- H0(" Format of each line: framenumber frametype QP\n");
- H0(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
- H0(" QPs are restricted by qpmin/qpmax.\n");
- H0("\nRate control, Quantization:\n");
- H0(" --bitrate <integer> Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
- H0("-q/--qp <integer> QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
- H0(" --crf <float> Quality-based VBR (0-51). Default %f\n", param->rc.rfConstant);
- H0(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
- H0(" --crf-max <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
- H0(" May cause VBV underflows!\n");
- H0(" --crf-min <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
- H0(" this specifies a minimum rate factor value for encode!\n");
- H0(" --vbv-maxrate <integer> Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
- H0(" --vbv-bufsize <integer> Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
- H0(" --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %f\n", param->rc.vbvBufferInit);
- H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
- H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default %f\n", param->rc.aqStrength);
- H0(" --[no-]cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
- H0(" --ipratio <float> QP factor between I and P. Default %f\n", param->rc.ipFactor);
- H0(" --pbratio <float> QP factor between P and B. Default %f\n", param->rc.pbFactor);
- H0(" --cbqpoffs <integer> Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
- H0(" --crqpoffs <integer> Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
- H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
- H0(" --pass Multi pass rate control.\n"
- " - 1 : First pass, creates stats file\n"
- " - 2 : Last pass, does not overwrite stats file\n"
- " - 3 : Nth pass, overwrites stats file\n");
- H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
- H0(" --analysis-mode <string|int> save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
- H0(" --analysis-file <filename> Specify file name used for either dumping or reading analysis data.\n");
- H0(" --scaling-list <string> Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
- H0(" --lambda-file <string> Specify a file containing replacement values for the lambda tables\n");
- H0(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
- H0(" Blank lines and lines starting with hash(#) are ignored\n");
- H0(" Comma is considered to be white-space\n");
- H0("\nLoop filters (deblock and SAO):\n");
- H0(" --[no-]lft Enable Deblocking Loop Filter. Default %s\n", OPT(param->bEnableLoopFilter));
- H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
- H0(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
- H0("\nVUI options:\n");
- H0(" --sar <width:height|int> Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
- H0(" Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
- H0(" 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
- H0(" 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
- H0(" --crop-rect <string> Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
- H0(" --overscan <string> Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
- H0(" --videoformat <string> Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
- H0(" --range <string> Specify black level and range of luma and chroma signals as full or limited Default limited\n");
- H0(" --colorprim <string> Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n");
- H0(" smpte240m, film, bt2020. Default undef\n");
- H0(" --transfer <string> Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
- H0(" smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
- H0(" bt2020-10, bt2020-12. Default undef\n");
- H0(" --colormatrix <string> Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
- H0(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
- H0(" --chromaloc <integer> Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
- H0("\nBitstream options:\n");
- H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
- H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
- H0(" --[no-]hrd Enable HRD parameters signalling. Default %s\n", OPT(param->bEmitHRDSEI));
- H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
- H0(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
- H0("\nReconstructed video options (debugging):\n");
- H0("-r/--recon <filename> Reconstructed raw image YUV or Y4M output file name\n");
- H0(" --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
-#undef OPT
-#undef H0
- printf("\n\nFull documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
- exit(0);
-}
-
bool CLIOptions::parse(int argc, char **argv, x265_param* param)
{
bool bError = 0;
@@ -510,7 +179,6 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
const char *preset = NULL;
const char *tune = NULL;
const char *profile = NULL;
- const char *analysisfn = "x265_analysis.dat";
if (argc <= 1)
{
@@ -603,7 +271,6 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
OPT("profile") profile = optarg; /* handled last */
OPT("preset") /* handled above */;
OPT("tune") /* handled above */;
- OPT("analysis-file") analysisfn = optarg;
OPT("qpfile")
{
this->qpfile = fopen(optarg, "rb");
@@ -645,19 +312,11 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
return true;
}
-#if HIGH_BIT_DEPTH
- if (param->internalBitDepth != 10)
+ if (param->internalBitDepth != x265_max_bit_depth)
{
- x265_log(param, X265_LOG_ERROR, "Only bit depths of 10 are supported in this build\n");
+ x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", x265_max_bit_depth);
return true;
}
-#else
- if (param->internalBitDepth != 8)
- {
- x265_log(param, X265_LOG_ERROR, "Only bit depths of 8 are supported in this build\n");
- return true;
- }
-#endif // if HIGH_BIT_DEPTH
InputFileInfo info;
info.filename = inputfn;
@@ -751,163 +410,9 @@ bool CLIOptions::parse(int argc, char **argv, x265_param* param)
x265_log(NULL, X265_LOG_ERROR, "failed to open bitstream file <%s> for writing\n", bitstreamfn);
return true;
}
-
- if (param->analysisMode)
- {
- const char *mode = param->analysisMode == X265_ANALYSIS_SAVE ? "wb" : "rb";
- this->analysisFile = fopen(analysisfn, mode);
- if (!this->analysisFile)
- {
- x265_log(NULL, X265_LOG_ERROR, "failed to open analysis file %s\n", analysisfn);
- return true;
- }
- }
-
return false;
}
-bool CLIOptions::validateFanout(x265_param *param)
-{
-#define CMP_OPT_FANOUT(opt, param_val)\
- {\
- bErr = 0;\
- p = strstr(paramBuf, opt "=");\
- char* q = strstr(paramBuf, "no-"opt);\
- if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
- bErr = 1;\
- else if (!param_val && !q)\
- bErr = 1;\
- else if (param_val && (q || !strstr(paramBuf, opt)))\
- bErr = 1;\
- if (bErr)\
- {\
- x265_log(param, X265_LOG_ERROR, "different " opt " setting than given in analysis file (%d vs %d)\n", param_val, i);\
- X265_FREE(paramBuf);\
- return false;\
- }\
- }
-
- char *p = NULL, *paramBuf;
- int i, j;
- uint32_t k , l;
- bool bErr = false;
-
- paramBuf = X265_MALLOC(char, MAXPARAMSIZE);
- if (!paramBuf)
- return false;
-
- fread(paramBuf, 1, MAXPARAMSIZE, this->analysisFile);
-
- /* check whether fanout options are compatible */
- if (strncmp(paramBuf, "#options:", 9))
- {
- x265_log(param, X265_LOG_ERROR, "options list in analysis file is not valid\n");
- X265_FREE(paramBuf);
- return false;
- }
-
- char* buf = strchr(paramBuf, '\n');
- if (!buf)
- {
- x265_log(param, X265_LOG_ERROR, "Malformed analysis file\n");
- X265_FREE(paramBuf);
- return false;
- }
- *buf = '\0';
- fseek(this->analysisFile, (int)strlen(paramBuf) + 1, SEEK_SET);
-
- if (sscanf(paramBuf, "#options: %dx%d", &i, &j) != 2)
- {
- x265_log(param, X265_LOG_ERROR, "Resolution specified in analysis file is not valid\n");
- X265_FREE(paramBuf);
- return false;
- }
- if ((p = strstr(paramBuf, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
- {
- x265_log(param, X265_LOG_ERROR, "fps specified in analysis file is not valid\n");
- X265_FREE(paramBuf);
- return false;
- }
- if (k != param->fpsNum || l != param->fpsDenom)
- {
- x265_log(param, X265_LOG_ERROR, "fps mismatch than given in analysis file (%u/%u vs %u/%u)\n",
- param->fpsNum, param->fpsDenom, k, l);
- X265_FREE(paramBuf);
- return false;
- }
-
- CMP_OPT_FANOUT("bitdepth", param->internalBitDepth);
- CMP_OPT_FANOUT("weightp", param->bEnableWeightedPred);
- CMP_OPT_FANOUT("bframes", param->bframes);
- CMP_OPT_FANOUT("b-pyramid", param->bBPyramid);
- CMP_OPT_FANOUT("b-adapt", param->bFrameAdaptive);
- CMP_OPT_FANOUT("open-gop", param->bOpenGOP);
- CMP_OPT_FANOUT("keyint", param->keyframeMax);
- CMP_OPT_FANOUT("min-keyint", param->keyframeMin);
- CMP_OPT_FANOUT("scenecut", param->scenecutThreshold);
- CMP_OPT_FANOUT("ctu", (int)param->maxCUSize);
- CMP_OPT_FANOUT("ref", param->maxNumReferences);
- CMP_OPT_FANOUT("rc-lookahead", param->lookaheadDepth);
-
-#undef CMP_OPT_FANOUT
-
- X265_FREE(paramBuf);
- return true;
-}
-
-void CLIOptions::readAnalysisFile(x265_picture* pic, x265_param* p)
-{
- int poc, width, height;
- uint32_t numPart, numCU;
- fread(&width, sizeof(int), 1, this->analysisFile);
- fread(&height, sizeof(int), 1, this->analysisFile);
- fread(&poc, sizeof(int), 1, this->analysisFile);
- fread(&pic->sliceType, sizeof(int), 1, this->analysisFile);
- fread(&numCU, sizeof(int), 1, this->analysisFile);
- fread(&numPart, sizeof(int), 1, this->analysisFile);
-
- if (poc != pic->poc || width != p->sourceWidth || height != p->sourceHeight)
- {
- x265_log(NULL, X265_LOG_WARNING, "Error in reading intra-inter data.\n");
- x265_free_analysis_data(pic);
- return;
- }
-
- fread(pic->analysisData.intraData->depth,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->modes,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->partSizes,
- sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->poc,
- sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.intraData->cuAddr,
- sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile);
- fread(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile);
-}
-
-void CLIOptions::writeAnalysisFile(x265_picture* pic, x265_param *p)
-{
- uint64_t seekTo = pic->poc * this->analysisRecordSize + this->analysisHeaderSize;
- fseeko(this->analysisFile, seekTo, SEEK_SET);
- fwrite(&p->sourceWidth, sizeof(int), 1, this->analysisFile);
- fwrite(&p->sourceHeight, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->poc, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->sliceType, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->analysisData.numCUsInFrame, sizeof(int), 1, this->analysisFile);
- fwrite(&pic->analysisData.numPartitions, sizeof(int), 1, this->analysisFile);
-
- fwrite(pic->analysisData.intraData->depth,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->modes,
- sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->partSizes,
- sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->poc, sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.intraData->cuAddr, sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile);
- fwrite(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile);
-}
-
bool CLIOptions::parseQPFile(x265_picture &pic_org)
{
int32_t num = -1, qp, ret;
@@ -942,13 +447,22 @@ bool CLIOptions::parseQPFile(x265_picture &pic_org)
return 1;
}
+/* CLI return codes:
+ *
+ * 0 - encode successful
+ * 1 - unable to parse command line
+ * 2 - unable to open encoder
+ * 3 - unable to generate stream headers
+ * 4 - encoder abort */
+
int main(int argc, char **argv)
{
#if HAVE_VLD
// This uses Microsoft's proprietary WCHAR type, but this only builds on Windows to start with
VLDSetReportOptions(VLD_OPT_REPORT_TO_DEBUGGER | VLD_OPT_REPORT_TO_FILE, L"x265_leaks.txt");
#endif
- PPA_INIT();
+ PROFILE_INIT();
+ THREAD_NAME("API", 0);
x265_param *param = x265_param_alloc();
CLIOptions cliopt;
@@ -967,7 +481,7 @@ int main(int argc, char **argv)
cliopt.destroy();
x265_param_free(param);
x265_cleanup();
- exit(1);
+ exit(2);
}
/* get the encoder parameters post-initialization */
@@ -979,19 +493,22 @@ int main(int argc, char **argv)
x265_picture pic_orig, pic_out;
x265_picture *pic_in = &pic_orig;
- x265_picture *pic_recon = cliopt.recon ? &pic_out : NULL;
+ /* Allocate recon picture if analysisMode is enabled */
+ x265_picture *pic_recon = (cliopt.recon || !!param->analysisMode) ? &pic_out : NULL;
uint32_t inFrameCount = 0;
uint32_t outFrameCount = 0;
x265_nal *p_nal;
x265_stats stats;
uint32_t nal;
int16_t *errorBuf = NULL;
+ int ret = 0;
if (!param->bRepeatHeaders)
{
if (x265_encoder_headers(encoder, &p_nal, &nal) < 0)
{
x265_log(param, X265_LOG_ERROR, "Failure generating stream headers\n");
+ ret = 3;
goto fail;
}
else
@@ -1000,38 +517,6 @@ int main(int argc, char **argv)
x265_picture_init(param, pic_in);
- if (param->analysisMode && !pic_recon)
- {
- x265_log(NULL, X265_LOG_ERROR, "Must specify recon with analysis-mode option.\n");
- goto fail;
- }
- if (param->analysisMode)
- {
- if (param->analysisMode == X265_ANALYSIS_SAVE)
- {
- char *p = x265_param2string(param);
- if (!p)
- {
- x265_log(NULL, X265_LOG_ERROR, "analysis: buffer allocation failure, aborting");
- goto fail;
- }
- uint32_t numCU = pic_in->analysisData.numCUsInFrame;
- uint32_t numPart = pic_in->analysisData.numPartitions;
-
- cliopt.analysisRecordSize = ((sizeof(int) * 4 + sizeof(uint32_t) * 2) + sizeof(x265_inter_data) * numCU * 85 +
- sizeof(uint8_t) * 2 * numPart * numCU + sizeof(char) * numPart * numCU + sizeof(int) * numCU + sizeof(uint32_t) * numCU);
-
- fprintf(cliopt.analysisFile, "#options: %s\n", p);
- cliopt.analysisHeaderSize = ftell(cliopt.analysisFile);
- X265_FREE(p);
- }
- else
- {
- if (!cliopt.validateFanout(param))
- goto fail;
- }
- }
-
if (cliopt.bDither)
{
errorBuf = X265_MALLOC(int16_t, param->sourceWidth + 1);
@@ -1069,52 +554,37 @@ int main(int argc, char **argv)
ditherImage(*pic_in, param->sourceWidth, param->sourceHeight, errorBuf, X265_DEPTH);
pic_in->bitDepth = X265_DEPTH;
}
- if (param->analysisMode)
- {
- x265_alloc_analysis_data(pic_in);
-
- if (param->analysisMode == X265_ANALYSIS_LOAD)
- cliopt.readAnalysisFile(pic_in, param);
- }
}
int numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, pic_in, pic_recon);
if (numEncoded < 0)
{
b_ctrl_c = 1;
+ ret = 4;
break;
}
outFrameCount += numEncoded;
- if (numEncoded && pic_recon)
- {
- cliopt.recon->writePicture(pic_out);
- if (param->analysisMode == X265_ANALYSIS_SAVE)
- cliopt.writeAnalysisFile(pic_recon, param);
- if (param->analysisMode)
- x265_free_analysis_data(pic_recon);
- }
+ if (numEncoded && pic_recon && cliopt.recon)
+ cliopt.recon->writePicture(pic_out);
if (nal)
cliopt.writeNALs(p_nal, nal);
- // Because x265_encoder_encode() lazily encodes entire GOPs, updates are per-GOP
cliopt.printStatus(outFrameCount, param);
}
/* Flush the encoder */
while (!b_ctrl_c)
{
- uint32_t numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon);
- outFrameCount += numEncoded;
- if (numEncoded && pic_recon)
+ int numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon);
+ if (numEncoded < 0)
{
- cliopt.recon->writePicture(pic_out);
- if (param->analysisMode == X265_ANALYSIS_SAVE)
- cliopt.writeAnalysisFile(pic_recon, param);
- if (param->analysisMode)
- x265_free_analysis_data(pic_recon);
+ ret = 4;
+ break;
}
-
+ outFrameCount += numEncoded;
+ if (numEncoded && pic_recon && cliopt.recon)
+ cliopt.recon->writePicture(pic_out);
if (nal)
cliopt.writeNALs(p_nal, nal);
@@ -1168,5 +638,6 @@ fail:
#if HAVE_VLD
assert(VLDReportLeaks() == 0);
#endif
- return 0;
+
+ return ret;
}
diff --git a/source/x265.def.in b/source/x265.def.in
index e78bfc1..9e964f6 100644
--- a/source/x265.def.in
+++ b/source/x265.def.in
@@ -9,8 +9,6 @@ x265_param_free
x265_picture_init
x265_picture_alloc
x265_picture_free
-x265_alloc_analysis_data
-x265_free_analysis_data
x265_param_apply_profile
x265_max_bit_depth
x265_version_str
diff --git a/source/x265.h b/source/x265.h
index e5474b7..4808491 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -88,36 +88,16 @@ typedef struct x265_nal
uint8_t* payload;
} x265_nal;
-/* Stores inter (motion estimation) analysis data for a single frame */
-typedef struct x265_inter_data
-{
- uint32_t zOrder;
- int ref[2];
- int costZero[2];
- int16_t mvx[2];
- int16_t mvy[2];
- uint32_t depth;
- int poc;
- uint32_t cuAddr;
-} x265_inter_data;
-
-/* Stores intra (motion estimation) analysis data for a single frame */
-typedef struct x265_intra_data
-{
- uint8_t* depth;
- uint8_t* modes;
- char* partSizes;
- int* poc;
- uint32_t* cuAddr;
-} x265_intra_data;
-
/* Stores all analysis data for a single frame */
typedef struct x265_analysis_data
{
- x265_inter_data* interData;
- x265_intra_data* intraData;
+ uint32_t frameRecordSize;
+ int32_t poc;
+ int32_t sliceType;
uint32_t numCUsInFrame;
uint32_t numPartitions;
+ void* interData;
+ void* intraData;
} x265_analysis_data;
/* Used to pass pictures into the encoder, and to get picture data back out of
@@ -241,8 +221,9 @@ typedef enum
#define X265_LOG_ERROR 0
#define X265_LOG_WARNING 1
#define X265_LOG_INFO 2
-#define X265_LOG_DEBUG 3
-#define X265_LOG_FULL 4
+#define X265_LOG_FRAME 3
+#define X265_LOG_DEBUG 4
+#define X265_LOG_FULL 5
#define X265_B_ADAPT_NONE 0
#define X265_B_ADAPT_FAST 1
@@ -291,7 +272,7 @@ typedef enum
#define X265_ANALYSIS_SAVE 1
#define X265_ANALYSIS_LOAD 2
-typedef struct
+typedef struct x265_cli_csp
{
int planes;
int width[3];
@@ -349,6 +330,16 @@ static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11",
static const char * const x265_interlace_names[] = { "prog", "tff", "bff", 0 };
static const char * const x265_analysis_names[] = { "off", "save", "load", 0 };
+/* Zones: override ratecontrol for specific sections of the video.
+ * If zones overlap, whichever comes later in the list takes precedence. */
+typedef struct x265_zone
+{
+ int startFrame, endFrame; /* range of frame numbers */
+ int bForceQp; /* whether to use qp vs bitrate factor */
+ int qp;
+ float bitrateFactor;
+} x265_zone;
+
/* x265 input parameters
*
* For version safety you may use x265_param_alloc/free() to manage the
@@ -415,11 +406,12 @@ typedef struct x265_param
/* Enable the measurement and reporting of SSIM. Default is disabled */
int bEnableSsim;
- /* filename of CSV log. If logLevel is X265_LOG_DEBUG, the encoder will emit
- * per-slice statistics to this log file in encode order. Otherwise the
- * encoder will emit per-stream statistics into the log file when
- * x265_encoder_log is called (presumably at the end of the encode) */
- const char *csvfn;
+ /* filename of CSV log. If logLevel greater than or equal to X265_LOG_FRAME,
+ * the encoder will emit per-slice statistics to this log file in encode
+ * order. Otherwise the encoder will emit per-stream statistics into the log
+ * file when x265_encoder_log is called (presumably at the end of the
+ * encode) */
+ char* csvfn;
/* Enable the generation of SEI messages for each encoded frame containing
* the hashes of the three reconstructed picture planes. Most decoders will
@@ -504,13 +496,13 @@ typedef struct x265_param
/* The additional depth the residual quadtree is allowed to recurse beyond
* the coding quadtree, for inter coded blocks. This must be between 1 and
- * 3. The higher the value the more efficiently the residual can be
+ * 4. The higher the value the more efficiently the residual can be
* compressed by the DCT transforms, at the expense of much more compute */
uint32_t tuQTMaxInterDepth;
/* The additional depth the residual quadtree is allowed to recurse beyond
* the coding quadtree, for intra coded blocks. This must be between 1 and
- * 3. The higher the value the more efficiently the residual can be
+ * 4. The higher the value the more efficiently the residual can be
* compressed by the DCT transforms, at the expense of much more compute */
uint32_t tuQTMaxIntraDepth;
@@ -664,7 +656,7 @@ typedef struct x265_param
/* Enable the use of `coded block flags` (flags set to true when a residual
* has been coded for a given block) to avoid intra analysis in likely skip
- * blocks. Default is disabled */
+ * blocks. Only applicable in RD levels 5 and 6. Default is disabled */
int bEnableCbfFastMode;
/* Enable early skip decisions to avoid intra and inter analysis in likely
@@ -678,7 +670,7 @@ typedef struct x265_param
* Default is 0 */
int rdPenalty;
- /* A value betwen X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines
+ /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines
* the level of rate distortion optimizations to perform during mode
* decisions and quantization. The more RDO the better the compression
* efficiency at a major cost of performance. Default is no RDO (0) */
@@ -687,7 +679,7 @@ typedef struct x265_param
/* Psycho-visual rate-distortion strength. Only has an effect in presets
* which use RDO. It makes mode decision favor options which preserve the
* energy of the source, at the cost of lost compression. The value must
- * be between 0 and 2.0, 1.0 is typical. Default 0.0 */
+ * be between 0 and 2.0, 1.0 is typical. Default 1.0 */
double psyRd;
/* Quantization scaling lists. HEVC supports 6 quantization scaling lists to
@@ -704,7 +696,7 @@ typedef struct x265_param
/* Strength of psycho-visual optimizations in quantization. Only has an
* effect in presets which use RDOQ (rd-levels 4 and 5). The value must be
- * between 0 and 50, 1.0 is typical. Default 0.0 */
+ * between 0 and 50, 1.0 is typical. Default 1.0 */
double psyRdoq;
/* If X265_ANALYSIS_SAVE, write per-frame analysis information into analysis
@@ -713,8 +705,10 @@ typedef struct x265_param
* the encoder must perform. Default X265_ANALYSIS_OFF */
int analysisMode;
- /*== Coding tools ==*/
+ /* Filename for analysisMode save/load. Default name is "x265_analysis.dat" */
+ char* analysisFileName;
+ /*== Coding tools ==*/
/* Enable the implicit signaling of the sign bit of the last coefficient of
* each transform unit. This saves one bit per TU at the expense of figuring
* out which coefficient can be toggled with the least distortion.
@@ -735,9 +729,17 @@ typedef struct x265_param
/* Enable the deblocking loop filter, which improves visual quality by
* reducing blocking effects at block edges, particularly at lower bitrates
* or higher QP. When enabled it adds another CU row of reference lag,
- * reducing frame parallelism effectiveness. Default is enabled */
+ * reducing frame parallelism effectiveness. Default is enabled */
int bEnableLoopFilter;
+ /* deblocking filter tC offset [-6, 6] -6 light filter, 6 strong.
+ * This is the coded div2 value, actual offset is doubled at use */
+ int deblockingFilterTCOffset;
+
+ /* deblocking filter Beta offset [-6, 6] -6 light filter, 6 strong
+ * This is the coded div2 value, actual offset is doubled at use */
+ int deblockingFilterBetaOffset;
+
/* Enable the Sample Adaptive Offset loop filter, which reduces distortion
* effects by adjusting reconstructed sample values based on histogram
* analysis to better approximate the original samples. When enabled it adds
@@ -769,9 +771,13 @@ typedef struct x265_param
* regardless of this setting. */
int bIntraInBFrames;
- /* An integer value in range of 100 to 1000, which denotes strength of noise
- * reduction */
- int noiseReduction;
+ /* An integer value in range of 0 to 2000, which denotes strength of noise
+ * reduction in intra CUs. 0 means disabled */
+ int noiseReductionIntra;
+
+ /* An integer value in range of 0 to 2000, which denotes strength of noise
+ * reduction in inter CUs. 0 means disabled */
+ int noiseReductionInter;
/* The lossless flag enables true lossless coding, by bypassing scaling,
* transform, quantization and in-loop filter processes. This is used for
@@ -801,11 +807,6 @@ typedef struct x265_param
* bitrate is specified on the command line, ABR is implied. Default 0 */
int bitrate;
- /* The degree of rate fluctuation that x265 tolerates. Rate tolerance is used
- * alongwith overflow (difference between actual and target bitrate), to adjust
- * qp. Default is 1.0 */
- double rateTolerance;
-
/* qComp sets the quantizer curve compression factor. It weights the frame
* quantizer based on the complexity of residual (measured by lookahead).
* Default value is 0.6. Increasing it to 1 will effectively generate CQP */
@@ -824,12 +825,12 @@ typedef struct x265_param
double rfConstant;
/* Enable adaptive quantization. This mode distributes available bits between all
- * macroblocks of a frame, assigning more bits to low complexity areas. Turning
+ * CTUs of a frame, assigning more bits to low complexity areas. Turning
* this ON will usually affect PSNR negatively, however SSIM and visual quality
- * generally improves. Default: X265_AQ_AUTO_VARIANCE */
+ * generally improves. Default: X265_AQ_VARIANCE */
int aqMode;
- /* Sets the strength of AQ bias towards low detail macroblocks. Valid only if
+ /* Sets the strength of AQ bias towards low detail CTUs. Valid only if
* AQ is enabled. Default value: 1.0. Acceptable values between 0.0 and 3.0 */
double aqStrength;
@@ -856,14 +857,15 @@ typedef struct x265_param
/* In CRF mode, minimum CRF as caused by VBV */
double rfConstantMin;
- /* Two pass (INCOMPLETE) */
+ /* Multi-pass encoding */
/* Enable writing the stats in a multipass encode to the stat output file */
int bStatWrite;
/* Enable loading data from the stat input file in a multi pass encode */
int bStatRead;
- /* Filename of the 2pass output/input stats file */
+ /* Filename of the 2pass output/input stats file, if unspecified the
+ * encoder will default to using x265_2pass.log */
char* statFileName;
/* temporally blur quants */
@@ -874,6 +876,10 @@ typedef struct x265_param
/* Enable slow and a more detailed first pass encode in multi pass rate control */
int bEnableSlowFirstPass;
+
+ /* ratecontrol overrides */
+ int zoneCount;
+ x265_zone* zones;
/* specify a text file which contains MAX_MAX_QP + 1 floating point
* values to be copied into x265_lambda_tab and a second set of
@@ -882,6 +888,10 @@ typedef struct x265_param
* ignored. The lambda tables are process-global, so these new lambda
* values will affect all encoders in the same process */
const char* lambdaFileName;
+
+ /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise
+ quality to maintain bitrate adherence */
+ int bStrictCbr;
} rc;
/*== Video Usability Information ==*/
@@ -991,7 +1001,7 @@ void x265_setup_primitives(x265_param *param, int cpu);
* special in any way, but using this method together with x265_param_free()
* and x265_param_parse() to set values by name allows the application to treat
* x265_param as an opaque data struct for version safety */
-x265_param *x265_param_alloc();
+x265_param *x265_param_alloc(void);
/* x265_param_free:
* Use x265_param_free() to release storage for an x265_param instance
@@ -1039,7 +1049,7 @@ static const char * const x265_preset_names[] = { "ultrafast", "superfast", "ver
* 100 times faster than placebo!
*
* Currently available tunings are: */
-static const char * const x265_tune_names[] = { "psnr", "ssim", "zerolatency", "fastdecode", 0 };
+static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", 0 };
/* returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
@@ -1049,22 +1059,12 @@ int x265_param_default_preset(x265_param *, const char *preset, const char *tune
* special in any way, but using this method together with x265_picture_free()
* and x265_picture_init() allows some version safety. New picture fields will
* always be added to the end of x265_picture */
-x265_picture *x265_picture_alloc();
+x265_picture *x265_picture_alloc(void);
/* x265_picture_free:
* Use x265_picture_free() to release storage for an x265_picture instance
* allocated by x265_picture_alloc() */
void x265_picture_free(x265_picture *);
-
-/* x265_alloc_analysis_data:
- * Allocate memory to hold analysis data, returns 0 on success else negative */
-int x265_alloc_analysis_data(x265_picture*);
-
-/* x265_free_analysis_data:
- * Use x265_free_analysis_data to release storage of members allocated by
- * x265_alloc_analysis_data */
-void x265_free_analysis_data(x265_picture*);
-
/***
* Initialize an x265_picture structure to default values. It sets the pixel
* depth and color space to the encoder's internal values and sets the slice
diff --git a/source/x265cli.h b/source/x265cli.h
new file mode 100644
index 0000000..dde834c
--- /dev/null
+++ b/source/x265cli.h
@@ -0,0 +1,402 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve at borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265CLI_H
+#define X265CLI_H 1
+
+#include <getopt.h>
+
+#ifdef __cplusplus
+namespace x265 {
+#endif
+
+static const char short_options[] = "o:p:f:F:r:I:i:b:s:t:q:m:hwV?";
+static const struct option long_options[] =
+{
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, 'V' },
+ { "asm", required_argument, NULL, 0 },
+ { "no-asm", no_argument, NULL, 0 },
+ { "threads", required_argument, NULL, 0 },
+ { "preset", required_argument, NULL, 'p' },
+ { "tune", required_argument, NULL, 't' },
+ { "frame-threads", required_argument, NULL, 'F' },
+ { "no-pmode", no_argument, NULL, 0 },
+ { "pmode", no_argument, NULL, 0 },
+ { "no-pme", no_argument, NULL, 0 },
+ { "pme", no_argument, NULL, 0 },
+ { "log-level", required_argument, NULL, 0 },
+ { "profile", required_argument, NULL, 0 },
+ { "level-idc", required_argument, NULL, 0 },
+ { "high-tier", no_argument, NULL, 0 },
+ { "no-high-tier", no_argument, NULL, 0 },
+ { "csv", required_argument, NULL, 0 },
+ { "no-cu-stats", no_argument, NULL, 0 },
+ { "cu-stats", no_argument, NULL, 0 },
+ { "y4m", no_argument, NULL, 0 },
+ { "no-progress", no_argument, NULL, 0 },
+ { "output", required_argument, NULL, 'o' },
+ { "input", required_argument, NULL, 0 },
+ { "input-depth", required_argument, NULL, 0 },
+ { "input-res", required_argument, NULL, 0 },
+ { "input-csp", required_argument, NULL, 0 },
+ { "interlace", required_argument, NULL, 0 },
+ { "no-interlace", no_argument, NULL, 0 },
+ { "fps", required_argument, NULL, 0 },
+ { "seek", required_argument, NULL, 0 },
+ { "frame-skip", required_argument, NULL, 0 },
+ { "frames", required_argument, NULL, 'f' },
+ { "recon", required_argument, NULL, 'r' },
+ { "recon-depth", required_argument, NULL, 0 },
+ { "no-wpp", no_argument, NULL, 0 },
+ { "wpp", no_argument, NULL, 0 },
+ { "ctu", required_argument, NULL, 's' },
+ { "tu-intra-depth", required_argument, NULL, 0 },
+ { "tu-inter-depth", required_argument, NULL, 0 },
+ { "me", required_argument, NULL, 0 },
+ { "subme", required_argument, NULL, 'm' },
+ { "merange", required_argument, NULL, 0 },
+ { "max-merge", required_argument, NULL, 0 },
+ { "no-temporal-mvp", no_argument, NULL, 0 },
+ { "temporal-mvp", no_argument, NULL, 0 },
+ { "rdpenalty", required_argument, NULL, 0 },
+ { "no-rect", no_argument, NULL, 0 },
+ { "rect", no_argument, NULL, 0 },
+ { "no-amp", no_argument, NULL, 0 },
+ { "amp", no_argument, NULL, 0 },
+ { "no-early-skip", no_argument, NULL, 0 },
+ { "early-skip", no_argument, NULL, 0 },
+ { "no-fast-cbf", no_argument, NULL, 0 },
+ { "fast-cbf", no_argument, NULL, 0 },
+ { "no-tskip", no_argument, NULL, 0 },
+ { "tskip", no_argument, NULL, 0 },
+ { "no-tskip-fast", no_argument, NULL, 0 },
+ { "tskip-fast", no_argument, NULL, 0 },
+ { "cu-lossless", no_argument, NULL, 0 },
+ { "no-cu-lossless", no_argument, NULL, 0 },
+ { "no-constrained-intra", no_argument, NULL, 0 },
+ { "constrained-intra", no_argument, NULL, 0 },
+ { "fast-intra", no_argument, NULL, 0 },
+ { "no-fast-intra", no_argument, NULL, 0 },
+ { "no-open-gop", no_argument, NULL, 0 },
+ { "open-gop", no_argument, NULL, 0 },
+ { "keyint", required_argument, NULL, 'I' },
+ { "min-keyint", required_argument, NULL, 'i' },
+ { "scenecut", required_argument, NULL, 0 },
+ { "no-scenecut", no_argument, NULL, 0 },
+ { "rc-lookahead", required_argument, NULL, 0 },
+ { "bframes", required_argument, NULL, 'b' },
+ { "bframe-bias", required_argument, NULL, 0 },
+ { "b-adapt", required_argument, NULL, 0 },
+ { "no-b-adapt", no_argument, NULL, 0 },
+ { "no-b-pyramid", no_argument, NULL, 0 },
+ { "b-pyramid", no_argument, NULL, 0 },
+ { "ref", required_argument, NULL, 0 },
+ { "no-weightp", no_argument, NULL, 0 },
+ { "weightp", no_argument, NULL, 'w' },
+ { "no-weightb", no_argument, NULL, 0 },
+ { "weightb", no_argument, NULL, 0 },
+ { "crf", required_argument, NULL, 0 },
+ { "crf-max", required_argument, NULL, 0 },
+ { "crf-min", required_argument, NULL, 0 },
+ { "vbv-maxrate", required_argument, NULL, 0 },
+ { "vbv-bufsize", required_argument, NULL, 0 },
+ { "vbv-init", required_argument, NULL, 0 },
+ { "bitrate", required_argument, NULL, 0 },
+ { "qp", required_argument, NULL, 'q' },
+ { "aq-mode", required_argument, NULL, 0 },
+ { "aq-strength", required_argument, NULL, 0 },
+ { "ipratio", required_argument, NULL, 0 },
+ { "pbratio", required_argument, NULL, 0 },
+ { "qcomp", required_argument, NULL, 0 },
+ { "qpstep", required_argument, NULL, 0 },
+ { "ratetol", required_argument, NULL, 0 },
+ { "cplxblur", required_argument, NULL, 0 },
+ { "qblur", required_argument, NULL, 0 },
+ { "cbqpoffs", required_argument, NULL, 0 },
+ { "crqpoffs", required_argument, NULL, 0 },
+ { "rd", required_argument, NULL, 0 },
+ { "psy-rd", required_argument, NULL, 0 },
+ { "psy-rdoq", required_argument, NULL, 0 },
+ { "no-psy-rd", no_argument, NULL, 0 },
+ { "no-psy-rdoq", no_argument, NULL, 0 },
+ { "scaling-list", required_argument, NULL, 0 },
+ { "lossless", no_argument, NULL, 0 },
+ { "no-lossless", no_argument, NULL, 0 },
+ { "no-signhide", no_argument, NULL, 0 },
+ { "signhide", no_argument, NULL, 0 },
+ { "no-lft", no_argument, NULL, 0 }, /* DEPRECATED */
+ { "lft", no_argument, NULL, 0 }, /* DEPRECATED */
+ { "no-deblock", no_argument, NULL, 0 },
+ { "deblock", required_argument, NULL, 0 },
+ { "no-sao", no_argument, NULL, 0 },
+ { "sao", no_argument, NULL, 0 },
+ { "no-sao-non-deblock", no_argument, NULL, 0 },
+ { "sao-non-deblock", no_argument, NULL, 0 },
+ { "no-ssim", no_argument, NULL, 0 },
+ { "ssim", no_argument, NULL, 0 },
+ { "no-psnr", no_argument, NULL, 0 },
+ { "psnr", no_argument, NULL, 0 },
+ { "hash", required_argument, NULL, 0 },
+ { "no-strong-intra-smoothing", no_argument, NULL, 0 },
+ { "strong-intra-smoothing", no_argument, NULL, 0 },
+ { "no-cutree", no_argument, NULL, 0 },
+ { "cutree", no_argument, NULL, 0 },
+ { "no-hrd", no_argument, NULL, 0 },
+ { "hrd", no_argument, NULL, 0 },
+ { "sar", required_argument, NULL, 0 },
+ { "overscan", required_argument, NULL, 0 },
+ { "videoformat", required_argument, NULL, 0 },
+ { "range", required_argument, NULL, 0 },
+ { "colorprim", required_argument, NULL, 0 },
+ { "transfer", required_argument, NULL, 0 },
+ { "colormatrix", required_argument, NULL, 0 },
+ { "chromaloc", required_argument, NULL, 0 },
+ { "crop-rect", required_argument, NULL, 0 },
+ { "no-dither", no_argument, NULL, 0 },
+ { "dither", no_argument, NULL, 0 },
+ { "no-repeat-headers", no_argument, NULL, 0 },
+ { "repeat-headers", no_argument, NULL, 0 },
+ { "aud", no_argument, NULL, 0 },
+ { "no-aud", no_argument, NULL, 0 },
+ { "info", no_argument, NULL, 0 },
+ { "no-info", no_argument, NULL, 0 },
+ { "zones", required_argument, NULL, 0 },
+ { "qpfile", required_argument, NULL, 0 },
+ { "lambda-file", required_argument, NULL, 0 },
+ { "b-intra", no_argument, NULL, 0 },
+ { "no-b-intra", no_argument, NULL, 0 },
+ { "nr-intra", required_argument, NULL, 0 },
+ { "nr-inter", required_argument, NULL, 0 },
+ { "stats", required_argument, NULL, 0 },
+ { "pass", required_argument, NULL, 0 },
+ { "slow-firstpass", no_argument, NULL, 0 },
+ { "no-slow-firstpass", no_argument, NULL, 0 },
+ { "analysis-mode", required_argument, NULL, 0 },
+ { "analysis-file", required_argument, NULL, 0 },
+ { "strict-cbr", no_argument, NULL, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 }
+};
+
+static void printVersion(x265_param *param)
+{
+ x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str);
+ x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str);
+}
+
+static void showHelp(x265_param *param)
+{
+ int level = param->logLevel;
+ x265_param_default(param);
+ printVersion(param);
+
+#define OPT(value) (value ? "enabled" : "disabled")
+#define H0 printf
+#define H1 if (level >= X265_LOG_DEBUG) printf
+
+ H0("\nSyntax: x265 [options] infile [-o] outfile\n");
+ H0(" infile can be YUV or Y4M\n");
+ H0(" outfile is raw HEVC bitstream\n");
+ H0("\nExecutable Options:\n");
+ H0("-h/--help Show this help text and exit\n");
+ H0("-V/--version Show version info and exit\n");
+ H0("\nOutput Options:\n");
+ H0("-o/--output <filename> Bitstream output file name\n");
+ H0(" --log-level <string> Logging level: none error warning info debug full. Default %s\n", x265::logLevelNames[param->logLevel + 1]);
+ H0(" --no-progress Disable CLI progress reports\n");
+ H0(" --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats));
+ H1(" --csv <filename> Comma separated log file, log level >= 3 frame log, else one line per run\n");
+ H0("\nInput Options:\n");
+ H0(" --input <filename> Raw YUV or Y4M input file name. `-` for stdin\n");
+ H1(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
+ H0(" --fps <float|rational> Source frame rate (float or num/denom), auto-detected if Y4M\n");
+ H0(" --input-res WxH Source picture size [w x h], auto-detected if Y4M\n");
+ H1(" --input-depth <integer> Bit-depth of input file. Default 8\n");
+ H1(" --input-csp <string> Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
+ H0("-f/--frames <integer> Maximum number of frames to encode. Default all\n");
+ H0(" --seek <integer> First frame to encode\n");
+ H1(" --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive\n");
+ H1(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+ H0("\nQuality reporting metrics:\n");
+ H0(" --[no-]ssim Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
+ H0(" --[no-]psnr Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
+ H0("\nProfile, Level, Tier:\n");
+ H0(" --profile <string> Enforce an encode profile: main, main10, mainstillpicture\n");
+ H0(" --level-idc <integer|float> Force a minimum required decoder level (as '5.0' or '50')\n");
+ H0(" --[no-]high-tier If a decoder level is specified, this modifier selects High tier of that level\n");
+ H0("\nThreading, performance:\n");
+ H0(" --threads <integer> Number of threads for thread pool (0: detect CPU core count, default)\n");
+ H0("-F/--frame-threads <integer> Number of concurrently encoded frames. 0: auto-determined by core count\n");
+ H0(" --[no-]wpp Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
+ H0(" --[no-]pmode Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
+ H0(" --[no-]pme Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
+ H0(" --[no-]asm <bool|int|string> Override CPU detection. Default: auto\n");
+ H0("\nPresets:\n");
+ H0("-p/--preset <string> Trade off performance for compression efficiency. Default medium\n");
+ H0(" ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
+ H0("-t/--tune <string> Tune the settings for a particular type of source or situation:\n");
+ H0(" psnr, ssim, grain, zerolatency, fastdecode\n");
+ H0("\nQuad-Tree size and depth:\n");
+ H0("-s/--ctu <64|32|16> Maximum CU size (WxH). Default %d\n", param->maxCUSize);
+ H0(" --tu-intra-depth <integer> Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
+ H0(" --tu-inter-depth <integer> Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
+ H0("\nAnalysis:\n");
+ H0(" --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
+ H0(" --[no-]psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+ H0(" --[no-]psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
+ H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
+ H1(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
+ H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+ H1(" --nr-intra <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
+ H1(" --nr-inter <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
+ H0("\nCoding tools:\n");
+ H0("-w/--[no-]weightp Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
+ H0(" --[no-]weightb Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
+ H0(" --[no-]cu-lossless Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
+ H0(" --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
+ H1(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
+ H0("\nTemporal / motion search options:\n");
+ H0(" --me <string> Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
+ H0("-m/--subme <integer> Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
+ H0(" --merange <integer> Motion search range. Default %d\n", param->searchRange);
+ H0(" --max-merge <1..5> Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
+ H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
+ H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+ H1(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+ H0("\nSpatial / intra options:\n");
+ H0(" --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
+ H0(" --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
+ H0(" --[no-]b-intra Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames));
+ H0(" --[no-]fast-intra Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra));
+ H0(" --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
+ H0("\nSlice decision options:\n");
+ H0(" --[no-]open-gop Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
+ H0("-I/--keyint <integer> Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
+ H0("-i/--min-keyint <integer> Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
+ H0(" --no-scenecut Disable adaptive I-frame decision\n");
+ H0(" --scenecut <integer> How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+ H0(" --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
+ H0(" --bframes <integer> Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
+ H1(" --bframe-bias <integer> Bias towards B frame decisions. Default %d\n", param->bFrameBias);
+ H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
+ H0(" --[no-]b-pyramid Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
+ H0(" --ref <integer> max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
+ H1(" --zones <zone0>/<zone1>/... Tweak the bitrate of regions of the video\n");
+ H1(" Each zone is of the form\n");
+ H1(" <start frame>,<end frame>,<option>\n");
+ H1(" where <option> is either\n");
+ H1(" q=<integer> (force QP)\n");
+ H1(" or b=<float> (bitrate multiplier)\n");
+ H1(" --qpfile <string> Force frametypes and QPs for some or all frames\n");
+ H1(" Format of each line: framenumber frametype QP\n");
+ H1(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
+ H1(" QPs are restricted by qpmin/qpmax.\n");
+ H0("\nRate control, Adaptive Quantization:\n");
+ H0(" --bitrate <integer> Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
+ H1("-q/--qp <integer> QP for P slices in CQP mode (implied). --ipratio and --pbratio determine other slice QPs\n");
+ H0(" --crf <float> Quality-based VBR (0-51). Default %.1f\n", param->rc.rfConstant);
+ H1(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
+ H1(" --crf-max <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
+ H1(" May cause VBV underflows!\n");
+ H1(" --crf-min <float> With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
+ H1(" this specifies a minimum rate factor value for encode!\n");
+ H0(" --vbv-maxrate <integer> Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
+ H0(" --vbv-bufsize <integer> Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
+ H0(" --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
+ H0(" --pass Multi pass rate control.\n"
+ " - 1 : First pass, creates stats file\n"
+ " - 2 : Last pass, does not overwrite stats file\n"
+ " - 3 : Nth pass, overwrites stats file\n");
+ H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
+ H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
+ H0(" --[no-]strict-cbr Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr));
+ H0(" --analysis-mode <string|int> save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
+ H0(" --analysis-file <filename> Specify file name used for either dumping or reading analysis data.\n");
+ H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
+ H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
+ H0(" --[no-]cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
+ H1(" --ipratio <float> QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
+ H1(" --pbratio <float> QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
+ H1(" --qcomp <float> Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
+ H1(" --cbqpoffs <integer> Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
+ H1(" --crqpoffs <integer> Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
+ H1(" --scaling-list <string> Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
+ H1(" --lambda-file <string> Specify a file containing replacement values for the lambda tables\n");
+ H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
+ H1(" Blank lines and lines starting with hash(#) are ignored\n");
+ H1(" Comma is considered to be white-space\n");
+ H0("\nLoop filters (deblock and SAO):\n");
+ H0(" --[no-]deblock Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
+ H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
+ H1(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
+ H0("\nVUI options:\n");
+ H0(" --sar <width:height|int> Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
+ H0(" Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
+ H0(" 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
+ H0(" 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
+ H1(" --crop-rect <string> Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
+ H1(" --overscan <string> Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
+ H0(" --videoformat <string> Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
+ H0(" --range <string> Specify black level and range of luma and chroma signals as full or limited Default limited\n");
+ H0(" --colorprim <string> Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n");
+ H0(" smpte240m, film, bt2020. Default undef\n");
+ H0(" --transfer <string> Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
+ H0(" smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
+ H0(" bt2020-10, bt2020-12. Default undef\n");
+ H1(" --colormatrix <string> Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
+ H1(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
+ H1(" --chromaloc <integer> Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
+ H0("\nBitstream options:\n");
+ H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
+ H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
+ H0(" --[no-]hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
+ H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
+ H1(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
+ H1("\nReconstructed video options (debugging):\n");
+ H1("-r/--recon <filename> Reconstructed raw image YUV or Y4M output file name\n");
+ H1(" --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
+ H1("\nExecutable return codes:\n");
+ H1(" 0 - encode successful\n");
+ H1(" 1 - unable to parse command line\n");
+ H1(" 2 - unable to open encoder\n");
+ H1(" 3 - unable to generate stream headers\n");
+ H1(" 4 - encoder abort\n");
+#undef OPT
+#undef H0
+#undef H1
+
+ if (level < X265_LOG_DEBUG)
+ printf("\nUse --log-level full --help for a full listing\n");
+ printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
+ exit(1);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--
x265 packaging
More information about the pkg-multimedia-commits
mailing list