[SCM] x264 packaging branch, ubuntu, updated. debian/0.svn20090621+git364d7d-0ubuntu2-10-g4af53c3
siretart at users.alioth.debian.org
siretart at users.alioth.debian.org
Wed Feb 17 23:29:34 UTC 2010
The following commit has been merged in the ubuntu branch:
commit 2c2e34a016dc6c32c1596ceb67d34e924177799d
Author: Reinhard Tartler <siretart at tauware.de>
Date: Sat Feb 13 18:04:43 2010 +0100
Imported Upstream version 0.svn20100213+gitfcf70c
diff --git a/.gitignore b/.gitignore
index 308b793..9d8cb70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@
.depend
config.h
config.mak
+config.log
x264
checkasm
diff --git a/AUTHORS b/AUTHORS
index 289a2a9..a0f3329 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -34,7 +34,7 @@ D: BeOS and MacOS X ports.
S: France
N: Gabriel Bouvigne
-E: gabriel.bouvigne AT joost DOT com
+E: bouvigne AT mp3-tech DOT org
D: 2pass VBV
N: Guillaume Poirier
diff --git a/Makefile b/Makefile
index 594e98b..3ac975d 100644
--- a/Makefile
+++ b/Makefile
@@ -10,9 +10,36 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/quant.c common/vlc.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
- encoder/cavlc.c encoder/encoder.c
+ encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
-SRCCLI = x264.c matroska.c muxers.c
+SRCCLI = x264.c input/yuv.c input/y4m.c output/raw.c \
+ output/matroska.c output/matroska_ebml.c \
+ output/flv.c output/flv_bytestream.c
+
+SRCSO =
+
+CONFIG := $(shell cat config.h)
+
+# Optional muxer module sources
+ifneq ($(findstring AVS_INPUT, $(CONFIG)),)
+SRCCLI += input/avs.c
+endif
+
+ifneq ($(findstring HAVE_PTHREAD, $(CONFIG)),)
+SRCCLI += input/thread.c
+endif
+
+ifneq ($(findstring LAVF_INPUT, $(CONFIG)),)
+SRCCLI += input/lavf.c
+endif
+
+ifneq ($(findstring FFMS_INPUT, $(CONFIG)),)
+SRCCLI += input/ffms.c
+endif
+
+ifneq ($(findstring MP4_OUTPUT, $(CONFIG)),)
+SRCCLI += output/mp4.c
+endif
# Visualization sources
ifeq ($(VIS),yes)
@@ -48,11 +75,20 @@ endif
# AltiVec optims
ifeq ($(ARCH),PPC)
-ALTIVECSRC += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
- common/ppc/quant.c common/ppc/deblock.c \
- common/ppc/predict.c
-SRCS += $(ALTIVECSRC)
-$(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
+SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
+ common/ppc/quant.c common/ppc/deblock.c \
+ common/ppc/predict.c
+endif
+
+# NEON optims
+ifeq ($(ARCH),ARM)
+ifneq ($(AS),)
+ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
+ common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
+ common/arm/predict-a.S
+SRCS += common/arm/mc-c.c common/arm/predict-c.c
+OBJASM = $(ASMSRC:%.S=%.o)
+endif
endif
# VIS optims
@@ -65,8 +101,15 @@ ifneq ($(HAVE_GETOPT_LONG),1)
SRCS += extras/getopt.c
endif
+ifneq ($(SONAME),)
+ifeq ($(SYS),MINGW)
+SRCSO += x264dll.c
+endif
+endif
+
OBJS = $(SRCS:%.c=%.o)
OBJCLI = $(SRCCLI:%.c=%.o)
+OBJSO = $(SRCSO:%.c=%.o)
DEP = depend
.PHONY: all default fprofiled clean distclean install uninstall dox test testclean
@@ -77,23 +120,26 @@ libx264.a: .depend $(OBJS) $(OBJASM)
$(AR) rc libx264.a $(OBJS) $(OBJASM)
$(RANLIB) libx264.a
-$(SONAME): .depend $(OBJS) $(OBJASM)
- $(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
+$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+ $(CC) -shared -o $@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
-x264$(EXE): $(OBJCLI) libx264.a
- $(CC) -o $@ $+ $(LDFLAGS)
+x264$(EXE): $(OBJCLI) libx264.a
+ $(CC) -o $@ $+ $(LDFLAGS) $(LDFLAGSCLI)
checkasm: tools/checkasm.o libx264.a
$(CC) -o $@ $+ $(LDFLAGS)
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
-# delete local/anonymous symbols, so they don't show up in oprofile
- -@ $(STRIP) -x $@
+ -@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
+
+%.o: %.S
+ $(AS) $(ASFLAGS) -o $@ $<
+ -@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
.depend: config.mak
- rm -f .depend
- $(foreach SRC, $(SRCS) $(SRCCLI), $(CC) $(CFLAGS) $(ALTIVECFLAGS) $(SRC) -MT $(SRC:%.c=%.o) -MM -g0 1>> .depend;)
+ @rm -f .depend
+ @$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) -MT $(SRC:%.c=%.o) -MM -g0 1>> .depend;)
config.mak:
./configure
@@ -105,12 +151,12 @@ endif
SRC2 = $(SRCS) $(SRCCLI)
# These should cover most of the important codepaths
-OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --no-ssim --no-psnr
-OPT1 = --crf 16 -b2 -m3 -r3 --me hex -8 --direct spatial --no-dct-decimate
-OPT2 = --crf 26 -b2 -m5 -r2 --me hex -8 -w --cqm jvt --nr 100
-OPT3 = --crf 18 -b3 -m9 -r5 --me umh -8 -t1 -A all --mixed-refs -w --b-pyramid --direct auto --no-fast-pskip
-OPT4 = --crf 22 -b3 -m7 -r4 --me esa -8 -t2 -A all --mixed-refs --psy-rd 1.0:1.0
-OPT5 = --frames 50 --crf 24 -b3 -m9 -r3 --me tesa -8 -t1 --mixed-refs
+OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
+OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50
+OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
+OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
+OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
+OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac
@@ -125,7 +171,7 @@ fprofiled:
mv config.mak config.mak2
sed -e 's/CFLAGS.*/& -fprofile-generate/; s/LDFLAGS.*/& -fprofile-generate/' config.mak2 > config.mak
$(MAKE) x264$(EXE)
- $(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) $(V) --progress -o $(DEVNULL) ;))
+ $(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
rm -f $(SRC2:%.c=%.o)
sed -e 's/CFLAGS.*/& -fprofile-use/; s/LDFLAGS.*/& -fprofile-use/' config.mak2 > config.mak
$(MAKE)
@@ -134,13 +180,13 @@ fprofiled:
endif
clean:
- rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
+ rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
distclean: clean
- rm -f config.mak config.h x264.pc
+ rm -f config.mak config.h config.log x264.pc
rm -rf test/
install: x264$(EXE) $(SONAME)
diff --git a/build/win32/libx264.vcproj b/build/win32/libx264.vcproj
deleted file mode 100644
index 8497a35..0000000
--- a/build/win32/libx264.vcproj
+++ /dev/null
@@ -1,995 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="7.10"
- Name="libx264"
- ProjectGUID="{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}"
- SccProjectName=""
- SccLocalPath="">
- <Platforms>
- <Platform
- Name="Win32"/>
- </Platforms>
- <Configurations>
- <Configuration
- Name="Release|Win32"
- OutputDirectory=".\Release"
- IntermediateDirectory=".\Release"
- ConfigurationType="4"
- UseOfMFC="0"
- ATLMinimizesCRunTimeLibraryUsage="FALSE"
- CharacterSet="2">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- InlineFunctionExpansion="2"
- OptimizeForProcessor="2"
- AdditionalIncludeDirectories="../../common;../../extras;../.."
- PreprocessorDefinitions="NDEBUG;_LIB;WIN32;__X264__;HAVE_MMX;ARCH_X86;HAVE_STDINT_H"
- StringPooling="TRUE"
- RuntimeLibrary="4"
- EnableFunctionLevelLinking="TRUE"
- PrecompiledHeaderFile=".\Release/libx264.pch"
- AssemblerListingLocation=".\Release/"
- ObjectFile=".\Release/"
- ProgramDataBaseFileName=".\Release/"
- WarningLevel="3"
- SuppressStartupBanner="TRUE"
- CompileAs="0"/>
- <Tool
- Name="VCCustomBuildTool"/>
- <Tool
- Name="VCLibrarianTool"
- OutputFile="bin/libx264.lib"
- SuppressStartupBanner="TRUE"/>
- <Tool
- Name="VCMIDLTool"/>
- <Tool
- Name="VCPostBuildEventTool"/>
- <Tool
- Name="VCPreBuildEventTool"/>
- <Tool
- Name="VCPreLinkEventTool"/>
- <Tool
- Name="VCResourceCompilerTool"
- PreprocessorDefinitions="NDEBUG"
- Culture="2052"/>
- <Tool
- Name="VCWebServiceProxyGeneratorTool"/>
- <Tool
- Name="VCXMLDataGeneratorTool"/>
- <Tool
- Name="VCManagedWrapperGeneratorTool"/>
- <Tool
- Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
- </Configuration>
- <Configuration
- Name="Debug|Win32"
- OutputDirectory=".\Debug"
- IntermediateDirectory=".\Debug"
- ConfigurationType="4"
- UseOfMFC="0"
- ATLMinimizesCRunTimeLibraryUsage="FALSE"
- CharacterSet="2">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories="../../common;../../extras;../.."
- PreprocessorDefinitions="_DEBUG;_LIB;WIN32;__X264__;HAVE_MMX;ARCH_X86;HAVE_STDINT_H"
- BasicRuntimeChecks="3"
- RuntimeLibrary="5"
- PrecompiledHeaderFile=".\Debug/libx264.pch"
- AssemblerListingLocation=".\Debug/"
- ObjectFile=".\Debug/"
- ProgramDataBaseFileName=".\Debug/"
- WarningLevel="3"
- SuppressStartupBanner="TRUE"
- DebugInformationFormat="3"
- CompileAs="0"/>
- <Tool
- Name="VCCustomBuildTool"/>
- <Tool
- Name="VCLibrarianTool"
- OutputFile="bin/libx264.lib"
- SuppressStartupBanner="TRUE"/>
- <Tool
- Name="VCMIDLTool"/>
- <Tool
- Name="VCPostBuildEventTool"/>
- <Tool
- Name="VCPreBuildEventTool"/>
- <Tool
- Name="VCPreLinkEventTool"/>
- <Tool
- Name="VCResourceCompilerTool"
- PreprocessorDefinitions="_DEBUG"
- Culture="2052"/>
- <Tool
- Name="VCWebServiceProxyGeneratorTool"/>
- <Tool
- Name="VCXMLDataGeneratorTool"/>
- <Tool
- Name="VCManagedWrapperGeneratorTool"/>
- <Tool
- Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <Filter
- Name="Enc"
- Filter=".c">
- <File
- RelativePath="..\..\encoder\analyse.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\cabac.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\cavlc.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\encoder.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\macroblock.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\me.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\ratecontrol.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\encoder\set.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/enc_release/"
- ProgramDataBaseFileName="obj/enc_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/enc_debug/"
- ProgramDataBaseFileName="obj/enc_debug/"/>
- </FileConfiguration>
- </File>
- <Filter
- Name="enc_h"
- Filter=".h">
- <File
- RelativePath="..\..\encoder\analyse.h">
- </File>
- <File
- RelativePath="..\..\encoder\macroblock.h">
- </File>
- <File
- RelativePath="..\..\encoder\me.h">
- </File>
- <File
- RelativePath="..\..\encoder\ratecontrol.h">
- </File>
- <File
- RelativePath="..\..\encoder\set.h">
- </File>
- </Filter>
- </Filter>
- <Filter
- Name="Core"
- Filter=".c;.h;">
- <File
- RelativePath="..\..\common\cabac.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\common.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\cpu.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\dct.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\vlc.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\frame.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\macroblock.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\mc-c.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\mc.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\mdate.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\pixel.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\predict.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\predict-c.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/core_release/"
- ProgramDataBaseFileName="obj/core_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/core_debug/"
- ProgramDataBaseFileName="obj/core_debug/"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\quant.c">
- </File>
- <File
- RelativePath="..\..\common\set.c">
- </File>
- <Filter
- Name="core_h"
- Filter=".h">
- <File
- RelativePath="..\..\common\bs.h">
- </File>
- <File
- RelativePath="..\..\common\cabac.h">
- </File>
- <File
- RelativePath="..\..\common\common.h">
- </File>
- <File
- RelativePath="..\..\common\cpu.h">
- </File>
- <File
- RelativePath="..\..\common\dct.h">
- </File>
- <File
- RelativePath="..\..\common\frame.h">
- </File>
- <File
- RelativePath="..\..\common\macroblock.h">
- </File>
- <File
- RelativePath="..\..\common\mc.h">
- </File>
- <File
- RelativePath="..\..\common\osdep.h">
- </File>
- <File
- RelativePath="..\..\common\pixel.h">
- </File>
- <File
- RelativePath="..\..\common\predict.h">
- </File>
- <File
- RelativePath="..\..\common\quant.h">
- </File>
- <File
- RelativePath="..\..\common\set.h">
- </File>
- <File
- RelativePath="..\..\common\vlc.h">
- </File>
- </Filter>
- <Filter
- Name="x86"
- Filter=".asm">
- <File
- RelativePath="..\..\common\x86\cpu-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\dct-32.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\dct-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\cabac-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\deblock-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\x86inc.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\mc-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\mc-a2.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\pixel-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\pixel-32.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\predict-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\quant-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\common\x86\sad-a.asm">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCustomBuildTool"
- Description="Assembly $(InputPath)"
- CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o "$(IntDir)\$(InputName)".obj "$(InputPath)""
- Outputs="$(IntDir)\$(InputName).obj"/>
- </FileConfiguration>
- </File>
- </Filter>
- </Filter>
- <Filter
- Name="extras"
- Filter=".c">
- <File
- RelativePath="..\..\extras\getopt.c">
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- ObjectFile=".\obj/util_release/"
- ProgramDataBaseFileName="obj/util_release/"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"
- ObjectFile=".\obj/util_debug/"
- ProgramDataBaseFileName="obj/util_debug/"/>
- </FileConfiguration>
- </File>
- <Filter
- Name="extras_h"
- Filter=".h">
- <File
- RelativePath="..\..\extras\getopt.h">
- </File>
- <File
- RelativePath="..\..\extras\stdint.h">
- </File>
- </Filter>
- </Filter>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
diff --git a/build/win32/x264.sln b/build/win32/x264.sln
deleted file mode 100644
index 8fb518d..0000000
--- a/build/win32/x264.sln
+++ /dev/null
@@ -1,30 +0,0 @@
-Microsoft Visual Studio Solution File, Format Version 8.00
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libx264", "libx264.vcproj", "{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}"
- ProjectSection(ProjectDependencies) = postProject
- EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "x264", "x264.vcproj", "{22E1814D-7955-4456-AEA5-0C9BA7500792}"
- ProjectSection(ProjectDependencies) = postProject
- {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6} = {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}
- EndProjectSection
-EndProject
-Global
- GlobalSection(SolutionConfiguration) = preSolution
- Debug = Debug
- Release = Release
- EndGlobalSection
- GlobalSection(ProjectConfiguration) = postSolution
- {22E1814D-7955-4456-AEA5-0C9BA7500792}.Debug.ActiveCfg = Debug|Win32
- {22E1814D-7955-4456-AEA5-0C9BA7500792}.Debug.Build.0 = Debug|Win32
- {22E1814D-7955-4456-AEA5-0C9BA7500792}.Release.ActiveCfg = Release|Win32
- {22E1814D-7955-4456-AEA5-0C9BA7500792}.Release.Build.0 = Release|Win32
- {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Debug.ActiveCfg = Debug|Win32
- {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Debug.Build.0 = Debug|Win32
- {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Release.ActiveCfg = Release|Win32
- {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Release.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(ExtensibilityGlobals) = postSolution
- EndGlobalSection
- GlobalSection(ExtensibilityAddIns) = postSolution
- EndGlobalSection
-EndGlobal
diff --git a/build/win32/x264.vcproj b/build/win32/x264.vcproj
deleted file mode 100644
index c567265..0000000
--- a/build/win32/x264.vcproj
+++ /dev/null
@@ -1,178 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="7.10"
- Name="x264"
- ProjectGUID="{22E1814D-7955-4456-AEA5-0C9BA7500792}"
- SccProjectName=""
- SccLocalPath="">
- <Platforms>
- <Platform
- Name="Win32"/>
- </Platforms>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- OutputDirectory=".\obj/x264_Debug"
- IntermediateDirectory=".\obj/x264_Debug"
- ConfigurationType="1"
- UseOfMFC="0"
- ATLMinimizesCRunTimeLibraryUsage="FALSE"
- CharacterSet="2">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories="../../common;../../extras;../.."
- PreprocessorDefinitions="_DEBUG;_CONSOLE;WIN32;__X264__;HAVE_MMX;HAVE_STDINT_H;AVIS_INPUT"
- BasicRuntimeChecks="3"
- RuntimeLibrary="5"
- UsePrecompiledHeader="2"
- PrecompiledHeaderFile=".\obj/x264_Debug/x264.pch"
- AssemblerListingLocation=".\obj/x264_Debug/"
- ObjectFile=".\obj/x264_Debug/"
- ProgramDataBaseFileName=".\obj/x264_Debug/"
- WarningLevel="3"
- SuppressStartupBanner="TRUE"
- DebugInformationFormat="3"
- CompileAs="0"/>
- <Tool
- Name="VCCustomBuildTool"/>
- <Tool
- Name="VCLinkerTool"
- AdditionalDependencies="vfw32.lib winmm.lib"
- OutputFile="bin/x264.exe"
- LinkIncremental="1"
- SuppressStartupBanner="TRUE"
- GenerateDebugInformation="TRUE"
- ProgramDatabaseFile=".\obj/x264_Debug/x264.pdb"
- SubSystem="1"
- TargetMachine="1"/>
- <Tool
- Name="VCMIDLTool"
- TypeLibraryName=".\obj/x264_Debug/x264.tlb"
- HeaderFileName=""/>
- <Tool
- Name="VCPostBuildEventTool"/>
- <Tool
- Name="VCPreBuildEventTool"/>
- <Tool
- Name="VCPreLinkEventTool"/>
- <Tool
- Name="VCResourceCompilerTool"
- PreprocessorDefinitions="_DEBUG"
- Culture="2052"/>
- <Tool
- Name="VCWebServiceProxyGeneratorTool"/>
- <Tool
- Name="VCXMLDataGeneratorTool"/>
- <Tool
- Name="VCWebDeploymentTool"/>
- <Tool
- Name="VCManagedWrapperGeneratorTool"/>
- <Tool
- Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
- </Configuration>
- <Configuration
- Name="Release|Win32"
- OutputDirectory=".\obj/x264_Release"
- IntermediateDirectory=".\obj/x264_Release"
- ConfigurationType="1"
- UseOfMFC="0"
- ATLMinimizesCRunTimeLibraryUsage="FALSE"
- CharacterSet="2">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="../../common;../../extras;../.."
- PreprocessorDefinitions="NDEBUG;_CONSOLE;WIN32;__X264__;HAVE_MMX;HAVE_STDINT_H;AVIS_INPUT"
- StringPooling="TRUE"
- RuntimeLibrary="0"
- EnableFunctionLevelLinking="TRUE"
- UsePrecompiledHeader="2"
- PrecompiledHeaderFile=".\obj/x264_Release/x264.pch"
- AssemblerListingLocation=".\obj/x264_Release/"
- ObjectFile=".\obj/x264_Release/"
- ProgramDataBaseFileName=".\obj/x264_Release/"
- WarningLevel="3"
- SuppressStartupBanner="TRUE"
- CompileAs="0"/>
- <Tool
- Name="VCCustomBuildTool"/>
- <Tool
- Name="VCLinkerTool"
- AdditionalDependencies="vfw32.lib winmm.lib"
- OutputFile="bin/x264.exe"
- LinkIncremental="1"
- SuppressStartupBanner="TRUE"
- ProgramDatabaseFile=".\obj/x264_Release/x264.pdb"
- SubSystem="1"
- TargetMachine="1"/>
- <Tool
- Name="VCMIDLTool"
- TypeLibraryName=".\obj/x264_Release/x264.tlb"
- HeaderFileName=""/>
- <Tool
- Name="VCPostBuildEventTool"/>
- <Tool
- Name="VCPreBuildEventTool"/>
- <Tool
- Name="VCPreLinkEventTool"/>
- <Tool
- Name="VCResourceCompilerTool"
- PreprocessorDefinitions="NDEBUG"
- Culture="2052"/>
- <Tool
- Name="VCWebServiceProxyGeneratorTool"/>
- <Tool
- Name="VCXMLDataGeneratorTool"/>
- <Tool
- Name="VCWebDeploymentTool"/>
- <Tool
- Name="VCManagedWrapperGeneratorTool"/>
- <Tool
- Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <File
- RelativePath="..\..\matroska.c">
- </File>
- <File
- RelativePath="..\..\matroska.h">
- </File>
- <File
- RelativePath="..\..\muxers.c">
- </File>
- <File
- RelativePath="..\..\muxers.h">
- </File>
- <File
- RelativePath="..\..\x264.c">
- <FileConfiguration
- Name="Debug|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
- BasicRuntimeChecks="3"/>
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32">
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""/>
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\x264.h">
- </File>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
diff --git a/common/arm/asm.S b/common/arm/asm.S
new file mode 100644
index 0000000..d163165
--- /dev/null
+++ b/common/arm/asm.S
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
+
+ .macro require8, val=1
+ELF .eabi_attribute 24, \val
+ .endm
+
+ .macro preserve8, val=1
+ELF .eabi_attribute 25, \val
+ .endm
+
+ .macro function name
+ .global \name
+ELF .hidden \name
+ELF .type \name, %function
+ .func \name
+\name:
+ .endm
+
+ .macro movrel rd, val
+#if defined(HAVE_ARMV6T2) && !defined(PIC)
+ movw \rd, #:lower16:\val
+ movt \rd, #:upper16:\val
+#else
+ ldr \rd, =\val
+#endif
+ .endm
+
+.macro movconst rd, val
+#ifdef HAVE_ARMV6T2
+ movw \rd, #:lower16:\val
+.if \val >> 16
+ movt \rd, #:upper16:\val
+.endif
+#else
+ ldr \rd, =\val
+#endif
+.endm
+
+#define FENC_STRIDE 16
+#define FDEC_STRIDE 32
+
+.macro HORIZ_ADD dest, a, b
+.ifnb \b
+ vadd.u16 \a, \a, \b
+.endif
+ vpaddl.u16 \a, \a
+ vpaddl.u32 \dest, \a
+.endm
+
+.macro SUMSUB_AB sum, diff, a, b
+ vadd.s16 \sum, \a, \b
+ vsub.s16 \diff, \a, \b
+.endm
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+ SUMSUB_AB \s1, \d1, \a, \b
+ SUMSUB_AB \s2, \d2, \c, \d
+.endm
+
+.macro ABS2 a b
+ vabs.s16 \a, \a
+ vabs.s16 \b, \b
+.endm
+
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers
+// s1/2 = source registers
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+ vtrn.16 \s1, \s2
+.else
+ vtrn.32 \s1, \s2
+.endif
+.ifc \op, sumsub
+ SUMSUB_AB \d1, \d2, \s1, \s2
+.else
+ vabs.s16 \s1, \s1
+ vabs.s16 \s2, \s2
+ vmax.s16 \d1, \s1, \s2
+.endif
+.endm
+
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
+ vtrn.32 \r0, \r4
+ vtrn.32 \r1, \r5
+ vtrn.32 \r2, \r6
+ vtrn.32 \r3, \r7
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+.macro TRANSPOSE4x4 r0 r1 r2 r3
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+.macro TRANSPOSE4x4_16 d0 d1 d2 d3
+ vtrn.32 \d0, \d2
+ vtrn.32 \d1, \d3
+ vtrn.16 \d0, \d1
+ vtrn.16 \d2, \d3
+.endm
diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S
new file mode 100644
index 0000000..40eff03
--- /dev/null
+++ b/common/arm/cpu-a.S
@@ -0,0 +1,106 @@
+/*****************************************************************************
+ * cpu-a.S: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.align
+
+// done in gas because .fpu neon overrides the refusal to assemble
+// instructions the selected -march/-mcpu doesn't support
+function x264_cpu_neon_test
+ vadd.i16 q0, q0, q0
+ bx lr
+.endfunc
+
+// return: 0 on success
+// 1 if counters were already enabled
+// 9 if lo-res counters were already enabled
+function x264_cpu_enable_armv7_counter
+ mrc p15, 0, r2, c9, c12, 0 // read PMNC
+ ands r0, r2, #1
+ andne r0, r2, #9
+
+ orr r2, r2, #1 // enable counters
+ bic r2, r2, #8 // full resolution
+ mcreq p15, 0, r2, c9, c12, 0 // write PMNC
+ mov r2, #1 << 31 // enable cycle counter
+ mcr p15, 0, r2, c9, c12, 1 // write CNTENS
+ bx lr
+.endfunc
+
+function x264_cpu_disable_armv7_counter
+ mrc p15, 0, r0, c9, c12, 0 // read PMNC
+ bic r0, r0, #1 // disable counters
+ mcr p15, 0, r0, c9, c12, 0 // write PMNC
+ bx lr
+.endfunc
+
+
+.macro READ_TIME r
+ mrc p15, 0, \r, c9, c13, 0
+.endm
+
+// return: 0 if transfers neon -> arm transfers take more than 10 cycles
+// nonzero otherwise
+function x264_cpu_fast_neon_mrc_test
+ // check for user access to performance counters
+ mrc p15, 0, r0, c9, c14, 0
+ cmp r0, #0
+ bxeq lr
+
+ push {r4-r6,lr}
+ bl x264_cpu_enable_armv7_counter
+ ands r1, r0, #8
+ mov r3, #0
+ mov ip, #4
+ mov r6, #4
+ moveq r5, #1
+ movne r5, #64
+
+average_loop:
+ mov r4, r5
+ READ_TIME r1
+1: subs r4, r4, #1
+.rept 8
+ vmov.u32 lr, d0[0]
+ add lr, lr, lr
+.endr
+ bgt 1b
+ READ_TIME r2
+
+ subs r6, r6, #1
+ sub r2, r2, r1
+ cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles
+ addle r3, r3, r2
+ subles ip, ip, #1
+ bgt average_loop
+
+ // disable counters if we enabled them
+ ands r0, r0, #1
+ bleq x264_cpu_disable_armv7_counter
+
+ lsr r0, r3, #5
+ cmp r0, #10
+ movgt r0, #0
+ pop {r4-r6,pc}
+.endfunc
diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
new file mode 100644
index 0000000..0ed7238
--- /dev/null
+++ b/common/arm/dct-a.S
@@ -0,0 +1,663 @@
+/*****************************************************************************
+ * dct-a.S: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+
+scan4x4_frame:
+.byte 0,1, 8,9, 2,3, 4,5
+.byte 2,3, 8,9, 16,17, 10,11
+.byte 12,13, 6,7, 14,15, 20,21
+.byte 10,11, 12,13, 6,7, 14,15
+
+.text
+
+// sum = a + (b>>shift) sub = (a>>shift) - b
+.macro SUMSUB_SHR shift sum sub a b t0 t1
+ vshr.s16 \t0, \b, #\shift
+ vshr.s16 \t1, \a, #\shift
+ vadd.s16 \sum, \a, \t0
+ vsub.s16 \sub, \t1, \b
+.endm
+
+// sum = (a>>shift) + b sub = a - (b>>shift)
+.macro SUMSUB_SHR2 shift sum sub a b t0 t1
+ vshr.s16 \t0, \a, #\shift
+ vshr.s16 \t1, \b, #\shift
+ vadd.s16 \sum, \t0, \b
+ vsub.s16 \sub, \a, \t1
+.endm
+
+// a += 1.5*ma b -= 1.5*mb
+.macro SUMSUB_15 a b ma mb t0 t1
+ vshr.s16 \t0, \ma, #1
+ vshr.s16 \t1, \mb, #1
+ vadd.s16 \t0, \t0, \ma
+ vadd.s16 \t1, \t1, \mb
+ vadd.s16 \a, \a, \t0
+ vsub.s16 \b, \b, \t1
+.endm
+
+
+function x264_dct4x4dc_neon
+ vld1.64 {d0-d3}, [r0,:128]
+ SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
+ SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
+
+ vmov.s16 d31, #1
+ HADAMARD 1, sumsub, q2, q3, q0, q1
+ vtrn.32 d4, d5
+ vadd.s16 d16, d4, d31
+ vtrn.32 d6, d7
+ vadd.s16 d17, d6, d31
+ vrhadd.s16 d0, d4, d5
+ vhsub.s16 d1, d16, d5
+ vhsub.s16 d2, d17, d7
+ vrhadd.s16 d3, d6, d7
+ vst1.64 {d0-d3}, [r0,:128]
+ bx lr
+.endfunc
+
+function x264_idct4x4dc_neon
+ vld1.64 {d0-d3}, [r0,:128]
+ SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
+ SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
+
+ HADAMARD 1, sumsub, q2, q3, q0, q1
+ HADAMARD 2, sumsub, d0, d1, d4, d5
+ HADAMARD 2, sumsub, d3, d2, d6, d7
+ vst1.64 {d0-d3}, [r0,:128]
+ bx lr
+.endfunc
+
+
+.macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
+ SUMSUB_AB \d1, \d6, \d5, \d6
+ SUMSUB_AB \d3, \d7, \d4, \d7
+ vadd.s16 \d0, \d3, \d1
+ vadd.s16 \d4, \d7, \d7
+ vadd.s16 \d5, \d6, \d6
+ vsub.s16 \d2, \d3, \d1
+ vadd.s16 \d1, \d4, \d6
+ vsub.s16 \d3, \d7, \d5
+.endm
+
+function x264_sub4x4_dct_neon
+ mov r3, #FENC_STRIDE
+ mov ip, #FDEC_STRIDE
+ vld1.32 {d0[]}, [r1,:32], r3
+ vld1.32 {d1[]}, [r2,:32], ip
+ vld1.32 {d2[]}, [r1,:32], r3
+ vsubl.u8 q8, d0, d1
+ vld1.32 {d3[]}, [r2,:32], ip
+ vld1.32 {d4[]}, [r1,:32], r3
+ vsubl.u8 q9, d2, d3
+ vld1.32 {d5[]}, [r2,:32], ip
+ vld1.32 {d6[]}, [r1,:32], r3
+ vsubl.u8 q10, d4, d5
+ vld1.32 {d7[]}, [r2,:32], ip
+ vsubl.u8 q11, d6, d7
+
+ DCT_1D d0, d1, d2, d3, d16, d18, d20, d22
+ TRANSPOSE4x4_16 d0, d1, d2, d3
+ DCT_1D d4, d5, d6, d7, d0, d1, d2, d3
+ vst1.64 {d4-d7}, [r0,:128]
+ bx lr
+.endfunc
+
+function x264_sub8x4_dct_neon
+ vld1.64 {d0}, [r1,:64], r3
+ vld1.64 {d1}, [r2,:64], ip
+ vsubl.u8 q8, d0, d1
+ vld1.64 {d2}, [r1,:64], r3
+ vld1.64 {d3}, [r2,:64], ip
+ vsubl.u8 q9, d2, d3
+ vld1.64 {d4}, [r1,:64], r3
+ vld1.64 {d5}, [r2,:64], ip
+ vsubl.u8 q10, d4, d5
+ vld1.64 {d6}, [r1,:64], r3
+ vld1.64 {d7}, [r2,:64], ip
+ vsubl.u8 q11, d6, d7
+
+ DCT_1D q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSPOSE4x4_16 q0, q1, q2, q3
+
+ SUMSUB_AB q8, q12, q0, q3
+ SUMSUB_AB q9, q10, q1, q2
+ vadd.i16 q13, q12, q12
+ vadd.i16 q11, q10, q10
+ vadd.i16 d0, d16, d18
+ vadd.i16 d1, d26, d20
+ vsub.i16 d2, d16, d18
+ vsub.i16 d3, d24, d22
+ vst1.64 {d0-d1}, [r0,:128]!
+ vadd.i16 d4, d17, d19
+ vadd.i16 d5, d27, d21
+ vst1.64 {d2-d3}, [r0,:128]!
+ vsub.i16 d6, d17, d19
+ vsub.i16 d7, d25, d23
+ vst1.64 {d4-d5}, [r0,:128]!
+ vst1.64 {d6-d7}, [r0,:128]!
+ bx lr
+.endfunc
+
+function x264_sub8x8_dct_neon
+ push {lr}
+ mov r3, #FENC_STRIDE
+ mov ip, #FDEC_STRIDE
+ bl x264_sub8x4_dct_neon
+ pop {lr}
+ b x264_sub8x4_dct_neon
+.endfunc
+
+function x264_sub16x16_dct_neon
+ push {lr}
+ mov r3, #FENC_STRIDE
+ mov ip, #FDEC_STRIDE
+ bl x264_sub8x4_dct_neon
+ bl x264_sub8x4_dct_neon
+ sub r1, r1, #8*FENC_STRIDE-8
+ sub r2, r2, #8*FDEC_STRIDE-8
+ bl x264_sub8x4_dct_neon
+ bl x264_sub8x4_dct_neon
+ sub r1, r1, #8
+ sub r2, r2, #8
+ bl x264_sub8x4_dct_neon
+ bl x264_sub8x4_dct_neon
+ sub r1, r1, #8*FENC_STRIDE-8
+ sub r2, r2, #8*FDEC_STRIDE-8
+ bl x264_sub8x4_dct_neon
+ pop {lr}
+ b x264_sub8x4_dct_neon
+.endfunc
+
+
+.macro DCT8_1D type
+ SUMSUB_AB q2, q1, q11, q12 // s34/d34
+ SUMSUB_AB q3, q11, q10, q13 // s25/d25
+ SUMSUB_AB q13, q10, q9, q14 // s16/d16
+ SUMSUB_AB q14, q8, q8, q15 // s07/d07
+
+ SUMSUB_AB q9, q2, q14, q2 // a0/a2
+ SUMSUB_AB q12, q14, q13, q3 // a1/a3
+
+ SUMSUB_AB q3, q13, q8, q1 // a6/a5
+ vshr.s16 q0, q10, #1
+ vshr.s16 q15, q11, #1
+ vadd.s16 q0, q0, q10
+ vadd.s16 q15, q15, q11
+ vsub.s16 q3, q3, q0
+ vsub.s16 q13, q13, q15
+
+ SUMSUB_AB q0, q15, q10, q11 // a4/a7
+ vshr.s16 q10, q8, #1
+ vshr.s16 q11, q1, #1
+ vadd.s16 q10, q10, q8
+ vadd.s16 q11, q11, q1
+ vadd.s16 q10, q0, q10
+ vadd.s16 q15, q15, q11
+
+ SUMSUB_AB q8, q12, q9, q12
+ SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1
+ SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1
+ SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
+.endm
+
+function x264_sub8x8_dct8_neon
+ mov r3, #FENC_STRIDE
+ mov ip, #FDEC_STRIDE
+ vld1.64 {d16}, [r1,:64], r3
+ vld1.64 {d17}, [r2,:64], ip
+ vsubl.u8 q8, d16, d17
+ vld1.64 {d18}, [r1,:64], r3
+ vld1.64 {d19}, [r2,:64], ip
+ vsubl.u8 q9, d18, d19
+ vld1.64 {d20}, [r1,:64], r3
+ vld1.64 {d21}, [r2,:64], ip
+ vsubl.u8 q10, d20, d21
+ vld1.64 {d22}, [r1,:64], r3
+ vld1.64 {d23}, [r2,:64], ip
+ vsubl.u8 q11, d22, d23
+ vld1.64 {d24}, [r1,:64], r3
+ vld1.64 {d25}, [r2,:64], ip
+ vsubl.u8 q12, d24, d25
+ vld1.64 {d26}, [r1,:64], r3
+ vld1.64 {d27}, [r2,:64], ip
+ vsubl.u8 q13, d26, d27
+ vld1.64 {d28}, [r1,:64], r3
+ vld1.64 {d29}, [r2,:64], ip
+ vsubl.u8 q14, d28, d29
+ vld1.64 {d30}, [r1,:64], r3
+ vld1.64 {d31}, [r2,:64], ip
+ vsubl.u8 q15, d30, d31
+
+ DCT8_1D row
+ vswp d17, d24 // 8, 12
+ vswp d21, d28 // 10,14
+ vtrn.32 q8, q10
+ vtrn.32 q12, q14
+
+ vswp d19, d26 // 9, 13
+ vswp d23, d30 // 11,15
+ vtrn.32 q9, q11
+ vtrn.32 q13, q15
+
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q8, q9
+ vtrn.16 q14, q15
+ DCT8_1D col
+
+ vst1.64 {d16-d19}, [r0,:128]!
+ vst1.64 {d20-d23}, [r0,:128]!
+ vst1.64 {d24-d27}, [r0,:128]!
+ vst1.64 {d28-d31}, [r0,:128]!
+ bx lr
+.endfunc
+
+function x264_sub16x16_dct8_neon
+ push {lr}
+ bl x264_sub8x8_dct8_neon
+ sub r1, r1, #FENC_STRIDE*8 - 8
+ sub r2, r2, #FDEC_STRIDE*8 - 8
+ bl x264_sub8x8_dct8_neon
+ sub r1, r1, #8
+ sub r2, r2, #8
+ bl x264_sub8x8_dct8_neon
+ pop {lr}
+ sub r1, r1, #FENC_STRIDE*8 - 8
+ sub r2, r2, #FDEC_STRIDE*8 - 8
+ b x264_sub8x8_dct8_neon
+.endfunc
+
+
+// First part of IDCT (minus final SUMSUB_BA)
+.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
+ SUMSUB_AB \d4, \d5, \d0, \d2
+ vshr.s16 \d7, \d1, #1
+ vshr.s16 \d6, \d3, #1
+ vsub.s16 \d7, \d7, \d3
+ vadd.s16 \d6, \d6, \d1
+.endm
+
+function x264_add4x4_idct_neon
+ mov r2, #FDEC_STRIDE
+ vld1.64 {d0-d3}, [r1,:128]
+
+ IDCT_1D d4, d5, d6, d7, d0, d1, d2, d3
+ vld1.32 {d30[0]}, [r0,:32], r2
+ SUMSUB_AB q0, q1, q2, q3
+
+ TRANSPOSE4x4_16 d0, d1, d3, d2
+
+ IDCT_1D d4, d5, d6, d7, d0, d1, d3, d2
+ vld1.32 {d30[1]}, [r0,:32], r2
+ SUMSUB_AB q0, q1, q2, q3
+
+ vrshr.s16 q0, q0, #6
+ vld1.32 {d31[1]}, [r0,:32], r2
+ vrshr.s16 q1, q1, #6
+ vld1.32 {d31[0]}, [r0,:32], r2
+
+ sub r0, r0, r2, lsl #2
+ vaddw.u8 q0, q0, d30
+ vaddw.u8 q1, q1, d31
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d2, q1
+
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d2[1]}, [r0,:32], r2
+ vst1.32 {d2[0]}, [r0,:32], r2
+ bx lr
+.endfunc
+
+function x264_add8x4_idct_neon
+ vld1.64 {d0-d3}, [r1,:128]!
+ IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
+ vld1.64 {d4-d7}, [r1,:128]!
+ IDCT_1D d17, d19, d21, d23, d4, d5, d6, d7
+ SUMSUB_AB q0, q3, q8, q10
+ SUMSUB_AB q1, q2, q9, q11
+
+ TRANSPOSE4x4_16 q0, q1, q2, q3
+
+ IDCT_1D q8, q9, q10, q11, q0, q1, q2, q3
+ SUMSUB_AB q0, q3, q8, q10
+ SUMSUB_AB q1, q2, q9, q11
+
+ vrshr.s16 q0, q0, #6
+ vld1.32 {d28}, [r0,:64], r2
+ vrshr.s16 q1, q1, #6
+ vld1.32 {d29}, [r0,:64], r2
+ vrshr.s16 q2, q2, #6
+ vld1.32 {d30}, [r0,:64], r2
+ vrshr.s16 q3, q3, #6
+ vld1.32 {d31}, [r0,:64], r2
+
+ sub r0, r0, r2, lsl #2
+ vaddw.u8 q0, q0, d28
+ vaddw.u8 q1, q1, d29
+ vaddw.u8 q2, q2, d30
+ vaddw.u8 q3, q3, d31
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 {d0}, [r0,:64], r2
+ vqmovun.s16 d2, q2
+ vst1.32 {d1}, [r0,:64], r2
+ vqmovun.s16 d3, q3
+ vst1.32 {d2}, [r0,:64], r2
+ vst1.32 {d3}, [r0,:64], r2
+ bx lr
+.endfunc
+
+function x264_add8x8_idct_neon
+ mov r2, #FDEC_STRIDE
+ mov ip, lr
+ bl x264_add8x4_idct_neon
+ mov lr, ip
+ b x264_add8x4_idct_neon
+.endfunc
+
+function x264_add16x16_idct_neon
+ mov r2, #FDEC_STRIDE
+ mov ip, lr
+ bl x264_add8x4_idct_neon
+ bl x264_add8x4_idct_neon
+ sub r0, r0, #8*FDEC_STRIDE-8
+ bl x264_add8x4_idct_neon
+ bl x264_add8x4_idct_neon
+ sub r0, r0, #8
+ bl x264_add8x4_idct_neon
+ bl x264_add8x4_idct_neon
+ sub r0, r0, #8*FDEC_STRIDE-8
+ bl x264_add8x4_idct_neon
+ mov lr, ip
+ b x264_add8x4_idct_neon
+.endfunc
+
+
+.macro IDCT8_1D type
+.ifc \type, col
+ vswp d21, d28
+.endif
+ SUMSUB_AB q0, q1, q8, q12 // a0/a2
+.ifc \type, row
+ vld1.64 {d28-d31}, [r1,:128]!
+.else
+ vswp d19, d26
+.endif
+ SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4
+.ifc \type, col
+ vswp d23, d30
+.endif
+ SUMSUB_AB q8, q10, q13, q11
+ SUMSUB_15 q8, q10, q9, q15, q12, q14 // a7/a1
+ SUMSUB_AB q14, q15, q15, q9
+ SUMSUB_15 q15, q14, q13, q11, q12, q9 // a5/a3
+
+ SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5
+ SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7
+
+ SUMSUB_AB q10, q2, q0, q2 // b0/b6
+ SUMSUB_AB q11, q3, q1, q3 // b2/b4
+
+ SUMSUB_AB q8, q15, q10, q15
+ SUMSUB_AB q9, q14, q11, q14
+ SUMSUB_AB q10, q13, q3, q13
+.ifc \type, row
+ vtrn.16 q8, q9
+.endif
+ SUMSUB_AB q11, q12, q2, q12
+.endm
+
+function x264_add8x8_idct8_neon
+ mov r2, #FDEC_STRIDE
+ vld1.64 {d16-d19}, [r1,:128]!
+ vld1.64 {d20-d23}, [r1,:128]!
+ vld1.64 {d24-d27}, [r1,:128]!
+
+ IDCT8_1D row
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vswp d17, d24
+ IDCT8_1D col
+
+ vld1.64 {d0}, [r0,:64], r2
+ vrshr.s16 q8, q8, #6
+ vld1.64 {d1}, [r0,:64], r2
+ vrshr.s16 q9, q9, #6
+ vld1.64 {d2}, [r0,:64], r2
+ vrshr.s16 q10, q10, #6
+ vld1.64 {d3}, [r0,:64], r2
+ vrshr.s16 q11, q11, #6
+ vld1.64 {d4}, [r0,:64], r2
+ vrshr.s16 q12, q12, #6
+ vld1.64 {d5}, [r0,:64], r2
+ vrshr.s16 q13, q13, #6
+ vld1.64 {d6}, [r0,:64], r2
+ vrshr.s16 q14, q14, #6
+ vld1.64 {d7}, [r0,:64], r2
+ vrshr.s16 q15, q15, #6
+ sub r0, r0, r2, lsl #3
+
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vaddw.u8 q11, q11, d3
+ vst1.64 {d0}, [r0,:64], r2
+ vaddw.u8 q12, q12, d4
+ vst1.64 {d1}, [r0,:64], r2
+ vaddw.u8 q13, q13, d5
+ vst1.64 {d2}, [r0,:64], r2
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+ vst1.64 {d3}, [r0,:64], r2
+ vqmovun.s16 d5, q13
+ vst1.64 {d4}, [r0,:64], r2
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+ vst1.64 {d5}, [r0,:64], r2
+ vst1.64 {d6}, [r0,:64], r2
+ vst1.64 {d7}, [r0,:64], r2
+ bx lr
+.endfunc
+
+function x264_add16x16_idct8_neon
+ mov ip, lr
+ bl x264_add8x8_idct8_neon
+ sub r0, r0, #8*FDEC_STRIDE-8
+ bl x264_add8x8_idct8_neon
+ sub r0, r0, #8
+ bl x264_add8x8_idct8_neon
+ sub r0, r0, #8*FDEC_STRIDE-8
+ mov lr, ip
+ b x264_add8x8_idct8_neon
+.endfunc
+
+
+function x264_add8x8_idct_dc_neon
+ mov r2, #FDEC_STRIDE
+ vld1.64 {d16}, [r1,:64]
+ vrshr.s16 d16, d16, #6
+ vld1.64 {d0}, [r0,:64], r2
+ vmov.i16 q15, #0
+ vld1.64 {d1}, [r0,:64], r2
+ vld1.64 {d2}, [r0,:64], r2
+ vdup.16 d20, d16[0]
+ vld1.64 {d3}, [r0,:64], r2
+ vdup.16 d21, d16[1]
+ vld1.64 {d4}, [r0,:64], r2
+ vdup.16 d22, d16[2]
+ vld1.64 {d5}, [r0,:64], r2
+ vdup.16 d23, d16[3]
+ vld1.64 {d6}, [r0,:64], r2
+ vsub.s16 q12, q15, q10
+ vld1.64 {d7}, [r0,:64], r2
+ vsub.s16 q13, q15, q11
+
+ sub r0, r0, #8*FDEC_STRIDE
+
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d26, q13
+
+ vmov d21, d20
+ vqadd.u8 q0, q0, q10
+ vmov d23, d22
+ vqadd.u8 q1, q1, q10
+ vmov d25, d24
+ vqadd.u8 q2, q2, q11
+ vmov d27, d26
+ vqadd.u8 q3, q3, q11
+ vqsub.u8 q0, q0, q12
+ vqsub.u8 q1, q1, q12
+ vqsub.u8 q2, q2, q13
+
+ vst1.64 {d0}, [r0,:64], r2
+ vqsub.u8 q3, q3, q13
+ vst1.64 {d1}, [r0,:64], r2
+ vst1.64 {d2}, [r0,:64], r2
+ vst1.64 {d3}, [r0,:64], r2
+ vst1.64 {d4}, [r0,:64], r2
+ vst1.64 {d5}, [r0,:64], r2
+ vst1.64 {d6}, [r0,:64], r2
+ vst1.64 {d7}, [r0,:64], r2
+ bx lr
+.endfunc
+
+.macro ADD16x4_IDCT_DC dc
+ vld1.64 {d16-d17}, [r0,:128], r3
+ vld1.64 {d18-d19}, [r0,:128], r3
+ vdup.16 d4, \dc[0]
+ vdup.16 d5, \dc[1]
+ vld1.64 {d20-d21}, [r0,:128], r3
+ vdup.16 d6, \dc[2]
+ vdup.16 d7, \dc[3]
+ vld1.64 {d22-d23}, [r0,:128], r3
+ vsub.s16 q12, q15, q2
+ vsub.s16 q13, q15, q3
+
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vqmovun.s16 d6, q12
+ vqmovun.s16 d7, q13
+
+ vqadd.u8 q8, q8, q2
+ vqadd.u8 q9, q9, q2
+ vqadd.u8 q10, q10, q2
+ vqadd.u8 q11, q11, q2
+
+ vqsub.u8 q8, q8, q3
+ vqsub.u8 q9, q9, q3
+ vqsub.u8 q10, q10, q3
+ vst1.64 {d16-d17}, [r2,:128], r3
+ vqsub.u8 q11, q11, q3
+ vst1.64 {d18-d19}, [r2,:128], r3
+ vst1.64 {d20-d21}, [r2,:128], r3
+ vst1.64 {d22-d23}, [r2,:128], r3
+.endm
+
+function x264_add16x16_idct_dc_neon
+ mov r2, r0
+ mov r3, #FDEC_STRIDE
+ vmov.i16 q15, #0
+
+ vld1.64 {d0-d3}, [r1,:64]
+ vrshr.s16 q0, #6
+ vrshr.s16 q1, #6
+
+ ADD16x4_IDCT_DC d0
+ ADD16x4_IDCT_DC d1
+ ADD16x4_IDCT_DC d2
+ ADD16x4_IDCT_DC d3
+ bx lr
+.endfunc
+
+function x264_sub8x8_dct_dc_neon
+ mov r3, #FENC_STRIDE
+ mov ip, #FDEC_STRIDE
+ vld1.64 {d16}, [r1,:64], r3
+ vld1.64 {d17}, [r2,:64], ip
+ vsubl.u8 q8, d16, d17
+ vld1.64 {d18}, [r1,:64], r3
+ vld1.64 {d19}, [r2,:64], ip
+ vsubl.u8 q9, d18, d19
+ vld1.64 {d20}, [r1,:64], r3
+ vld1.64 {d21}, [r2,:64], ip
+ vsubl.u8 q10, d20, d21
+ vld1.64 {d22}, [r1,:64], r3
+ vadd.s16 q0, q8, q9
+ vld1.64 {d23}, [r2,:64], ip
+ vsubl.u8 q11, d22, d23
+ vld1.64 {d24}, [r1,:64], r3
+ vadd.s16 q0, q0, q10
+ vld1.64 {d25}, [r2,:64], ip
+ vsubl.u8 q12, d24, d25
+ vld1.64 {d26}, [r1,:64], r3
+ vadd.s16 q0, q0, q11
+ vld1.64 {d27}, [r2,:64], ip
+ vsubl.u8 q13, d26, d27
+ vld1.64 {d28}, [r1,:64], r3
+ vld1.64 {d29}, [r2,:64], ip
+ vsubl.u8 q14, d28, d29
+ vld1.64 {d30}, [r1,:64], r3
+ vadd.s16 q1, q12, q13
+ vld1.64 {d31}, [r2,:64], ip
+ vpadd.s16 d0, d0, d1
+ vadd.s16 q1, q1, q14
+ vsubl.u8 q15, d30, d31
+ vadd.s16 q1, q1, q15
+ vpadd.s16 d2, d2, d3
+ vpadd.s16 d0, d0, d2
+ vst1.64 {d0}, [r0,:64]
+ bx lr
+.endfunc
+
+
+function x264_zigzag_scan_4x4_frame_neon
+ movrel r2, scan4x4_frame
+ vld1.64 {d0-d3}, [r1,:128]
+ vld1.64 {d16-d19}, [r2,:128]
+ vtbl.8 d4, {d0-d1}, d16
+ vtbl.8 d5, {d1-d3}, d17
+ vtbl.8 d6, {d0-d2}, d18
+ vtbl.8 d7, {d2-d3}, d19
+ vst1.64 {d4-d7}, [r0,:128]
+ bx lr
+.endfunc
diff --git a/common/arm/dct.h b/common/arm/dct.h
new file mode 100644
index 0000000..55f53ce
--- /dev/null
+++ b/common/arm/dct.h
@@ -0,0 +1,49 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_DCT_H
+#define X264_ARM_DCT_H
+
+void x264_dct4x4dc_neon( int16_t d[16] );
+void x264_idct4x4dc_neon( int16_t d[16] );
+
+void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
+void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
+
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
+void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
+void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
+
+void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
+
+#endif
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
new file mode 100644
index 0000000..f124b55
--- /dev/null
+++ b/common/arm/deblock-a.S
@@ -0,0 +1,283 @@
+/*****************************************************************************
+ * deblock.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Mans Rullgard <mans at mansr.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.macro h264_loop_filter_start
+ ldr ip, [sp]
+ ldr ip, [ip]
+ vmov.32 d24[0], ip
+ and ip, ip, ip, lsl #16
+ ands ip, ip, ip, lsl #8
+ bxlt lr
+.endm
+
+.macro align_push_regs
+ and ip, sp, #15
+ add ip, ip, #32
+ sub sp, sp, ip
+ vst1.64 {d12-d15}, [sp,:128]
+ sub sp, sp, #32
+ vst1.64 {d8-d11}, [sp,:128]
+.endm
+
+.macro align_pop_regs
+ vld1.64 {d8-d11}, [sp,:128]!
+ vld1.64 {d12-d15}, [sp,:128], ip
+.endm
+
+.macro h264_loop_filter_luma
+ vdup.8 q11, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 q6, q8, q0 @ abs(p0 - q0)
+ vmovl.u16 q12, d24
+ vabd.u8 q14, q9, q8 @ abs(p1 - p0)
+ vsli.16 q12, q12, #8
+ vabd.u8 q15, q1, q0 @ abs(q1 - q0)
+ vsli.32 q12, q12, #16
+ vclt.u8 q6, q6, q11 @ < alpha
+ vdup.8 q11, r3 @ beta
+ vclt.s8 q7, q12, #0
+ vclt.u8 q14, q14, q11 @ < beta
+ vclt.u8 q15, q15, q11 @ < beta
+ vbic q6, q6, q7
+ vabd.u8 q4, q10, q8 @ abs(p2 - p0)
+ vand q6, q6, q14
+ vabd.u8 q5, q2, q0 @ abs(q2 - q0)
+ vclt.u8 q4, q4, q11 @ < beta
+ vand q6, q6, q15
+ vclt.u8 q5, q5, q11 @ < beta
+ vand q4, q4, q6
+ vand q5, q5, q6
+ vand q12, q12, q6
+ vrhadd.u8 q14, q8, q0
+ vsub.i8 q6, q12, q4
+ vqadd.u8 q7, q9, q12
+ vhadd.u8 q10, q10, q14
+ vsub.i8 q6, q6, q5
+ vhadd.u8 q14, q2, q14
+ vmin.u8 q7, q7, q10
+ vqsub.u8 q11, q9, q12
+ vqadd.u8 q2, q1, q12
+ vmax.u8 q7, q7, q11
+ vqsub.u8 q11, q1, q12
+ vmin.u8 q14, q2, q14
+ vmovl.u8 q2, d0
+ vmax.u8 q14, q14, q11
+ vmovl.u8 q10, d1
+ vsubw.u8 q2, q2, d16
+ vsubw.u8 q10, q10, d17
+ vshl.i16 q2, q2, #2
+ vshl.i16 q10, q10, #2
+ vaddw.u8 q2, q2, d18
+ vaddw.u8 q10, q10, d19
+ vsubw.u8 q2, q2, d2
+ vsubw.u8 q10, q10, d3
+ vrshrn.i16 d4, q2, #3
+ vrshrn.i16 d5, q10, #3
+ vbsl q4, q7, q9
+ vbsl q5, q14, q1
+ vneg.s8 q7, q6
+ vmovl.u8 q14, d16
+ vmin.s8 q2, q2, q6
+ vmovl.u8 q6, d17
+ vmax.s8 q2, q2, q7
+ vmovl.u8 q11, d0
+ vmovl.u8 q12, d1
+ vaddw.s8 q14, q14, d4
+ vaddw.s8 q6, q6, d5
+ vsubw.s8 q11, q11, d4
+ vsubw.s8 q12, q12, d5
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d17, q6
+ vqmovun.s16 d0, q11
+ vqmovun.s16 d1, q12
+.endm
+
+function x264_deblock_v_luma_neon
+ h264_loop_filter_start
+
+ vld1.64 {d0, d1}, [r0,:128], r1
+ vld1.64 {d2, d3}, [r0,:128], r1
+ vld1.64 {d4, d5}, [r0,:128], r1
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d20,d21}, [r0,:128], r1
+ vld1.64 {d18,d19}, [r0,:128], r1
+ vld1.64 {d16,d17}, [r0,:128], r1
+
+ align_push_regs
+
+ h264_loop_filter_luma
+
+ sub r0, r0, r1, lsl #1
+ vst1.64 {d8, d9}, [r0,:128], r1
+ vst1.64 {d16,d17}, [r0,:128], r1
+ vst1.64 {d0, d1}, [r0,:128], r1
+ vst1.64 {d10,d11}, [r0,:128]
+
+ align_pop_regs
+ bx lr
+.endfunc
+
+function x264_deblock_h_luma_neon
+ h264_loop_filter_start
+
+ sub r0, r0, #4
+ vld1.64 {d6}, [r0], r1
+ vld1.64 {d20}, [r0], r1
+ vld1.64 {d18}, [r0], r1
+ vld1.64 {d16}, [r0], r1
+ vld1.64 {d0}, [r0], r1
+ vld1.64 {d2}, [r0], r1
+ vld1.64 {d4}, [r0], r1
+ vld1.64 {d26}, [r0], r1
+ vld1.64 {d7}, [r0], r1
+ vld1.64 {d21}, [r0], r1
+ vld1.64 {d19}, [r0], r1
+ vld1.64 {d17}, [r0], r1
+ vld1.64 {d1}, [r0], r1
+ vld1.64 {d3}, [r0], r1
+ vld1.64 {d5}, [r0], r1
+ vld1.64 {d27}, [r0], r1
+
+ TRANSPOSE8x8 q3, q10, q9, q8, q0, q1, q2, q13
+
+ align_push_regs
+
+ h264_loop_filter_luma
+
+ TRANSPOSE4x4 q4, q8, q0, q5
+
+ sub r0, r0, r1, lsl #4
+ add r0, r0, #2
+ vst1.32 {d8[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d10[0]}, [r0], r1
+ vst1.32 {d8[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d10[1]}, [r0], r1
+ vst1.32 {d9[0]}, [r0], r1
+ vst1.32 {d17[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ vst1.32 {d11[0]}, [r0], r1
+ vst1.32 {d9[1]}, [r0], r1
+ vst1.32 {d17[1]}, [r0], r1
+ vst1.32 {d1[1]}, [r0], r1
+ vst1.32 {d11[1]}, [r0], r1
+
+ align_pop_regs
+ bx lr
+.endfunc
+
+.macro h264_loop_filter_chroma
+ vdup.8 d22, r2 // alpha
+ vmovl.u8 q12, d24
+ vabd.u8 d26, d16, d0 // abs(p0 - q0)
+ vmovl.u8 q2, d0
+ vabd.u8 d28, d18, d16 // abs(p1 - p0)
+ vsubw.u8 q2, q2, d16
+ vsli.16 d24, d24, #8
+ vshl.i16 q2, q2, #2
+ vabd.u8 d30, d2, d0 // abs(q1 - q0)
+ vaddw.u8 q2, q2, d18
+ vclt.u8 d26, d26, d22 // < alpha
+ vsubw.u8 q2, q2, d2
+ vdup.8 d22, r3 // beta
+ vclt.s8 d25, d24, #0
+ vrshrn.i16 d4, q2, #3
+ vclt.u8 d28, d28, d22 // < beta
+ vbic d26, d26, d25
+ vclt.u8 d30, d30, d22 // < beta
+ vand d26, d26, d28
+ vneg.s8 d25, d24
+ vand d26, d26, d30
+ vmin.s8 d4, d4, d24
+ vmovl.u8 q14, d16
+ vand d4, d4, d26
+ vmax.s8 d4, d4, d25
+ vmovl.u8 q11, d0
+ vaddw.s8 q14, q14, d4
+ vsubw.s8 q11, q11, d4
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d0, q11
+.endm
+
+function x264_deblock_v_chroma_neon
+ h264_loop_filter_start
+
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d18}, [r0,:64], r1
+ vld1.64 {d16}, [r0,:64], r1
+ vld1.64 {d0}, [r0,:64], r1
+ vld1.64 {d2}, [r0,:64]
+
+ h264_loop_filter_chroma
+
+ sub r0, r0, r1, lsl #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d0}, [r0,:64], r1
+
+ bx lr
+.endfunc
+
+function x264_deblock_h_chroma_neon
+ h264_loop_filter_start
+
+ sub r0, r0, #2
+ vld1.32 {d18[]}, [r0], r1
+ vld1.32 {d16[]}, [r0], r1
+ vld1.32 {d0[]}, [r0], r1
+ vld1.32 {d2[]}, [r0], r1
+ vld1.32 {d18[1]}, [r0], r1
+ vld1.32 {d16[1]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[1]}, [r0], r1
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ h264_loop_filter_chroma
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ sub r0, r0, r1, lsl #3
+ vst1.32 {d18[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d2[0]}, [r0], r1
+ vst1.32 {d18[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d2[1]}, [r0], r1
+
+ bx lr
+.endfunc
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
new file mode 100644
index 0000000..a62af39
--- /dev/null
+++ b/common/arm/mc-a.S
@@ -0,0 +1,1045 @@
+/*****************************************************************************
+ * mc.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Mans Rullgard <mans at mansr.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.text
+
+// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
+// They also use nothing above armv5te, but we don't care about pre-armv6
+
+// void prefetch_ref( uint8_t *pix, int stride, int parity )
+function x264_prefetch_ref_arm
+ sub r2, r2, #1
+ add r0, r0, #64
+ and r2, r2, r1
+ add r0, r0, r2, lsl #3
+ add r2, r1, r1, lsl #1
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+ add r3, r0, r1, lsl #2
+ pld [r0, r2]
+ pld [r3]
+ pld [r3, r1]
+ pld [r3, r1, lsl #1]
+ pld [r3, r2]
+ bx lr
+.endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, int stride_y,
+// uint8_t *pix_uv, int stride_uv, int mb_x )
+function x264_prefetch_fenc_arm
+ ldr ip, [sp]
+ push {lr}
+ and lr, ip, #3
+ smulbb lr, lr, r1 // note: this assumes stride_y is <= 16 bits signed
+ and ip, ip, #6
+ smulbb ip, ip, r3
+ add r0, r0, #64
+ add r2, r2, #64
+ add r0, r0, lr, lsl #2
+ pld [r0]
+ add lr, r0, r1, lsl #1
+ pld [r0, r1]
+ pld [lr]
+ add r2, r2, ip, lsl #2
+ pld [lr, r1]
+ pld [r2]
+ add ip, r2, r3, lsl #1
+ pld [r2, r3]
+ pld [ip]
+ pld [ip, r3]
+ pop {pc}
+.endfunc
+
+
+// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
+function x264_memcpy_aligned_neon
+ orr r3, r0, r1, lsr #1
+ movrel ip, memcpy_table
+ and r3, r3, #0xc
+ ldr pc, [ip, r3]
+.endfunc
+
+.macro MEMCPY_ALIGNED srcalign dstalign
+function memcpy_aligned_\dstalign\()_\srcalign\()_neon
+ mov r3, r0
+.if \srcalign == 8 && \dstalign == 8
+ sub r2, #16
+ vld1.64 {d0}, [r1,:64]!
+ vst1.64 {d0}, [r3,:64]!
+ .set r1align, 128
+ .set r3align, 128
+.else
+ .set r1align, \srcalign * 8
+ .set r3align, \dstalign * 8
+.endif
+ tst r2, #16
+ beq 32f
+ sub r2, #16
+ vld1.64 {d0-d1}, [r1,:r1align]!
+ vst1.64 {d0-d1}, [r3,:r3align]!
+32: // n is a multiple of 32
+ tst r2, #32
+ beq 64f
+ sub r2, #32
+ vld1.64 {d0-d3}, [r1,:r1align]!
+ vst1.64 {d0-d3}, [r3,:r3align]!
+64: // n is a multiple of 64
+ subs r2, #64
+ vld1.64 {d0-d3}, [r1,:r1align]!
+ vld1.64 {d4-d7}, [r1,:r1align]!
+ vst1.64 {d0-d3}, [r3,:r3align]!
+ vst1.64 {d4-d7}, [r3,:r3align]!
+ bgt 64b
+.if \srcalign == 8 && \dstalign == 8
+ vld1.64 {d0}, [r1,:64]!
+ vst1.64 {d0}, [r3,:64]!
+.endif
+ bx lr
+.endfunc
+.endm
+
+MEMCPY_ALIGNED 16, 16
+MEMCPY_ALIGNED 16, 8
+MEMCPY_ALIGNED 8, 16
+MEMCPY_ALIGNED 8, 8
+
+.section .rodata
+memcpy_table:
+.word memcpy_aligned_16_16_neon
+.word memcpy_aligned_16_8_neon
+.word memcpy_aligned_8_16_neon
+.word memcpy_aligned_8_8_neon
+.text
+
+.ltorg
+
+// void x264_memzero_aligned( void *dst, size_t n )
+function x264_memzero_aligned_neon
+ vmov.i8 q0, #0
+ vmov.i8 q1, #0
+memzero_loop:
+ subs r1, #128
+.rept 4
+ vst1.64 {d0-d3}, [r0,:128]!
+.endr
+ bgt memzero_loop
+ bx lr
+.endfunc
+
+
+// void pixel_avg( uint8_t *dst, int dst_stride,
+// uint8_t *src1, int src1_stride,
+// uint8_t *src2, int src2_stride, int weight );
+.macro AVGH w h
+function x264_pixel_avg_\w\()x\h\()_neon
+ ldr ip, [sp, #8]
+ push {r4-r6,lr}
+ cmp ip, #32
+ ldrd r4, [sp, #16]
+ mov lr, #\h
+ beq x264_pixel_avg_w\w\()_neon
+ rsbs r6, ip, #64
+ blt x264_pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
+ cmp ip, #0
+ bge x264_pixel_avg_weight_w\w\()_add_add_neon
+ b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
+.endfunc
+.endm
+
+AVGH 4, 2
+AVGH 4, 4
+AVGH 4, 8
+AVGH 8, 4
+AVGH 8, 8
+AVGH 8, 16
+AVGH 16, 8
+AVGH 16, 16
+
+// 0 < weight < 64
+.macro load_weights_add_add
+ vdup.8 d30, ip
+ vdup.8 d31, r6
+.endm
+
+.macro load_add_add d1 d2
+ vld1.32 {\d1}, [r2], r3
+ vld1.32 {\d2}, [r4], r5
+.endm
+
+.macro weight_add_add dst s1 s2
+ vmull.u8 \dst, \s1, d30
+ vmlal.u8 \dst, \s2, d31
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+ rsb r6, #0
+ vdup.8 d30, ip
+ vdup.8 d31, r6
+.endm
+
+.macro load_add_sub d1 d2
+ vld1.32 {\d1}, [r2], r3
+ vld1.32 {\d2}, [r4], r5
+.endm
+
+.macro weight_add_sub dst s1 s2
+ vmull.u8 \dst, \s1, d30
+ vmlsl.u8 \dst, \s2, d31
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+ rsb ip, #0
+ vdup.8 d31, r6
+ vdup.8 d30, ip
+.endm
+
+.macro load_sub_add d1 d2
+ vld1.32 {\d2}, [r4], r5
+ vld1.32 {\d1}, [r2], r3
+.endm
+
+.macro weight_sub_add dst s1 s2
+ vmull.u8 \dst, \s2, d31
+ vmlsl.u8 \dst, \s1, d30
+.endm
+
+.macro AVG_WEIGHT ext
+function x264_pixel_avg_weight_w4_\ext\()_neon
+ load_weights_\ext
+1: // height loop
+ subs lr, lr, #2
+ load_\ext d0[], d1[]
+ weight_\ext q8, d0, d1
+ load_\ext d2[], d3[]
+ vqrshrun.s16 d0, q8, #6
+ weight_\ext q9, d2, d3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vqrshrun.s16 d1, q9, #6
+ vst1.32 {d1[0]}, [r0,:32], r1
+ bgt 1b
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_weight_w8_\ext\()_neon
+ load_weights_\ext
+1: // height loop
+ subs lr, lr, #4
+ load_\ext d0, d1
+ weight_\ext q8, d0, d1
+ load_\ext d2, d3
+ weight_\ext q9, d2, d3
+ load_\ext d4, d5
+ weight_\ext q10, d4, d5
+ load_\ext d6, d7
+ weight_\ext q11, d6, d7
+ vqrshrun.s16 d0, q8, #6
+ vqrshrun.s16 d1, q9, #6
+ vqrshrun.s16 d2, q10, #6
+ vqrshrun.s16 d3, q11, #6
+ vst1.64 {d0}, [r0,:64], r1
+ vst1.64 {d1}, [r0,:64], r1
+ vst1.64 {d2}, [r0,:64], r1
+ vst1.64 {d3}, [r0,:64], r1
+ bgt 1b
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_weight_w16_\ext\()_neon
+ load_weights_\ext
+1: // height loop
+ subs lr, lr, #2
+ load_\ext d0-d1, d2-d3
+ weight_\ext q8, d0, d2
+ weight_\ext q9, d1, d3
+ load_\ext d4-d5, d6-d7
+ weight_\ext q10, d4, d6
+ weight_\ext q11, d5, d7
+ vqrshrun.s16 d0, q8, #6
+ vqrshrun.s16 d1, q9, #6
+ vqrshrun.s16 d2, q10, #6
+ vqrshrun.s16 d3, q11, #6
+ vst1.64 {d0-d1}, [r0,:128], r1
+ vst1.64 {d2-d3}, [r0,:128], r1
+ bgt 1b
+ pop {r4-r6,pc}
+.endfunc
+.endm
+
+AVG_WEIGHT add_add
+AVG_WEIGHT add_sub
+AVG_WEIGHT sub_add
+
+function x264_pixel_avg_w4_neon
+ subs lr, lr, #2
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d2[]}, [r4], r5
+ vrhadd.u8 d0, d0, d2
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d3[]}, [r4], r5
+ vrhadd.u8 d1, d1, d3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ bgt x264_pixel_avg_w4_neon
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_w8_neon
+ subs lr, lr, #4
+ vld1.64 {d0}, [r2], r3
+ vld1.64 {d2}, [r4], r5
+ vrhadd.u8 d0, d0, d2
+ vld1.64 {d1}, [r2], r3
+ vld1.64 {d3}, [r4], r5
+ vrhadd.u8 d1, d1, d3
+ vst1.64 {d0}, [r0,:64], r1
+ vld1.64 {d2}, [r2], r3
+ vld1.64 {d4}, [r4], r5
+ vrhadd.u8 d2, d2, d4
+ vst1.64 {d1}, [r0,:64], r1
+ vld1.64 {d3}, [r2], r3
+ vld1.64 {d5}, [r4], r5
+ vrhadd.u8 d3, d3, d5
+ vst1.64 {d2}, [r0,:64], r1
+ vst1.64 {d3}, [r0,:64], r1
+ bgt x264_pixel_avg_w8_neon
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_w16_neon
+ subs lr, lr, #4
+ vld1.64 {d0-d1}, [r2], r3
+ vld1.64 {d2-d3}, [r4], r5
+ vrhadd.u8 q0, q0, q1
+ vld1.64 {d2-d3}, [r2], r3
+ vld1.64 {d4-d5}, [r4], r5
+ vrhadd.u8 q1, q1, q2
+ vst1.64 {d0-d1}, [r0,:128], r1
+ vld1.64 {d4-d5}, [r2], r3
+ vld1.64 {d6-d7}, [r4], r5
+ vrhadd.u8 q2, q2, q3
+ vst1.64 {d2-d3}, [r0,:128], r1
+ vld1.64 {d6-d7}, [r2], r3
+ vld1.64 {d0-d1}, [r4], r5
+ vrhadd.u8 q3, q3, q0
+ vst1.64 {d4-d5}, [r0,:128], r1
+ vst1.64 {d6-d7}, [r0,:128], r1
+ bgt x264_pixel_avg_w16_neon
+ pop {r4-r6,pc}
+.endfunc
+
+
+function x264_pixel_avg2_w4_neon
+ ldr ip, [sp, #4]
+ push {lr}
+ ldr lr, [sp, #4]
+avg2_w4_loop:
+ subs ip, ip, #2
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d2[]}, [lr], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d3[]}, [lr], r3
+ vrhadd.u8 d1, d1, d3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ bgt avg2_w4_loop
+ pop {pc}
+.endfunc
+
+function x264_pixel_avg2_w8_neon
+ ldr ip, [sp, #4]
+ push {lr}
+ ldr lr, [sp, #4]
+avg2_w8_loop:
+ subs ip, ip, #2
+ vld1.64 {d0}, [r2], r3
+ vld1.64 {d2}, [lr], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.64 {d1}, [r2], r3
+ vld1.64 {d3}, [lr], r3
+ vrhadd.u8 d1, d1, d3
+ vst1.64 {d0}, [r0,:64], r1
+ vst1.64 {d1}, [r0,:64], r1
+ bgt avg2_w8_loop
+ pop {pc}
+.endfunc
+
+function x264_pixel_avg2_w16_neon
+ ldr ip, [sp, #4]
+ push {lr}
+ ldr lr, [sp, #4]
+avg2_w16_loop:
+ subs ip, ip, #2
+ vld1.64 {d0-d1}, [r2], r3
+ vld1.64 {d2-d3}, [lr], r3
+ vrhadd.u8 q0, q0, q1
+ vld1.64 {d4-d5}, [r2], r3
+ vld1.64 {d6-d7}, [lr], r3
+ vrhadd.u8 q2, q2, q3
+ vst1.64 {d0-d1}, [r0,:128], r1
+ vst1.64 {d4-d5}, [r0,:128], r1
+ bgt avg2_w16_loop
+ pop {pc}
+.endfunc
+
+function x264_pixel_avg2_w20_neon
+ ldr ip, [sp, #4]
+ push {lr}
+ sub r1, r1, #16
+ ldr lr, [sp, #4]
+avg2_w20_loop:
+ subs ip, ip, #2
+ vld1.64 {d0-d2}, [r2], r3
+ vld1.64 {d4-d6}, [lr], r3
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 d2, d2, d6
+ vld1.64 {d4-d6}, [r2], r3
+ vld1.64 {d16-d18},[lr], r3
+ vrhadd.u8 q2, q2, q8
+ vst1.64 {d0-d1}, [r0,:128]!
+ vrhadd.u8 d6, d6, d18
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.64 {d4-d5}, [r0,:128]!
+ vst1.32 {d6[0]}, [r0,:32], r1
+ bgt avg2_w20_loop
+ pop {pc}
+.endfunc
+
+
+// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+function x264_mc_copy_w4_neon
+ ldr ip, [sp]
+copy_w4_loop:
+ subs ip, ip, #4
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ bgt copy_w4_loop
+ bx lr
+.endfunc
+
+function x264_mc_copy_w8_neon
+ ldr ip, [sp]
+copy_w8_loop:
+ subs ip, ip, #4
+ vld1.32 {d0}, [r2], r3
+ vld1.32 {d1}, [r2], r3
+ vld1.32 {d2}, [r2], r3
+ vld1.32 {d3}, [r2], r3
+ vst1.32 {d0}, [r0,:64], r1
+ vst1.32 {d1}, [r0,:64], r1
+ vst1.32 {d2}, [r0,:64], r1
+ vst1.32 {d3}, [r0,:64], r1
+ bgt copy_w8_loop
+ bx lr
+.endfunc
+
+function x264_mc_copy_w16_neon
+ ldr ip, [sp]
+copy_w16_loop:
+ subs ip, ip, #4
+ vld1.32 {d0-d1}, [r2], r3
+ vld1.32 {d2-d3}, [r2], r3
+ vld1.32 {d4-d5}, [r2], r3
+ vld1.32 {d6-d7}, [r2], r3
+ vst1.32 {d0-d1}, [r0,:128], r1
+ vst1.32 {d2-d3}, [r0,:128], r1
+ vst1.32 {d4-d5}, [r0,:128], r1
+ vst1.32 {d6-d7}, [r0,:128], r1
+ bgt copy_w16_loop
+ bx lr
+.endfunc
+
+function x264_mc_copy_w16_aligned_neon
+ ldr ip, [sp]
+copy_w16_aligned_loop:
+ subs ip, ip, #4
+ vld1.32 {d0-d1}, [r2,:128], r3
+ vld1.32 {d2-d3}, [r2,:128], r3
+ vld1.32 {d4-d5}, [r2,:128], r3
+ vld1.32 {d6-d7}, [r2,:128], r3
+ vst1.32 {d0-d1}, [r0,:128], r1
+ vst1.32 {d2-d3}, [r0,:128], r1
+ vst1.32 {d4-d5}, [r0,:128], r1
+ vst1.32 {d6-d7}, [r0,:128], r1
+ bgt copy_w16_aligned_loop
+ bx lr
+.endfunc
+
+
+// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
+// uint8_t *src, int i_src_stride,
+// int dx, int dy, int i_width, int i_height );
+function x264_mc_chroma_neon
+ push {r4-r6, lr}
+ ldrd r4, [sp, #16]
+ ldr r6, [sp, #24]
+
+ asr lr, r5, #3
+ mul lr, r3, lr
+ add r2, r2, r4, asr #3
+ cmp r6, #4
+ add r2, r2, lr
+
+ and r4, r4, #7
+ and r5, r5, #7
+ pld [r2]
+ pld [r2, r3]
+
+ bgt mc_chroma_w8
+ beq mc_chroma_w4
+
+// calculate cA cB cC cD
+.macro CHROMA_MC_START r0 r1
+ muls lr, r4, r5
+ rsb r6, lr, r5, lsl #3
+ rsb ip, lr, r4, lsl #3
+ sub r4, lr, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r2, r3
+
+ vdup.8 d0, r4
+ lsl r3, r3, #1
+ vdup.8 d1, ip
+ vld1.64 {\r0}, [r2], r3
+ vdup.8 d2, r6
+ vld1.64 {\r1}, [r5], r3
+ vdup.8 d3, lr
+ ldr r4, [sp, #28]
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+.endm
+
+.macro CHROMA_MC width, align
+mc_chroma_w\width:
+ CHROMA_MC_START d4, d6
+// since the element size varies, there's a different index for the 2nd store
+.if \width == 4
+ .set st2, 1
+.else
+ .set st2, 2
+.endif
+
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+1: // height loop, interpolate xy
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+ vld1.64 {d4}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+ vld1.64 {d6}, [r5], r3
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r4, r4, #2
+ pld [r2]
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vst1.\align {d16[0]}, [r0,:\align], r1
+ vst1.\align {d16[st2]}, [r0,:\align], r1
+ bgt 1b
+
+ pop {r4-r6, pc}
+
+2: // dx or dy are 0
+ tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ vtrn.32 d0, d1
+ ldr r4, [sp, #28]
+
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+ add r5, r2, r3
+ lsl r3, r3, #1
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d4[1]}, [r5], r3
+
+3: // vertical interpolation loop
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r2], r3
+ vmull.u8 q9, d4, d1
+ vld1.32 {d4[1]}, [r5], r3
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r4, r4, #2
+ pld [r2]
+ vst1.\align {d16[0]}, [r0,:\align], r1
+ vst1.\align {d16[st2]}, [r0,:\align], r1
+ bgt 3b
+
+ pop {r4-r6, pc}
+
+4: // dy is 0
+ vld1.64 {d4}, [r2], r3
+ vld1.64 {d6}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+5: // horizontal interpolation loop
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r4, r4, #2
+ vld1.64 {d4}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r2]
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6}, [r2], r3
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ pld [r2]
+ vst1.\align {d16[0]}, [r0,:\align], r1
+ vst1.\align {d16[st2]}, [r0,:\align], r1
+ bgt 5b
+
+ pop {r4-r6, pc}
+.endm
+
+ CHROMA_MC 2, 16
+ CHROMA_MC 4, 32
+
+// the optimal timing for width 8 is different enough that it's not
+// readable to put it in the same macro as width 2/4
+mc_chroma_w8:
+ CHROMA_MC_START d4-d5, d6-d7
+
+1: // height loop, interpolate xy
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r2], r3
+ vmlal.u8 q8, d6, d2
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+ subs r4, r4, #2
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6, d7}, [r5], r3
+ pld [r2]
+ vrshrn.u16 d17, q9, #6
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d17}, [r0,:64], r1
+ bgt 1b
+
+ pop {r4-r6, pc}
+
+2: // dx or dy are 0
+ tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ ldr r4, [sp, #28]
+
+ beq 4f
+
+ add r5, r2, r3
+ lsl r3, r3, #1
+ vld1.64 {d4}, [r2], r3
+ vld1.64 {d6}, [r5], r3
+
+3: // vertical interpolation loop
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.64 {d4}, [r2], r3
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+ vld1.64 {d6}, [r5], r3
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ subs r4, r4, #2
+ pld [r2]
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d17}, [r0,:64], r1
+ bgt 3b
+
+ pop {r4-r6, pc}
+
+4: // dy is 0
+ vld1.64 {d4, d5}, [r2], r3
+ vld1.64 {d6, d7}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+5: // horizontal interpolation loop
+ pld [r2]
+ subs r4, r4, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r2], r3
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+ pld [r2]
+ vext.8 d5, d4, d5, #1
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ vld1.64 {d6, d7}, [r2], r3
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d17}, [r0,:64], r1
+ bgt 5b
+
+ pop {r4-r6, pc}
+.endfunc
+
+
+// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
+function x264_hpel_filter_v_neon
+ ldr ip, [sp]
+ sub r1, r1, r3, lsl #1
+ push {lr}
+ add lr, r1, ip
+ vmov.u8 d30, #5
+ vmov.u8 d31, #20
+
+filter_v_loop:
+ subs ip, ip, #16
+ vld1.64 {d0-d1}, [r1,:128], r3
+ vld1.64 {d2-d3}, [r1,:128], r3
+ vld1.64 {d4-d5}, [r1,:128], r3
+ vld1.64 {d6-d7}, [r1,:128], r3
+ vld1.64 {d16-d17}, [r1,:128], r3
+ vld1.64 {d18-d19}, [r1,:128], r3
+ sub r1, lr, ip
+
+ vaddl.u8 q10, d0, d18
+ vmlsl.u8 q10, d2, d30
+ vmlal.u8 q10, d4, d31
+ vmlal.u8 q10, d6, d31
+ vmlsl.u8 q10, d16, d30
+
+ vaddl.u8 q11, d1, d19
+ vmlsl.u8 q11, d3, d30
+ vmlal.u8 q11, d5, d31
+ vmlal.u8 q11, d7, d31
+ vmlsl.u8 q11, d17, d30
+
+ vqrshrun.s16 d0, q10, #5
+ vst1.64 {d20-d21}, [r2,:128]!
+ vqrshrun.s16 d1, q11, #5
+ vst1.64 {d22-d23}, [r2,:128]!
+ vst1.64 {d0-d1}, [r0,:128]!
+ bgt filter_v_loop
+ pop {pc}
+.endfunc
+
+// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+function x264_hpel_filter_c_neon
+ sub r1, #16
+ vld1.64 {d0-d3}, [r1,:128]!
+
+ // unrolled 2x: 4% faster
+filter_c_loop:
+ subs r2, r2, #16
+ vld1.64 {d4-d7}, [r1,:128]!
+ vext.16 q8, q0, q1, #6
+ vext.16 q12, q1, q2, #3
+ vadd.s16 q8, q8, q12
+ vext.16 q9, q0, q1, #7
+ vext.16 q11, q1, q2, #2
+ vadd.s16 q9, q9, q11
+ vext.16 q10, q1, q2, #1
+ vext.16 q11, q1, q2, #6
+ vadd.s16 q10, q1, q10
+ vsub.s16 q8, q8, q9 // a-b
+ vext.16 q15, q2, q3, #3
+ vsub.s16 q9, q9, q10 // b-c
+
+ vext.16 q12, q1, q2, #7
+ vshr.s16 q8, q8, #2 // (a-b)/4
+ vadd.s16 q11, q11, q15
+ vext.16 q14, q2, q3, #2
+ vsub.s16 q8, q8, q9 // (a-b)/4-b+c
+ vadd.s16 q12, q12, q14
+ vext.16 q13, q2, q3, #1
+
+ vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4
+ vadd.s16 q13, q2, q13
+ vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vsub.s16 q11, q11, q12 // a-b
+ vsub.s16 q12, q12, q13 // b-c
+ vshr.s16 q11, q11, #2 // (a-b)/4
+ vqrshrun.s16 d30, q8, #6
+ vsub.s16 q11, q11, q12 // (a-b)/4-b+c
+ vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4
+ vld1.64 {d0-d3}, [r1,:128]!
+ vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+ vext.16 q8, q2, q3, #6
+ vqrshrun.s16 d31, q11, #6
+ vext.16 q12, q3, q0, #3
+ vadd.s16 q8, q8, q12
+ vext.16 q9, q2, q3, #7
+ vst1.64 {d30-d31}, [r0,:128]!
+ bxle lr
+ subs r2, r2, #16
+
+ vext.16 q11, q3, q0, #2
+ vadd.s16 q9, q9, q11
+ vext.16 q10, q3, q0, #1
+ vext.16 q11, q3, q0, #6
+ vadd.s16 q10, q3, q10
+ vsub.s16 q8, q8, q9 // a-b
+ vext.16 q15, q0, q1, #3
+ vsub.s16 q9, q9, q10 // b-c
+
+ vext.16 q12, q3, q0, #7
+ vshr.s16 q8, q8, #2 // (a-b)/4
+ vadd.s16 q11, q11, q15
+ vext.16 q14, q0, q1, #2
+ vsub.s16 q8, q8, q9 // (a-b)/4-b+c
+ vadd.s16 q12, q12, q14
+ vext.16 q13, q0, q1, #1
+
+ vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4
+ vadd.s16 q13, q0, q13
+ vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vsub.s16 q11, q11, q12 // a-b
+ vsub.s16 q12, q12, q13 // b-c
+ vshr.s16 q11, q11, #2 // (a-b)/4
+ vqrshrun.s16 d30, q8, #6
+ vsub.s16 q11, q11, q12 // (a-b)/4-b+c
+ vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4
+ vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+ vqrshrun.s16 d31, q11, #6
+ vst1.64 {d30-d31}, [r0,:128]!
+ bgt filter_c_loop
+ bx lr
+.endfunc
+
+// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+function x264_hpel_filter_h_neon
+ sub r1, #16
+ vmov.u8 d30, #5
+ vld1.64 {d0-d3}, [r1,:128]!
+ vmov.u8 d31, #20
+
+ // unrolled 3x because it's 5% faster, due to mitigating
+ // the high latency of multiplication and vqrshrun
+filter_h_loop:
+ subs r2, r2, #16
+ vld1.64 {d4-d5}, [r1,:128]!
+ vext.8 q8, q0, q1, #14
+ vext.8 q12, q1, q2, #3
+ vaddl.u8 q13, d16, d24
+ vext.8 q9, q0, q1, #15
+ vaddl.u8 q14, d17, d25
+
+ vext.8 q10, q1, q2, #1
+ vmlal.u8 q13, d2, d31
+ vmlsl.u8 q13, d18, d30
+ vext.8 q11, q1, q2, #2
+ vmlal.u8 q13, d20, d31
+ vmlsl.u8 q13, d22, d30
+
+ vmlsl.u8 q14, d19, d30
+ vmlal.u8 q14, d3, d31
+ vmlal.u8 q14, d21, d31
+ vmlsl.u8 q14, d23, d30
+ vqrshrun.s16 d6, q13, #5
+
+ vld1.64 {d0-d1}, [r1,:128]!
+ vext.8 q8, q1, q2, #14
+ vext.8 q12, q2, q0, #3
+ vaddl.u8 q13, d16, d24
+ vqrshrun.s16 d7, q14, #5
+ vext.8 q9, q1, q2, #15
+ vaddl.u8 q14, d17, d25
+
+ vst1.64 {d6-d7}, [r0,:128]!
+ bxle lr
+ subs r2, r2, #16
+
+ vext.8 q10, q2, q0, #1
+ vmlal.u8 q13, d4, d31
+ vmlsl.u8 q13, d18, d30
+ vext.8 q11, q2, q0, #2
+ vmlal.u8 q13, d20, d31
+ vmlsl.u8 q13, d22, d30
+
+ vmlsl.u8 q14, d19, d30
+ vmlal.u8 q14, d5, d31
+ vmlal.u8 q14, d21, d31
+ vmlsl.u8 q14, d23, d30
+ vqrshrun.s16 d6, q13, #5
+
+ vld1.64 {d2-d3}, [r1,:128]!
+ vext.8 q8, q2, q0, #14
+ vext.8 q12, q0, q1, #3
+ vaddl.u8 q13, d16, d24
+ vqrshrun.s16 d7, q14, #5
+ vext.8 q9, q2, q0, #15
+ vaddl.u8 q14, d17, d25
+
+ vst1.64 {d6-d7}, [r0,:128]!
+ bxle lr
+ subs r2, r2, #16
+
+ vext.8 q10, q0, q1, #1
+ vmlal.u8 q13, d0, d31
+ vmlsl.u8 q13, d18, d30
+ vext.8 q11, q0, q1, #2
+ vmlal.u8 q13, d20, d31
+ vmlsl.u8 q13, d22, d30
+
+ vmlsl.u8 q14, d19, d30
+ vmlal.u8 q14, d1, d31
+ vmlal.u8 q14, d21, d31
+ vmlsl.u8 q14, d23, d30
+
+ vqrshrun.s16 d6, q13, #5
+ vqrshrun.s16 d7, q14, #5
+ vst1.64 {d6-d7}, [r0,:128]!
+ bgt filter_h_loop
+ bx lr
+.endfunc
+
+
+// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
+// uint8_t *dstc, int src_stride, int dst_stride, int width,
+// int height )
+function x264_frame_init_lowres_core_neon
+ push {r4-r10,lr}
+ vpush {d8-d15}
+ ldrd r4, [sp, #96]
+ ldrd r6, [sp, #104]
+ ldr lr, [sp, #112]
+ sub r10, r6, r7 // dst_stride - width
+ and r10, r10, #~15
+
+lowres_yloop:
+ mov ip, r7 // width
+ mov r6, r0 // src0
+ add r8, r0, r5 // src1 = src0 + src_stride
+ add r9, r0, r5, lsl #1 // src2 = src1 + src_stride
+
+ vld2.8 {d8, d10}, [r6,:128]!
+ vld2.8 {d12,d14}, [r8,:128]!
+ vld2.8 {d16,d18}, [r9,:128]!
+
+lowres_xloop:
+ subs ip, ip, #16
+
+ vld2.8 {d9, d11}, [r6,:128]!
+ vld2.8 {d13,d15}, [r8,:128]!
+ vrhadd.u8 q0, q4, q6
+ vld2.8 {d17,d19}, [r9,:128]!
+ vrhadd.u8 q5, q5, q7
+ vld2.8 {d20,d22}, [r6,:128]!
+ vrhadd.u8 q1, q6, q8
+ vld2.8 {d24,d26}, [r8,:128]!
+ vrhadd.u8 q7, q7, q9
+ vext.8 q4, q4, q10, #1
+ vrhadd.u8 q0, q0, q5
+ vext.8 q6, q6, q12, #1
+ vrhadd.u8 q1, q1, q7
+ vld2.8 {d28,d30}, [r9,:128]!
+ vrhadd.u8 q4, q4, q6
+ vext.8 q8, q8, q14, #1
+ vrhadd.u8 q6, q6, q8
+ vst1.64 {d0-d1}, [r1,:128]!
+ vrhadd.u8 q2, q4, q5
+ vst1.64 {d2-d3}, [r3,:128]!
+ vrhadd.u8 q3, q6, q7
+ vst1.64 {d4-d5}, [r2,:128]!
+ vst1.64 {d6-d7}, [r4,:128]!
+
+ ble lowres_xloop_end
+ subs ip, ip, #16
+
+ vld2.8 {d21,d23}, [r6,:128]!
+ vld2.8 {d25,d27}, [r8,:128]!
+ vrhadd.u8 q0, q10, q12
+ vld2.8 {d29,d31}, [r9,:128]!
+ vrhadd.u8 q11, q11, q13
+ vld2.8 {d8, d10}, [r6,:128]!
+ vrhadd.u8 q1, q12, q14
+ vld2.8 {d12,d14}, [r8,:128]!
+ vrhadd.u8 q13, q13, q15
+ vext.8 q10, q10, q4, #1
+ vrhadd.u8 q0, q0, q11
+ vext.8 q12, q12, q6, #1
+ vrhadd.u8 q1, q1, q13
+ vld2.8 {d16,d18}, [r9,:128]!
+ vrhadd.u8 q10, q10, q12
+ vext.8 q14, q14, q8, #1
+ vrhadd.u8 q12, q12, q14
+ vst1.64 {d0-d1}, [r1,:128]!
+ vrhadd.u8 q2, q10, q11
+ vst1.64 {d2-d3}, [r3,:128]!
+ vrhadd.u8 q3, q12, q13
+ vst1.64 {d4-d5}, [r2,:128]!
+ vst1.64 {d6-d7}, [r4,:128]!
+
+ bgt lowres_xloop
+
+lowres_xloop_end:
+ subs lr, lr, #1
+ add r0, r0, r5, lsl #1
+ add r1, r1, r10
+ add r2, r2, r10
+ add r3, r3, r10
+ add r4, r4, r10
+ bgt lowres_yloop
+
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+.endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
new file mode 100644
index 0000000..20cf151
--- /dev/null
+++ b/common/arm/mc-c.c
@@ -0,0 +1,196 @@
+/*****************************************************************************
+ * mc-c.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_arm( uint8_t *, int, int );
+void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
+
+void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_neon( void *dst, int n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+
+void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+
+void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
+
+void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_neon,
+ x264_pixel_avg2_w8_neon,
+ x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
+ x264_pixel_avg2_w16_neon,
+ x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_neon,
+ x264_mc_copy_w8_neon,
+ NULL,
+ x264_mc_copy_w16_neon,
+};
+
+static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+ }
+ else if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+ else
+ x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, *i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+ return dst;
+ }
+ else if( weight->weightfn )
+ {
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+ return dst;
+ }
+ else
+ {
+ *i_dst_stride = i_src_stride;
+ return src1;
+ }
+}
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ int stride, int width, int height, int16_t *buf )
+{
+ int realign = (intptr_t)src & 15;
+ src -= realign;
+ dstv -= realign;
+ dstc -= realign;
+ dsth -= realign;
+ width += realign;
+ while( height-- )
+ {
+ x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );
+ x264_hpel_filter_c_neon( dstc, buf+8, width );
+ x264_hpel_filter_h_neon( dsth, src, width );
+ dsth += stride;
+ dstv += stride;
+ dstc += stride;
+ src += stride;
+ }
+}
+
+void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
+{
+ if( !(cpu&X264_CPU_ARMV6) )
+ return;
+
+ pf->prefetch_fenc = x264_prefetch_fenc_arm;
+ pf->prefetch_ref = x264_prefetch_ref_arm;
+
+ if( !(cpu&X264_CPU_NEON) )
+ return;
+
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
+ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
+ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
+
+// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
+#ifndef SYS_MACOSX
+ pf->memcpy_aligned = x264_memcpy_aligned_neon;
+#endif
+ pf->memzero_aligned = x264_memzero_aligned_neon;
+
+ pf->mc_chroma = x264_mc_chroma_neon;
+ pf->mc_luma = mc_luma_neon;
+ pf->get_ref = get_ref_neon;
+ pf->hpel_filter = hpel_filter_neon;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+}
diff --git a/common/ppc/mc.h b/common/arm/mc.h
similarity index 80%
copy from common/ppc/mc.h
copy to common/arm/mc.h
index 0465dd9..6ee510e 100644
--- a/common/ppc/mc.h
+++ b/common/arm/mc.h
@@ -1,7 +1,9 @@
/*****************************************************************************
- * mc.h: h264 encoder library
+ * mc.h: h264 encoder library (Motion Compensation)
*****************************************************************************
- * Copyright (C) 2003-2008 Eric Petit <eric.petit at lapsus.org>
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,9 +20,9 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifndef X264_PPC_MC_H
-#define X264_PPC_MC_H
+#ifndef X264_ARM_MC_H
+#define X264_ARM_MC_H
-void x264_mc_altivec_init( x264_mc_functions_t *pf );
+void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf );
#endif
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
new file mode 100644
index 0000000..4dd65ed
--- /dev/null
+++ b/common/arm/pixel-a.S
@@ -0,0 +1,1238 @@
+/*****************************************************************************
+ * pixel.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.section .rodata
+.align 4
+
+.rept 16 .byte 0xff
+.endr
+mask_ff:
+.rept 16 .byte 0
+.endr
+
+mask_ac4:
+.short 0, -1, -1, -1, 0, -1, -1, -1
+mask_ac8:
+.short 0, -1, -1, -1, -1, -1, -1, -1
+
+.text
+
+.macro SAD4_ARMV6 h
+function x264_pixel_sad_4x\h\()_armv6
+ push {r4-r6,lr}
+ ldr r4, [r2], r3
+ ldr r5, [r0], r1
+ ldr r6, [r2], r3
+ ldr lr, [r0], r1
+ usad8 ip, r4, r5
+.rept (\h - 2)/2
+ ldr r4, [r2], r3
+ ldr r5, [r0], r1
+ usada8 ip, r6, lr, ip
+ ldr r6, [r2], r3
+ ldr lr, [r0], r1
+ usada8 ip, r4, r5, ip
+.endr
+ usada8 r0, r6, lr, ip
+ pop {r4-r6,pc}
+.endfunc
+.endm
+
+SAD4_ARMV6 4
+SAD4_ARMV6 8
+
+
+.macro SAD_START_4 align:vararg
+ vld1.32 {d1[]}, [r2 \align], r3
+ vld1.32 {d0[]}, [r0,:32], r1
+ vabdl.u8 q8, d0, d1
+.endm
+
+.macro SAD_4 align:vararg
+ vld1.32 {d1[]}, [r2 \align], r3
+ vld1.32 {d0[]}, [r0,:32], r1
+ vabal.u8 q8, d0, d1
+.endm
+
+.macro SAD_START_8 align:vararg
+ vld1.64 {d1}, [r2 \align], r3
+ vld1.64 {d0}, [r0,:64], r1
+ vabdl.u8 q8, d0, d1
+.endm
+
+.macro SAD_8 align:vararg
+ vld1.64 {d1}, [r2 \align], r3
+ vld1.64 {d0}, [r0,:64], r1
+ vabal.u8 q8, d0, d1
+.endm
+
+.macro SAD_START_16 align:vararg
+ vld1.64 {d2-d3}, [r2 \align], r3
+ vld1.64 {d0-d1}, [r0,:128], r1
+ vabdl.u8 q8, d0, d2
+ vld1.64 {d6-d7}, [r2 \align], r3
+ vabdl.u8 q9, d1, d3
+ vld1.64 {d4-d5}, [r0,:128], r1
+.endm
+
+.macro SAD_16 align:vararg
+ vabal.u8 q8, d4, d6
+ vld1.64 {d2-d3}, [r2 \align], r3
+ vabal.u8 q9, d5, d7
+ vld1.64 {d0-d1}, [r0,:128], r1
+ vabal.u8 q8, d0, d2
+ vld1.64 {d6-d7}, [r2 \align], r3
+ vabal.u8 q9, d1, d3
+ vld1.64 {d4-d5}, [r0,:128], r1
+.endm
+
+.macro SAD_FUNC w, h, name, align:vararg
+function x264_pixel_sad\name\()_\w\()x\h\()_neon
+.if \w == 16
+ .set r, \h / 2 - 1
+.else
+ .set r, \h - 1
+.endif
+
+ SAD_START_\w \align
+.rept r
+ SAD_\w \align
+.endr
+
+.if \w > 8
+ vabal.u8 q8, d4, d6
+ vabal.u8 q9, d5, d7
+ vadd.u16 q8, q8, q9
+.endif
+.if \w > 4
+ vadd.u16 d16, d16, d17
+.endif
+ vpadd.u16 d0, d16, d16
+ vpaddl.u16 d0, d0
+ vmov.u32 r0, d0[0]
+ bx lr
+.endfunc
+.endm
+
+SAD_FUNC 4, 4
+SAD_FUNC 4, 8
+SAD_FUNC 8, 4
+SAD_FUNC 8, 8
+SAD_FUNC 8, 16
+SAD_FUNC 16, 8
+SAD_FUNC 16, 16
+
+SAD_FUNC 4, 4, _aligned, ,:32
+SAD_FUNC 4, 8, _aligned, ,:32
+SAD_FUNC 8, 4, _aligned, ,:64
+SAD_FUNC 8, 8, _aligned, ,:64
+SAD_FUNC 8, 16, _aligned, ,:64
+SAD_FUNC 16, 8, _aligned, ,:128
+SAD_FUNC 16, 16, _aligned, ,:128
+
+// If dual issue is possible, use additional accumulators to avoid
+// stalls from vabal's latency. This only matters for aligned.
+.macro SAD_DUAL_START_8
+ SAD_START_8 ,:64
+ vld1.64 {d3}, [r2,:64], r3
+ vld1.64 {d2}, [r0,:64], r1
+ vabdl.u8 q9, d2, d3
+.endm
+
+.macro SAD_DUAL_8 align:vararg
+ vld1.64 {d1}, [r2,:64], r3
+ vld1.64 {d0}, [r0,:64], r1
+ vabal.u8 q8, d0, d1
+ vld1.64 {d3}, [r2,:64], r3
+ vld1.64 {d2}, [r0,:64], r1
+ vabal.u8 q9, d2, d3
+.endm
+
+.macro SAD_DUAL_START_16
+ SAD_START_16 ,:128
+ vabdl.u8 q10, d4, d6
+ vld1.64 {d2-d3}, [r2,:128], r3
+ vabdl.u8 q11, d5, d7
+ vld1.64 {d0-d1}, [r0,:128], r1
+.endm
+
+.macro SAD_DUAL_16
+ vabal.u8 q8, d0, d2
+ vld1.64 {d6-d7}, [r2,:128], r3
+ vabal.u8 q9, d1, d3
+ vld1.64 {d4-d5}, [r0,:128], r1
+ vabal.u8 q10, d4, d6
+ vld1.64 {d2-d3}, [r2,:128], r3
+ vabal.u8 q11, d5, d7
+ vld1.64 {d0-d1}, [r0,:128], r1
+.endm
+
+.macro SAD_DUAL_END_16
+ vabal.u8 q8, d0, d2
+ vld1.64 {d6-d7}, [r2,:128], r3
+ vabal.u8 q9, d1, d3
+ vld1.64 {d4-d5}, [r0,:128], r1
+ vabal.u8 q10, d4, d6
+ vabal.u8 q11, d5, d7
+.endm
+
+.macro SAD_FUNC_DUAL w, h
+function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
+ SAD_DUAL_START_\w
+.rept \h / 2 - \w / 8
+ SAD_DUAL_\w
+.endr
+
+.if \w > 8
+ SAD_DUAL_END_16
+ vadd.u16 q8, q8, q9
+ vadd.u16 q9, q10, q11
+.endif
+.if \w > 4
+ vadd.u16 q8, q8, q9
+ vadd.u16 d16, d16, d17
+.endif
+ vpadd.u16 d0, d16, d16
+ vpaddl.u16 d0, d0
+ vmov.u32 r0, d0[0]
+ bx lr
+.endfunc
+.endm
+
+SAD_FUNC_DUAL 8, 4
+SAD_FUNC_DUAL 8, 8
+SAD_FUNC_DUAL 8, 16
+SAD_FUNC_DUAL 16, 8
+SAD_FUNC_DUAL 16, 16
+
+
+.macro SAD_X_START_4 x
+ vld1.32 {d0[]}, [r0,:32], lr
+ vld1.32 {d1[]}, [r1], r6
+ vabdl.u8 q8, d1, d0
+ vld1.32 {d2[]}, [r2], r6
+ vabdl.u8 q9, d2, d0
+ vld1.32 {d3[]}, [r3], r6
+ vabdl.u8 q10, d3, d0
+.if \x == 4
+ vld1.32 {d4[]}, [r12], r6
+ vabdl.u8 q11, d4, d0
+.endif
+.endm
+
+.macro SAD_X_4 x
+ vld1.32 {d0[]}, [r0,:32], lr
+ vld1.32 {d1[]}, [r1], r6
+ vabal.u8 q8, d1, d0
+ vld1.32 {d2[]}, [r2], r6
+ vabal.u8 q9, d2, d0
+ vld1.32 {d3[]}, [r3], r6
+ vabal.u8 q10, d3, d0
+.if \x == 4
+ vld1.32 {d4[]}, [r12], r6
+ vabal.u8 q11, d4, d0
+.endif
+.endm
+
+.macro SAD_X_START_8 x
+ vld1.64 {d0}, [r0,:64], lr
+ vld1.64 {d1}, [r1], r6
+ vabdl.u8 q8, d1, d0
+ vld1.64 {d2}, [r2], r6
+ vabdl.u8 q9, d2, d0
+ vld1.64 {d3}, [r3], r6
+ vabdl.u8 q10, d3, d0
+.if \x == 4
+ vld1.64 {d4}, [r12], r6
+ vabdl.u8 q11, d4, d0
+.endif
+.endm
+
+.macro SAD_X_8 x
+ vld1.64 {d0}, [r0,:64], lr
+ vld1.64 {d1}, [r1], r6
+ vabal.u8 q8, d1, d0
+ vld1.64 {d2}, [r2], r6
+ vabal.u8 q9, d2, d0
+ vld1.64 {d3}, [r3], r6
+ vabal.u8 q10, d3, d0
+.if \x == 4
+ vld1.64 {d4}, [r12], r6
+ vabal.u8 q11, d4, d0
+.endif
+.endm
+
+.macro SAD_X_START_16 x
+ vld1.64 {d0-d1}, [r0,:128], lr
+ vld1.64 {d2-d3}, [r1], r6
+ vabdl.u8 q8, d2, d0
+ vabdl.u8 q12, d3, d1
+ vld1.64 {d4-d5}, [r2], r6
+ vabdl.u8 q9, d4, d0
+ vabdl.u8 q13, d5, d1
+ vld1.64 {d6-d7}, [r3], r6
+ vabdl.u8 q10, d6, d0
+ vabdl.u8 q14, d7, d1
+.if \x == 4
+ vld1.64 {d2-d3}, [r12], r6
+ vabdl.u8 q11, d2, d0
+ vabdl.u8 q15, d3, d1
+.endif
+.endm
+
+.macro SAD_X_16 x
+ vld1.64 {d0-d1}, [r0,:128], lr
+ vld1.64 {d2-d3}, [r1], r6
+ vabal.u8 q8, d2, d0
+ vabal.u8 q12, d3, d1
+ vld1.64 {d4-d5}, [r2], r6
+ vabal.u8 q9, d4, d0
+ vabal.u8 q13, d5, d1
+ vld1.64 {d6-d7}, [r3], r6
+ vabal.u8 q10, d6, d0
+ vabal.u8 q14, d7, d1
+.if \x == 4
+ vld1.64 {d2-d3}, [r12], r6
+ vabal.u8 q11, d2, d0
+ vabal.u8 q15, d3, d1
+.endif
+.endm
+
+.macro SAD_X_FUNC x, w, h
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
+ push {r6-r7,lr}
+.if \x == 3
+ ldrd r6, [sp, #12]
+.else
+ ldrd r6, [sp, #16]
+ ldr r12, [sp, #12]
+.endif
+ mov lr, #FENC_STRIDE
+
+ SAD_X_START_\w \x
+.rept \h - 1
+ SAD_X_\w \x
+.endr
+
+// add up the sads
+.if \w > 8
+ vadd.u16 q8, q8, q12
+ vadd.u16 q9, q9, q13
+ vadd.u16 q10, q10, q14
+.if \x == 4
+ vadd.u16 q11, q11, q15
+.endif
+.endif
+.if \w > 4
+ vadd.u16 d16, d16, d17
+ vadd.u16 d18, d18, d19
+ vadd.u16 d20, d20, d21
+.if \x == 4
+ vadd.u16 d22, d22, d23
+.endif
+.endif
+ vpadd.u16 d0, d16, d18
+ vpadd.u16 d1, d20, d22
+ vpaddl.u16 q0, q0
+
+.if \x == 3
+ vst1.32 {d0}, [r7]!
+ vst1.32 {d1[0]}, [r7,:32]
+.else
+ vst1.32 {d0-d1}, [r7]
+.endif
+ pop {r6-r7,pc}
+.endfunc
+.endm
+
+SAD_X_FUNC 3, 4, 4
+SAD_X_FUNC 3, 4, 8
+SAD_X_FUNC 3, 8, 4
+SAD_X_FUNC 3, 8, 8
+SAD_X_FUNC 3, 8, 16
+SAD_X_FUNC 3, 16, 8
+SAD_X_FUNC 3, 16, 16
+
+SAD_X_FUNC 4, 4, 4
+SAD_X_FUNC 4, 4, 8
+SAD_X_FUNC 4, 8, 4
+SAD_X_FUNC 4, 8, 8
+SAD_X_FUNC 4, 8, 16
+SAD_X_FUNC 4, 16, 8
+SAD_X_FUNC 4, 16, 16
+
+
+.macro SSD_START_4
+ vld1.32 {d16[]}, [r0,:32], r1
+ vld1.32 {d17[]}, [r2,:32], r3
+ vsubl.u8 q2, d16, d17
+ vld1.32 {d16[]}, [r0,:32], r1
+ vmull.s16 q0, d4, d4
+ vld1.32 {d17[]}, [r2,:32], r3
+.endm
+
+.macro SSD_4
+ vsubl.u8 q2, d16, d17
+ vld1.32 {d16[]}, [r0,:32], r1
+ vmlal.s16 q0, d4, d4
+ vld1.32 {d17[]}, [r2,:32], r3
+.endm
+
+.macro SSD_END_4
+ vsubl.u8 q2, d16, d17
+ vmlal.s16 q0, d4, d4
+.endm
+
+.macro SSD_START_8
+ vld1.64 {d16}, [r0,:64], r1
+ vld1.64 {d17}, [r2,:64], r3
+ vsubl.u8 q2, d16, d17
+ vld1.64 {d16}, [r0,:64], r1
+ vmull.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d17}, [r2,:64], r3
+.endm
+
+.macro SSD_8
+ vsubl.u8 q2, d16, d17
+ vld1.64 {d16}, [r0,:64], r1
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d17}, [r2,:64], r3
+.endm
+
+.macro SSD_END_8
+ vsubl.u8 q2, d16, d17
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+.endm
+
+.macro SSD_START_16
+ vld1.64 {d16-d17}, [r0,:128], r1
+ vld1.64 {d18-d19}, [r2,:128], r3
+ vsubl.u8 q2, d16, d18
+ vsubl.u8 q3, d17, d19
+ vld1.64 {d16-d17}, [r0,:128], r1
+ vmull.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d18-d19}, [r2,:128], r3
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q0, d7, d7
+.endm
+
+.macro SSD_16
+ vsubl.u8 q2, d16, d18
+ vsubl.u8 q3, d17, d19
+ vld1.64 {d16-d17}, [r0,:128], r1
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d18-d19}, [r2,:128], r3
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q0, d7, d7
+.endm
+
+.macro SSD_END_16
+ vsubl.u8 q2, d16, d18
+ vsubl.u8 q3, d17, d19
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q0, d7, d7
+.endm
+
+.macro SSD_FUNC w h
+function x264_pixel_ssd_\w\()x\h\()_neon
+ SSD_START_\w
+.rept \h-2
+ SSD_\w
+.endr
+ SSD_END_\w
+ vadd.s32 d0, d0, d1
+ vpadd.s32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+.endm
+
+SSD_FUNC 4, 4
+SSD_FUNC 4, 8
+SSD_FUNC 8, 4
+SSD_FUNC 8, 8
+SSD_FUNC 8, 16
+SSD_FUNC 16, 8
+SSD_FUNC 16, 16
+
+
+.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
+ vmull.u8 \qsqr, \dsrc, \dsrc
+ vaddw.u8 q0, q0, \dsrc
+ \vpadal \qsqr_sum, \qsqr_last
+.endm
+
+function x264_pixel_var_8x8_neon
+ vld1.64 {d16}, [r0,:64], r1
+ vmull.u8 q1, d16, d16
+ vmovl.u8 q0, d16
+ vld1.64 {d18}, [r0,:64], r1
+ vmull.u8 q2, d18, d18
+ vaddw.u8 q0, q0, d18
+
+ vld1.64 {d20}, [r0,:64], r1
+ VAR_SQR_SUM q1, q1, q3, d20, vpaddl.u16
+ vld1.64 {d22}, [r0,:64], r1
+ VAR_SQR_SUM q2, q2, q8, d22, vpaddl.u16
+
+ vld1.64 {d24}, [r0,:64], r1
+ VAR_SQR_SUM q1, q3, q9, d24
+ vld1.64 {d26}, [r0,:64], r1
+ VAR_SQR_SUM q2, q8, q10, d26
+ vld1.64 {d24}, [r0,:64], r1
+ VAR_SQR_SUM q1, q9, q14, d24
+ vld1.64 {d26}, [r0,:64], r1
+ VAR_SQR_SUM q2, q10, q15, d26
+ b x264_var_end
+.endfunc
+
+function x264_pixel_var_16x16_neon
+ vld1.64 {d16-d17}, [r0,:128], r1
+ vmull.u8 q12, d16, d16
+ vmovl.u8 q0, d16
+ vmull.u8 q13, d17, d17
+ vaddw.u8 q0, q0, d17
+
+ vld1.64 {d18-d19}, [r0,:128], r1
+ VAR_SQR_SUM q1, q12, q14, d18, vpaddl.u16
+ VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16
+
+ mov ip, #7
+var16_loop:
+ subs ip, ip, #1
+ vld1.64 {d16-d17}, [r0,:128], r1
+ VAR_SQR_SUM q1, q14, q12, d16
+ VAR_SQR_SUM q2, q15, q13, d17
+
+ vld1.64 {d18-d19}, [r0,:128], r1
+ VAR_SQR_SUM q1, q12, q14, d18
+ VAR_SQR_SUM q2, q13, q15, d19
+ bgt var16_loop
+.endfunc
+
+function x264_var_end
+ vpaddl.u16 q8, q14
+ vpaddl.u16 q9, q15
+ vadd.u32 q1, q1, q8
+ vadd.u16 d0, d0, d1
+ vadd.u32 q1, q1, q9
+ vadd.u32 q1, q1, q2
+ vpaddl.u16 d0, d0
+ vadd.u32 d2, d2, d3
+ vpadd.u32 d0, d0, d2
+
+ vmov r0, r1, d0
+ bx lr
+.endfunc
+
+.macro DIFF_SUM diff da db lastdiff
+ vld1.64 {\da}, [r0,:64], r1
+ vld1.64 {\db}, [r2,:64], r3
+.ifnb \lastdiff
+ vadd.s16 q0, q0, \lastdiff
+.endif
+ vsubl.u8 \diff, \da, \db
+.endm
+
+.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16
+ \vmlal \acc, \d0, \d0
+ vmlal.s16 \acc, \d1, \d1
+.endm
+
+function x264_pixel_var2_8x8_neon
+ DIFF_SUM q0, d0, d1
+ DIFF_SUM q8, d16, d17
+ SQR_ACC q1, d0, d1, vmull.s16
+ DIFF_SUM q9, d18, d19, q8
+ SQR_ACC q2, d16, d17, vmull.s16
+.rept 2
+ DIFF_SUM q8, d16, d17, q9
+ SQR_ACC q1, d18, d19
+ DIFF_SUM q9, d18, d19, q8
+ SQR_ACC q2, d16, d17
+.endr
+ DIFF_SUM q8, d16, d17, q9
+ SQR_ACC q1, d18, d19
+ vadd.s16 q0, q0, q8
+ SQR_ACC q2, d16, d17
+
+ ldr ip, [sp]
+ vadd.s16 d0, d0, d1
+ vadd.s32 q1, q1, q2
+ vpaddl.s16 d0, d0
+ vadd.s32 d1, d2, d3
+ vpadd.s32 d0, d0, d1
+
+ vmov.32 r0, r1, d0
+ vst1.32 {d0[1]}, [ip,:32]
+ mul r0, r0, r0
+ sub r0, r1, r0, lsr #6
+ bx lr
+.endfunc
+
+
+.macro LOAD_DIFF_8x4 q0 q1 q2 q3
+ vld1.32 {d1}, [r2], r3
+ vld1.32 {d0}, [r0,:64], r1
+ vsubl.u8 \q0, d0, d1
+ vld1.32 {d3}, [r2], r3
+ vld1.32 {d2}, [r0,:64], r1
+ vsubl.u8 \q1, d2, d3
+ vld1.32 {d5}, [r2], r3
+ vld1.32 {d4}, [r0,:64], r1
+ vsubl.u8 \q2, d4, d5
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d6}, [r0,:64], r1
+ vsubl.u8 \q3, d6, d7
+.endm
+
+function x264_pixel_satd_4x4_neon
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d2[]}, [r0,:32], r1
+ vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d0[1]}, [r0,:32], r1
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d2[1]}, [r0,:32], r1
+ vsubl.u8 q0, d0, d1
+ vsubl.u8 q1, d2, d3
+
+ SUMSUB_AB q2, q3, q0, q1
+ SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7
+ HADAMARD 1, sumsub, q2, q3, q0, q1
+ HADAMARD 2, amax, q0,, q2, q3
+
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+
+function x264_pixel_satd_4x8_neon
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d2[]}, [r0,:32], r1
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d4[]}, [r0,:32], r1
+ vld1.32 {d7[]}, [r2], r3
+ vld1.32 {d6[]}, [r0,:32], r1
+
+ vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d0[1]}, [r0,:32], r1
+ vsubl.u8 q0, d0, d1
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d2[1]}, [r0,:32], r1
+ vsubl.u8 q1, d2, d3
+ vld1.32 {d5[1]}, [r2], r3
+ vld1.32 {d4[1]}, [r0,:32], r1
+ vsubl.u8 q2, d4, d5
+ vld1.32 {d7[1]}, [r2], r3
+ SUMSUB_AB q8, q9, q0, q1
+ vld1.32 {d6[1]}, [r0,:32], r1
+ vsubl.u8 q3, d6, d7
+ SUMSUB_AB q10, q11, q2, q3
+ b x264_satd_4x8_8x4_end_neon
+.endfunc
+
+function x264_pixel_satd_8x4_neon
+ vld1.64 {d1}, [r2], r3
+ vld1.64 {d0}, [r0,:64], r1
+ vsubl.u8 q0, d0, d1
+ vld1.64 {d3}, [r2], r3
+ vld1.64 {d2}, [r0,:64], r1
+ vsubl.u8 q1, d2, d3
+ vld1.64 {d5}, [r2], r3
+ vld1.64 {d4}, [r0,:64], r1
+ vsubl.u8 q2, d4, d5
+ vld1.64 {d7}, [r2], r3
+ SUMSUB_AB q8, q9, q0, q1
+ vld1.64 {d6}, [r0,:64], r1
+ vsubl.u8 q3, d6, d7
+ SUMSUB_AB q10, q11, q2, q3
+.endfunc
+
+function x264_satd_4x8_8x4_end_neon
+ vadd.s16 q0, q8, q10
+ vadd.s16 q1, q9, q11
+ vsub.s16 q2, q8, q10
+ vsub.s16 q3, q9, q11
+
+ vtrn.16 q0, q1
+ vadd.s16 q8, q0, q1
+ vtrn.16 q2, q3
+ vsub.s16 q9, q0, q1
+ vadd.s16 q10, q2, q3
+ vsub.s16 q11, q2, q3
+ vtrn.32 q8, q10
+ vabs.s16 q8, q8
+ vtrn.32 q9, q11
+ vabs.s16 q10, q10
+ vabs.s16 q9, q9
+ vabs.s16 q11, q11
+ vmax.u16 q0, q8, q10
+ vmax.u16 q1, q9, q11
+
+ vadd.u16 q0, q0, q1
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+
+function x264_pixel_satd_8x8_neon
+ mov ip, lr
+
+ bl x264_satd_8x8_neon
+ vadd.u16 q0, q12, q13
+ vadd.u16 q1, q14, q15
+
+ vadd.u16 q0, q0, q1
+ HORIZ_ADD d0, d0, d1
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+
+function x264_pixel_satd_8x16_neon
+ vpush {d8-d11}
+ mov ip, lr
+
+ bl x264_satd_8x8_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+
+ bl x264_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+
+function x264_satd_8x8_neon
+ LOAD_DIFF_8x4 q8, q9, q10, q11
+ vld1.64 {d7}, [r2], r3
+ SUMSUB_AB q0, q1, q8, q9
+ vld1.64 {d6}, [r0,:64], r1
+ vsubl.u8 q12, d6, d7
+ vld1.64 {d17}, [r2], r3
+ SUMSUB_AB q2, q3, q10, q11
+ vld1.64 {d16}, [r0,:64], r1
+ vsubl.u8 q13, d16, d17
+ vld1.64 {d19}, [r2], r3
+ SUMSUB_AB q8, q10, q0, q2
+ vld1.64 {d18}, [r0,:64], r1
+ vsubl.u8 q14, d18, d19
+ vld1.64 {d1}, [r2], r3
+ SUMSUB_AB q9, q11, q1, q3
+ vld1.64 {d0}, [r0,:64], r1
+ vsubl.u8 q15, d0, d1
+.endfunc
+
+// one vertical hadamard pass and two horizontal
+function x264_satd_8x4v_8x8h_neon
+ SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15
+ vtrn.16 q8, q9
+ SUMSUB_AB q12, q14, q0, q2
+ vtrn.16 q10, q11
+ SUMSUB_AB q13, q15, q1, q3
+ SUMSUB_AB q0, q1, q8, q9
+ vtrn.16 q12, q13
+ SUMSUB_AB q2, q3, q10, q11
+ vtrn.16 q14, q15
+ SUMSUB_AB q8, q9, q12, q13
+ vtrn.32 q0, q2
+ SUMSUB_AB q10, q11, q14, q15
+
+ vtrn.32 q1, q3
+ ABS2 q0, q2
+ vtrn.32 q8, q10
+ ABS2 q1, q3
+ vtrn.32 q9, q11
+ ABS2 q8, q10
+ ABS2 q9, q11
+ vmax.s16 q12, q0, q2
+ vmax.s16 q13, q1, q3
+ vmax.s16 q14, q8, q10
+ vmax.s16 q15, q9, q11
+ bx lr
+.endfunc
+
+function x264_pixel_satd_16x8_neon
+ vpush {d8-d11}
+ mov ip, lr
+
+ bl x264_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+
+ bl x264_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+
+function x264_pixel_satd_16x16_neon
+ vpush {d8-d11}
+ mov ip, lr
+
+ bl x264_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+
+ bl x264_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ bl x264_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ bl x264_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
+
+function x264_satd_16x4_neon
+ vld1.64 {d2-d3}, [r2], r3
+ vld1.64 {d0-d1}, [r0,:128], r1
+ vsubl.u8 q8, d0, d2
+ vld1.64 {d6-d7}, [r2], r3
+ vsubl.u8 q12, d1, d3
+ vld1.64 {d4-d5}, [r0,:128], r1
+ vsubl.u8 q9, d4, d6
+ vld1.64 {d2-d3}, [r2], r3
+ vsubl.u8 q13, d5, d7
+ vld1.64 {d0-d1}, [r0,:128], r1
+ vsubl.u8 q10, d0, d2
+ vld1.64 {d6-d7}, [r2], r3
+ vsubl.u8 q14, d1, d3
+ vadd.s16 q0, q8, q9
+ vld1.64 {d4-d5}, [r0,:128], r1
+ vsub.s16 q1, q8, q9
+ vsubl.u8 q11, d4, d6
+ vsubl.u8 q15, d5, d7
+ SUMSUB_AB q2, q3, q10, q11
+ SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3
+ b x264_satd_8x4v_8x8h_neon
+.endfunc
+
+
+function x264_pixel_sa8d_8x8_neon
+ mov ip, lr
+ bl x264_sa8d_8x8_neon
+ vadd.u16 q0, q8, q9
+ HORIZ_ADD d0, d0, d1
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ add r0, r0, #1
+ lsr r0, r0, #1
+ bx lr
+.endfunc
+
+function x264_pixel_sa8d_16x16_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x264_sa8d_8x8_neon
+ vpaddl.u16 q4, q8
+ vpaddl.u16 q5, q9
+ bl x264_sa8d_8x8_neon
+ vpadal.u16 q4, q8
+ vpadal.u16 q5, q9
+ sub r0, r0, r1, lsl #4
+ sub r2, r2, r3, lsl #4
+ add r0, r0, #8
+ add r2, r2, #8
+ bl x264_sa8d_8x8_neon
+ vpadal.u16 q4, q8
+ vpadal.u16 q5, q9
+ bl x264_sa8d_8x8_neon
+ vpaddl.u16 q8, q8
+ vpaddl.u16 q9, q9
+ vadd.u32 q0, q4, q8
+ vadd.u32 q1, q5, q9
+ vadd.u32 q0, q0, q1
+ vadd.u32 d0, d0, d1
+ vpadd.u32 d0, d0, d0
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ add r0, r0, #1
+ lsr r0, r0, #1
+ bx lr
+.endfunc
+
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
+ SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
+ SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm
+
+function x264_sa8d_8x8_neon
+ LOAD_DIFF_8x4 q8, q9, q10, q11
+ vld1.64 {d7}, [r2], r3
+ SUMSUB_AB q0, q1, q8, q9
+ vld1.64 {d6}, [r0,:64], r1
+ vsubl.u8 q12, d6, d7
+ vld1.64 {d17}, [r2], r3
+ SUMSUB_AB q2, q3, q10, q11
+ vld1.64 {d16}, [r0,:64], r1
+ vsubl.u8 q13, d16, d17
+ vld1.64 {d19}, [r2], r3
+ SUMSUB_AB q8, q10, q0, q2
+ vld1.64 {d18}, [r0,:64], r1
+ vsubl.u8 q14, d18, d19
+ vld1.64 {d1}, [r2], r3
+ SUMSUB_AB q9, q11, q1, q3
+ vld1.64 {d0}, [r0,:64], r1
+ vsubl.u8 q15, d0, d1
+
+ HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3
+ SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13
+ SUMSUB_AB q2, q10, q10, q14
+ vtrn.16 q8, q9
+ SUMSUB_AB q3, q11, q11, q15
+ vtrn.16 q0, q1
+ SUMSUB_AB q12, q13, q8, q9
+ vtrn.16 q10, q11
+ SUMSUB_AB q8, q9, q0, q1
+ vtrn.16 q2, q3
+ SUMSUB_AB q14, q15, q10, q11
+ vadd.i16 q10, q2, q3
+ vtrn.32 q12, q14
+ vsub.i16 q11, q2, q3
+ vtrn.32 q13, q15
+ SUMSUB_AB q0, q2, q12, q14
+ vtrn.32 q8, q10
+ SUMSUB_AB q1, q3, q13, q15
+ vtrn.32 q9, q11
+ SUMSUB_AB q12, q14, q8, q10
+ SUMSUB_AB q13, q15, q9, q11
+
+ vswp d1, d24
+ ABS2 q0, q12
+ vswp d3, d26
+ ABS2 q1, q13
+ vswp d5, d28
+ ABS2 q2, q14
+ vswp d7, d30
+ ABS2 q3, q15
+ vmax.s16 q8, q0, q12
+ vmax.s16 q9, q1, q13
+ vmax.s16 q10, q2, q14
+ vmax.s16 q11, q3, q15
+ vadd.i16 q8, q8, q9
+ vadd.i16 q9, q10, q11
+ bx lr
+.endfunc
+
+
+.macro HADAMARD_AC w h
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon
+ vpush {d8-d15}
+ movrel ip, mask_ac4
+ vmov.i8 q4, #0
+ // note: this assumes mask_ac8 is after mask_ac4 (so don't move it)
+ vld1.64 {d12-d15}, [ip,:128]
+ vmov.i8 q5, #0
+
+ mov ip, lr
+ bl x264_hadamard_ac_8x8_neon
+.if \h > 8
+ bl x264_hadamard_ac_8x8_neon
+.endif
+.if \w > 8
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #8
+ bl x264_hadamard_ac_8x8_neon
+.endif
+.if \w * \h == 256
+ sub r0, r0, r1, lsl #4
+ bl x264_hadamard_ac_8x8_neon
+.endif
+
+ vadd.s32 d8, d8, d9
+ vadd.s32 d10, d10, d11
+ vpadd.s32 d0, d8, d10
+ vpop {d8-d15}
+ mov lr, ip
+ vmov r0, r1, d0
+ lsr r0, r0, #1
+ lsr r1, r1, #2
+ bx lr
+.endfunc
+.endm
+
+HADAMARD_AC 8, 8
+HADAMARD_AC 8, 16
+HADAMARD_AC 16, 8
+HADAMARD_AC 16, 16
+
+// q4: satd q5: sa8d q6: mask_ac4 q7: mask_ac8
+function x264_hadamard_ac_8x8_neon
+ vld1.64 {d2}, [r0,:64], r1
+ vld1.64 {d3}, [r0,:64], r1
+ vaddl.u8 q0, d2, d3
+ vld1.64 {d6}, [r0,:64], r1
+ vsubl.u8 q1, d2, d3
+ vld1.64 {d7}, [r0,:64], r1
+ vaddl.u8 q2, d6, d7
+ vld1.64 {d18}, [r0,:64], r1
+ vsubl.u8 q3, d6, d7
+ vld1.64 {d19}, [r0,:64], r1
+ vaddl.u8 q8, d18, d19
+ vld1.64 {d22}, [r0,:64], r1
+ vsubl.u8 q9, d18, d19
+ vld1.64 {d23}, [r0,:64], r1
+
+ SUMSUB_ABCD q12, q14, q13, q15, q0, q2, q1, q3
+ vaddl.u8 q10, d22, d23
+ vsubl.u8 q11, d22, d23
+ vtrn.16 q12, q13
+ SUMSUB_ABCD q0, q2, q1, q3, q8, q10, q9, q11
+
+ vtrn.16 q14, q15
+ SUMSUB_AB q8, q9, q12, q13
+ vtrn.16 q0, q1
+ SUMSUB_AB q10, q11, q14, q15
+ vtrn.16 q2, q3
+ SUMSUB_AB q12, q13, q0, q1
+ vtrn.32 q8, q10
+ SUMSUB_AB q14, q15, q2, q3
+ vtrn.32 q9, q11
+ SUMSUB_AB q0, q2, q8, q10
+ vtrn.32 q12, q14
+ SUMSUB_AB q1, q3, q9, q11
+ vtrn.32 q13, q15
+ SUMSUB_ABCD q8, q10, q9, q11, q12, q14, q13, q15
+
+ vabs.s16 q12, q0
+ vabs.s16 q13, q8
+ vabs.s16 q15, q1
+ vadd.s16 q12, q12, q13
+ vabs.s16 q14, q2
+ vand.s16 q12, q12, q6
+ vabs.s16 q13, q3
+ vadd.s16 q12, q12, q15
+ vabs.s16 q15, q9
+ vadd.s16 q12, q12, q14
+ vabs.s16 q14, q10
+ vadd.s16 q12, q12, q13
+ vabs.s16 q13, q11
+ vadd.s16 q12, q12, q15
+ vsub.s16 q15, q11, q3
+ vadd.s16 q12, q12, q14
+ vadd.s16 q14, q11, q3
+ vadd.s16 q12, q12, q13
+ vsub.s16 q13, q10, q2
+ vadd.s16 q2, q10, q2
+ vpadal.u16 q4, q12
+
+ SUMSUB_AB q10, q11, q9, q1
+ SUMSUB_AB q9, q8, q0, q8
+ vswp d29, d30
+ vabs.s16 q14, q14
+ vabs.s16 q15, q15
+ vswp d5, d26
+ vabs.s16 q2, q2
+ vabs.s16 q13, q13
+ vswp d21, d22
+ vabs.s16 q10, q10
+ vabs.s16 q11, q11
+ vmax.s16 q3, q14, q15
+ vmax.s16 q2, q2, q13
+ vmax.s16 q1, q10, q11
+ vswp d19, d16
+ SUMSUB_AB q14, q15, q9, q8
+
+ vadd.s16 q2, q2, q3
+ vadd.s16 q2, q2, q1
+ vand q14, q14, q7
+ vadd.s16 q2, q2, q2
+ vabs.s16 q15, q15
+ vabs.s16 q14, q14
+ vadd.s16 q2, q2, q15
+ vadd.s16 q2, q2, q14
+ vpadal.u16 q5, q2
+ bx lr
+.endfunc
+
+
+.macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext
+ vld1.64 {\db}, [r2], r3
+ vmull.u8 \ssa, \da, \da
+ vmull.u8 \s12, \da, \db
+.if \n == 1
+ vpaddl.u16 q2, \lastssa
+ vpaddl.u16 q3, \lasts12
+ vaddl.u8 q0, d0, \da
+.else
+ vpadal.u16 q2, \lastssa
+ vpadal.u16 q3, \lasts12
+ vaddw.u8 q0, q0, \da
+.endif
+ vpadal.u16 q2, \lastssb
+.if \n < 3
+ vld1.64 {\dnext}, [r0], r1
+.endif
+.if \n == 1
+ vaddl.u8 q1, d2, \db
+.else
+ vaddw.u8 q1, q1, \db
+.endif
+ vmull.u8 \ssb, \db, \db
+.endm
+
+function x264_pixel_ssim_4x4x2_core_neon
+ ldr ip, [sp]
+ vld1.64 {d0}, [r0], r1
+ vld1.64 {d2}, [r2], r3
+ vmull.u8 q2, d0, d0
+ vmull.u8 q3, d0, d2
+ vld1.64 {d28}, [r0], r1
+ vmull.u8 q15, d2, d2
+
+ SSIM_ITER 1, q8, q9, q14, q2, q3, q15, d28, d29, d26
+ SSIM_ITER 2, q10,q11,q13, q8, q9, q14, d26, d27, d28
+ SSIM_ITER 3, q8, q9, q15, q10,q11,q13, d28, d29
+
+ vpadal.u16 q2, q8
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
+ vpadal.u16 q2, q15
+ vpadal.u16 q3, q9
+
+ vpadd.u32 d0, d0, d1
+ vpadd.u32 d1, d2, d3
+ vpadd.u32 d2, d4, d5
+ vpadd.u32 d3, d6, d7
+
+ vst4.32 {d0-d3}, [ip]
+ bx lr
+.endfunc
+
+// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
+function x264_pixel_ssim_end4_neon
+ vld1.32 {d16-d19}, [r0,:128]!
+ vld1.32 {d20-d23}, [r1,:128]!
+ vadd.s32 q0, q8, q10
+ vadd.s32 q1, q9, q11
+ vld1.32 {d24-d27}, [r0,:128]!
+ vadd.s32 q0, q0, q1
+ vld1.32 {d28-d31}, [r1,:128]!
+ vadd.s32 q2, q12, q14
+ vadd.s32 q3, q13, q15
+ vld1.32 {d16-d17}, [r0,:128]
+ vadd.s32 q1, q1, q2
+ vld1.32 {d18-d19}, [r1,:128]
+ vadd.s32 q8, q8, q9
+ vadd.s32 q2, q2, q3
+ vadd.s32 q3, q3, q8
+
+ vtrn.32 q0, q1
+ vtrn.32 q2, q3
+ vswp d1, d4
+ vswp d3, d6
+
+// s1=q0, s2=q1, ss=q2, s12=q3
+ vmul.s32 q8, q0, q1 // s1*s2
+ vmul.s32 q0, q0, q0
+ vmla.s32 q0, q1, q1 // s1*s1 + s2*s2
+
+ vshl.s32 q3, q3, #7
+ vshl.s32 q2, q2, #6
+ vadd.s32 q1, q8, q8
+
+ mov r3, #416 // ssim_c1 = .01*.01*255*255*64
+ movconst ip, 235963 // ssim_c2 = .03*.03*255*255*64*63
+ vdup.32 q14, r3
+ vdup.32 q15, ip
+
+ vsub.s32 q2, q2, q0 // vars
+ vsub.s32 q3, q3, q1 // covar*2
+ vadd.s32 q0, q0, q14
+ vadd.s32 q2, q2, q15
+ vadd.s32 q1, q1, q14
+ vadd.s32 q3, q3, q15
+
+ vcvt.f32.s32 q0, q0
+ vcvt.f32.s32 q2, q2
+ vcvt.f32.s32 q1, q1
+ vcvt.f32.s32 q3, q3
+
+ vmul.f32 q0, q0, q2
+ vmul.f32 q1, q1, q3
+
+ cmp r2, #4
+
+ vdiv.f32 s0, s4, s0
+ vdiv.f32 s1, s5, s1
+ vdiv.f32 s2, s6, s2
+ vdiv.f32 s3, s7, s3
+
+ beq ssim_skip
+ movrel r3, mask_ff
+ sub r3, r3, r2, lsl #2
+ vld1.64 {d6-d7}, [r3]
+ vand q0, q0, q3
+ssim_skip:
+ vadd.f32 d0, d0, d1
+ vpadd.f32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+.endfunc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
new file mode 100644
index 0000000..0683520
--- /dev/null
+++ b/common/arm/pixel.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_PIXEL_H
+#define X264_ARM_PIXEL_H
+
+#define DECL_PIXELS( ret, name, suffix, args ) \
+ ret x264_pixel_##name##_16x16_##suffix args;\
+ ret x264_pixel_##name##_16x8_##suffix args;\
+ ret x264_pixel_##name##_8x16_##suffix args;\
+ ret x264_pixel_##name##_8x8_##suffix args;\
+ ret x264_pixel_##name##_8x4_##suffix args;\
+ ret x264_pixel_##name##_4x8_##suffix args;\
+ ret x264_pixel_##name##_4x4_##suffix args;\
+
+#define DECL_X1( name, suffix ) \
+ DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )
+
+#define DECL_X4( name, suffix ) \
+ DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )
+
+int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int );
+
+DECL_X1( sad, neon )
+DECL_X1( sad_aligned, neon )
+DECL_X1( sad_aligned, neon_dual )
+DECL_X4( sad, neon )
+DECL_X1( satd, neon )
+DECL_X1( ssd, neon )
+
+int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
+
+uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
+int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
+
+uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int );
+
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
+ const uint8_t *, int,
+ int sums[2][4]);
+float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
+
+#endif
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
new file mode 100644
index 0000000..9a91478
--- /dev/null
+++ b/common/arm/predict-a.S
@@ -0,0 +1,270 @@
+/*****************************************************************************
+ * predict_armv6.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+
+pw_76543210: .short 7,6,5,4,3,2,1,0
+
+.text
+
+// because gcc doesn't believe in using the free shift in add
+function x264_predict_4x4_h_armv6
+ ldrb r1, [r0, #0*FDEC_STRIDE-1]
+ ldrb r2, [r0, #1*FDEC_STRIDE-1]
+ ldrb r3, [r0, #2*FDEC_STRIDE-1]
+ ldrb ip, [r0, #3*FDEC_STRIDE-1]
+ add r1, r1, r1, lsl #8
+ add r2, r2, r2, lsl #8
+ add r3, r3, r3, lsl #8
+ add ip, ip, ip, lsl #8
+ add r1, r1, r1, lsl #16
+ str r1, [r0, #0*FDEC_STRIDE]
+ add r2, r2, r2, lsl #16
+ str r2, [r0, #1*FDEC_STRIDE]
+ add r3, r3, r3, lsl #16
+ str r3, [r0, #2*FDEC_STRIDE]
+ add ip, ip, ip, lsl #16
+ str ip, [r0, #3*FDEC_STRIDE]
+ bx lr
+.endfunc
+
+function x264_predict_4x4_dc_armv6
+ mov ip, #0
+ ldr r1, [r0, #-FDEC_STRIDE]
+ ldrb r2, [r0, #0*FDEC_STRIDE-1]
+ ldrb r3, [r0, #1*FDEC_STRIDE-1]
+ usad8 r1, r1, ip
+ add r2, r2, #4
+ ldrb ip, [r0, #2*FDEC_STRIDE-1]
+ add r2, r2, r3
+ ldrb r3, [r0, #3*FDEC_STRIDE-1]
+ add r2, r2, ip
+ add r2, r2, r3
+ add r1, r1, r2
+ lsr r1, r1, #3
+ add r1, r1, r1, lsl #8
+ add r1, r1, r1, lsl #16
+ str r1, [r0, #0*FDEC_STRIDE]
+ str r1, [r0, #1*FDEC_STRIDE]
+ str r1, [r0, #2*FDEC_STRIDE]
+ str r1, [r0, #3*FDEC_STRIDE]
+ bx lr
+.endfunc
+
+// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
+.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
+ uhadd8 \a1, \a1, \c1
+ uhadd8 \a2, \a2, \c2
+ uhadd8 \c1, \a1, \b1
+ uhadd8 \c2, \a2, \b2
+ eor \a1, \a1, \b1
+ eor \a2, \a2, \b2
+ and \a1, \a1, \pb_1
+ and \a2, \a2, \pb_1
+ uadd8 \a1, \a1, \c1
+ uadd8 \a2, \a2, \c2
+.endm
+
+function x264_predict_4x4_ddr_armv6
+ ldr r1, [r0, # -FDEC_STRIDE]
+ ldrb r2, [r0, # -FDEC_STRIDE-1]
+ ldrb r3, [r0, #0*FDEC_STRIDE-1]
+ push {r4-r6,lr}
+ add r2, r2, r1, lsl #8
+ ldrb r4, [r0, #1*FDEC_STRIDE-1]
+ add r3, r3, r2, lsl #8
+ ldrb r5, [r0, #2*FDEC_STRIDE-1]
+ ldrb r6, [r0, #3*FDEC_STRIDE-1]
+ add r4, r4, r3, lsl #8
+ add r5, r5, r4, lsl #8
+ add r6, r6, r5, lsl #8
+ ldr ip, =0x01010101
+ PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
+ str r1, [r0, #0*FDEC_STRIDE]
+ lsl r2, r1, #8
+ lsl r3, r1, #16
+ lsl r4, r4, #8
+ lsl r5, r1, #24
+ add r2, r2, r4, lsr #24
+ str r2, [r0, #1*FDEC_STRIDE]
+ add r3, r3, r4, lsr #16
+ str r3, [r0, #2*FDEC_STRIDE]
+ add r5, r5, r4, lsr #8
+ str r5, [r0, #3*FDEC_STRIDE]
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_predict_4x4_ddl_neon
+ sub r0, #FDEC_STRIDE
+ mov ip, #FDEC_STRIDE
+ vld1.64 {d0}, [r0], ip
+ vdup.8 d3, d0[7]
+ vext.8 d1, d0, d0, #1
+ vext.8 d2, d0, d3, #2
+ vhadd.u8 d0, d0, d2
+ vrhadd.u8 d0, d0, d1
+ vst1.32 {d0[0]}, [r0,:32], ip
+ vext.8 d1, d0, d0, #1
+ vext.8 d2, d0, d0, #2
+ vst1.32 {d1[0]}, [r0,:32], ip
+ vext.8 d3, d0, d0, #3
+ vst1.32 {d2[0]}, [r0,:32], ip
+ vst1.32 {d3[0]}, [r0,:32], ip
+ bx lr
+.endfunc
+
+function x264_predict_8x8_dc_neon
+ mov ip, #0
+ ldrd r2, [r1, #8]
+ push {r4-r5,lr}
+ ldrd r4, [r1, #16]
+ lsl r3, r3, #8
+ ldrb lr, [r1, #7]
+ usad8 r2, r2, ip
+ usad8 r3, r3, ip
+ usada8 r2, r4, ip, r2
+ add lr, lr, #8
+ usada8 r3, r5, ip, r3
+ add r2, r2, lr
+ mov ip, #FDEC_STRIDE
+ add r2, r2, r3
+ lsr r2, r2, #4
+
+ vdup.8 d0, r2
+.rept 8
+ vst1.64 {d0}, [r0,:64], ip
+.endr
+ pop {r4-r5,pc}
+.endfunc
+
+
+function x264_predict_8x8_h_neon
+ add r1, r1, #7
+ mov ip, #FDEC_STRIDE
+ vld1.64 {d16}, [r1]
+ vdup.8 d0, d16[7]
+ vdup.8 d1, d16[6]
+ vst1.64 {d0}, [r0,:64], ip
+ vdup.8 d2, d16[5]
+ vst1.64 {d1}, [r0,:64], ip
+ vdup.8 d3, d16[4]
+ vst1.64 {d2}, [r0,:64], ip
+ vdup.8 d4, d16[3]
+ vst1.64 {d3}, [r0,:64], ip
+ vdup.8 d5, d16[2]
+ vst1.64 {d4}, [r0,:64], ip
+ vdup.8 d6, d16[1]
+ vst1.64 {d5}, [r0,:64], ip
+ vdup.8 d7, d16[0]
+ vst1.64 {d6}, [r0,:64], ip
+ vst1.64 {d7}, [r0,:64], ip
+ bx lr
+.endfunc
+
+function x264_predict_8x8c_h_neon
+ sub r1, r0, #1
+ mov ip, #FDEC_STRIDE
+.rept 4
+ vld1.8 {d0[]}, [r1], ip
+ vld1.8 {d2[]}, [r1], ip
+ vst1.64 {d0}, [r0,:64], ip
+ vst1.64 {d2}, [r0,:64], ip
+.endr
+ bx lr
+.endfunc
+
+function x264_predict_8x8c_v_neon
+ sub r0, r0, #FDEC_STRIDE
+ mov ip, #FDEC_STRIDE
+ vld1.64 {d0}, [r0,:64], ip
+.rept 8
+ vst1.64 {d0}, [r0,:64], ip
+.endr
+ bx lr
+.endfunc
+
+
+function x264_predict_16x16_dc_neon
+ sub r3, r0, #FDEC_STRIDE
+ sub r0, r0, #1
+ vld1.64 {d0-d1}, [r3,:128]
+ ldrb ip, [r0], #FDEC_STRIDE
+ vaddl.u8 q0, d0, d1
+ ldrb r1, [r0], #FDEC_STRIDE
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0, d0
+ vpadd.u16 d0, d0, d0
+.rept 4
+ ldrb r2, [r0], #FDEC_STRIDE
+ add ip, ip, r1
+ ldrb r3, [r0], #FDEC_STRIDE
+ add ip, ip, r2
+ ldrb r1, [r0], #FDEC_STRIDE
+ add ip, ip, r3
+.endr
+ ldrb r2, [r0], #FDEC_STRIDE
+ add ip, ip, r1
+ ldrb r3, [r0], #FDEC_STRIDE
+ add ip, ip, r2
+
+ sub r0, r0, #FDEC_STRIDE*16
+ add ip, ip, r3
+ vdup.16 d1, ip
+ vadd.u16 d0, d0, d1
+ mov ip, #FDEC_STRIDE
+ add r0, r0, #1
+ vrshr.u16 d0, d0, #5
+ vdup.8 q0, d0[0]
+.rept 16
+ vst1.64 {d0-d1}, [r0,:64], ip
+.endr
+ bx lr
+.endfunc
+
+function x264_predict_16x16_h_neon
+ sub r1, r0, #1
+ mov ip, #FDEC_STRIDE
+.rept 8
+ vld1.8 {d0[]}, [r1], ip
+ vmov d1, d0
+ vld1.8 {d2[]}, [r1], ip
+ vmov d3, d2
+ vst1.64 {d0-d1}, [r0,:128], ip
+ vst1.64 {d2-d3}, [r0,:128], ip
+.endr
+ bx lr
+.endfunc
+
+function x264_predict_16x16_v_neon
+ sub r0, r0, #FDEC_STRIDE
+ mov ip, #FDEC_STRIDE
+ vld1.64 {d0-d1}, [r0,:128], ip
+.rept 16
+ vst1.64 {d0-d1}, [r0,:128], ip
+.endr
+ bx lr
+.endfunc
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
new file mode 100644
index 0000000..1f2cd52
--- /dev/null
+++ b/common/arm/predict-c.c
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[33] );
+
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
+{
+ if (!(cpu&X264_CPU_ARMV6))
+ return;
+
+ pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6;
+ pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
+
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+}
+
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
+}
+
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+ pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
+ pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
+}
+
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+ pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon;
+ pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon;
+ pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon;
+}
diff --git a/common/ppc/predict.h b/common/arm/predict.h
similarity index 69%
copy from common/ppc/predict.h
copy to common/arm/predict.h
index 29488aa..fe5ccda 100644
--- a/common/ppc/predict.h
+++ b/common/arm/predict.h
@@ -1,7 +1,9 @@
/*****************************************************************************
* predict.h: h264 encoder library
*****************************************************************************
- * Copyright (C) 2007 Guillaume Poirier <gpoirier at mplayerhq.hu>
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,10 +20,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifndef X264_PPC_PREDICT_H
-#define X264_PPC_PREDICT_H
+#ifndef X264_ARM_PREDICT_H
+#define X264_ARM_PREDICT_H
-void x264_predict_16x16_init_altivec ( x264_predict_t pf[7] );
-void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] );
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
-#endif /* X264_PPC_PREDICT_H */
+#endif
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
new file mode 100644
index 0000000..0b49eb4
--- /dev/null
+++ b/common/arm/quant-a.S
@@ -0,0 +1,352 @@
+/*****************************************************************************
+ * quant.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+pmovmskb_byte:
+.byte 1,2,4,8,16,32,64,128
+.byte 1,2,4,8,16,32,64,128
+
+.text
+
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+ vadd.u16 q8, q8, \bias0
+ vadd.u16 q9, q9, \bias1
+.ifc \load_mf, yes
+ vld1.64 {\mf0-\mf3}, [r1,:128]!
+.endif
+ vmull.u16 q10, d16, \mf0
+ vmull.u16 q11, d17, \mf1
+ vmull.u16 q12, d18, \mf2
+ vmull.u16 q13, d19, \mf3
+ vshr.s16 q14, q14, #15
+ vshr.s16 q15, q15, #15
+ vshrn.u32 d16, q10, #16
+ vshrn.u32 d17, q11, #16
+ vshrn.u32 d18, q12, #16
+ vshrn.u32 d19, q13, #16
+ veor q8, q8, q14
+ veor q9, q9, q15
+ vsub.s16 q8, q8, q14
+ vsub.s16 q9, q9, q15
+ vorr \bias0, q8, q9
+ vst1.64 {d16-d19}, [r0,:128]!
+.endm
+
+.macro QUANT_END d
+ vmov r2, r3, \d
+ orrs r0, r2, r3
+ movne r0, #1
+ bx lr
+.endm
+
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
+function x264_quant_2x2_dc_neon
+ vld1.64 {d0}, [r0,:64]
+ vabs.s16 d3, d0
+ vdup.16 d2, r2
+ vdup.16 d1, r1
+ vadd.u16 d3, d3, d2
+ vmull.u16 q3, d3, d1
+ vshr.s16 d0, d0, #15
+ vshrn.u32 d3, q3, #16
+ veor d3, d3, d0
+ vsub.s16 d3, d3, d0
+ vst1.64 {d3}, [r0,:64]
+ QUANT_END d3
+.endfunc
+
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
+function x264_quant_4x4_dc_neon
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ vdup.16 q0, r2
+ vdup.16 q2, r1
+ QUANT_TWO q0, q0, d4, d5, d4, d5
+ vorr d0, d0, d1
+ QUANT_END d0
+.endfunc
+
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4_neon
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ vld1.64 {d0-d3}, [r2,:128]
+ vld1.64 {d4-d7}, [r1,:128]
+ QUANT_TWO q0, q1, d4, d5, d6, d7
+ vorr d0, d0, d1
+ QUANT_END d0
+.endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+function x264_quant_8x8_neon
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ vld1.64 {d0-d3}, [r2,:128]!
+ vld1.64 {d4-d7}, [r1,:128]!
+ QUANT_TWO q0, q1, d4, d5, d6, d7
+.rept 3
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ vld1.64 {d2-d5}, [r2,:128]!
+ QUANT_TWO q1, q2, d4, d5, d6, d7, yes
+ vorr q0, q0, q1
+.endr
+ vorr d0, d0, d1
+ QUANT_END d0
+.endfunc
+
+.macro DEQUANT_START mf_size offset dc=no
+ mov r3, #0x2b
+ mul r3, r3, r2
+ lsr r3, r3, #8 // i_qbits = i_qp / 6
+ add ip, r3, r3, lsl #1
+ sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
+.ifc \dc,no
+ add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
+.else
+ ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
+.endif
+ subs r3, r3, #\offset // 6 for 8x8
+.endm
+
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+.macro DEQUANT size bits
+function x264_dequant_\size\()_neon
+ DEQUANT_START \bits+2, \bits
+.ifc \size, 8x8
+ mov r2, #4
+.endif
+ blt dequant_\size\()_rshift
+
+ vdup.16 q15, r3
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+ subs r2, r2, #1
+.endif
+ vld1.32 {d16-d17}, [r1,:128]!
+ vld1.32 {d18-d19}, [r1,:128]!
+ vmovn.s32 d4, q8
+ vld1.32 {d20-d21}, [r1,:128]!
+ vmovn.s32 d5, q9
+ vld1.32 {d22-d23}, [r1,:128]!
+ vmovn.s32 d6, q10
+ vld1.16 {d0-d3}, [r0,:128]
+ vmovn.s32 d7, q11
+ vmul.s16 q0, q0, q2
+ vmul.s16 q1, q1, q3
+ vshl.s16 q0, q0, q15
+ vshl.s16 q1, q1, q15
+ vst1.16 {d0-d3}, [r0,:128]!
+.ifc \size, 8x8
+ bgt dequant_\size\()_lshift_loop
+.endif
+ bx lr
+
+dequant_\size\()_rshift:
+ vdup.32 q15, r3
+ rsb r3, r3, #0
+ mov ip, #1
+ sub r3, r3, #1
+ lsl ip, ip, r3
+
+.ifc \size, 8x8
+dequant_\size\()_rshift_loop:
+ subs r2, r2, #1
+.endif
+ vdup.32 q10, ip
+ vld1.32 {d16-d17}, [r1,:128]!
+ vdup.32 q11, ip
+ vld1.32 {d18-d19}, [r1,:128]!
+ vmovn.s32 d4, q8
+ vld1.32 {d16-d17}, [r1,:128]!
+ vmovn.s32 d5, q9
+ vld1.32 {d18-d19}, [r1,:128]!
+ vmovn.s32 d6, q8
+ vld1.16 {d0-d3}, [r0,:128]
+ vmovn.s32 d7, q9
+ vdup.32 q12, ip
+ vdup.32 q13, ip
+
+ vmlal.s16 q10, d0, d4
+ vmlal.s16 q11, d1, d5
+ vmlal.s16 q12, d2, d6
+ vmlal.s16 q13, d3, d7
+ vshl.s32 q10, q10, q15
+ vshl.s32 q11, q11, q15
+ vshl.s32 q12, q12, q15
+ vshl.s32 q13, q13, q15
+
+ vmovn.s32 d0, q10
+ vmovn.s32 d1, q11
+ vmovn.s32 d2, q12
+ vmovn.s32 d3, q13
+ vst1.16 {d0-d3}, [r0,:128]!
+.ifc \size, 8x8
+ bgt dequant_\size\()_rshift_loop
+.endif
+ bx lr
+.endfunc
+.endm
+
+DEQUANT 4x4, 4
+DEQUANT 8x8, 6
+
+// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+function x264_dequant_4x4_dc_neon
+ DEQUANT_START 6, 6, yes
+ blt dequant_4x4_dc_rshift
+
+ lsl r1, r1, r3
+ vdup.16 q2, r1
+ vld1.16 {d0-d3}, [r0,:128]
+ vdup.16 q15, r3
+
+ vmul.s16 q0, q0, q2
+ vmul.s16 q1, q1, q2
+ vst1.16 {d0-d3}, [r0,:128]
+ bx lr
+
+dequant_4x4_dc_rshift:
+ vdup.16 d4, r1
+ vdup.32 q15, r3
+ rsb r3, r3, #0
+ mov ip, #1
+ sub r3, r3, #1
+ lsl ip, ip, r3
+
+ vdup.32 q10, ip
+ vdup.32 q11, ip
+ vld1.16 {d0-d3}, [r0,:128]
+ vdup.32 q12, ip
+ vdup.32 q13, ip
+
+ vmlal.s16 q10, d0, d4
+ vmlal.s16 q11, d1, d4
+ vmlal.s16 q12, d2, d4
+ vmlal.s16 q13, d3, d4
+ vshl.s32 q10, q10, q15
+ vshl.s32 q11, q11, q15
+ vshl.s32 q12, q12, q15
+ vshl.s32 q13, q13, q15
+
+ vmovn.s32 d0, q10
+ vmovn.s32 d1, q11
+ vmovn.s32 d2, q12
+ vmovn.s32 d3, q13
+ vst1.16 {d0-d3}, [r0,:128]
+ bx lr
+.endfunc
+
+
+// int coeff_last( int16_t *l )
+function x264_coeff_last4_arm
+ ldrd r2, [r0]
+ subs r0, r3, #0
+ movne r0, #2
+ movne r2, r3
+ lsrs r2, r2, #16
+ addne r0, r0, #1
+ bx lr
+.endfunc
+
+.macro COEFF_LAST_1x size
+function x264_coeff_last\size\()_neon
+.if \size == 15
+ sub r0, r0, #2
+ vld1.64 {d0-d3}, [r0]
+.else
+ vld1.64 {d0-d3}, [r0,:128]
+.endif
+ vtst.16 q0, q0
+ vtst.16 q1, q1
+ vshrn.u16 d0, q0, #8
+ vshrn.u16 d1, q1, #8
+ vshrn.u16 d0, q0, #4
+ vclz.i32 d0, d0
+ mov ip, #7
+ mov r3, #\size - 9
+ vmov r0, r1, d0
+
+ subs r1, ip, r1, lsr #2
+ addge r0, r1, #\size - 8
+ sublts r0, r3, r0, lsr #2
+ movlt r0, #0
+ bx lr
+.endfunc
+.endm
+
+COEFF_LAST_1x 15
+COEFF_LAST_1x 16
+
+function x264_coeff_last64_neon
+ vld1.64 {d16-d19}, [r0,:128]!
+ vqmovn.u16 d16, q8
+ vqmovn.u16 d17, q9
+ vld1.64 {d20-d23}, [r0,:128]!
+ vqmovn.u16 d18, q10
+ vqmovn.u16 d19, q11
+ vld1.64 {d24-d27}, [r0,:128]!
+ vqmovn.u16 d20, q12
+ vqmovn.u16 d21, q13
+ vld1.64 {d28-d31}, [r0,:128]!
+ vqmovn.u16 d22, q14
+ vqmovn.u16 d23, q15
+
+ movrel r1, pmovmskb_byte
+ vld1.64 {d0-d1}, [r1,:128]
+
+ vtst.8 q8, q8
+ vtst.8 q9, q9
+ vtst.8 q10, q10
+ vtst.8 q11, q11
+
+ vand q8, q8, q0
+ vand q9, q9, q0
+ vand q10, q10, q0
+ vand q11, q11, q0
+
+ vpadd.u8 d0, d16, d17
+ vpadd.u8 d1, d18, d19
+ vpadd.u8 d2, d20, d21
+ vpadd.u8 d3, d22, d23
+ vpadd.u8 d0, d0, d1
+ vpadd.u8 d1, d2, d3
+ vpadd.u8 d0, d0, d1
+ vclz.i32 d0, d0
+ mov ip, #31
+ vmov r0, r1, d0
+
+ subs r1, ip, r1
+ addge r0, r1, #32
+ sublts r0, ip, r0
+ movlt r0, #0
+ bx lr
+.endfunc
diff --git a/matroska.h b/common/arm/quant.h
similarity index 50%
copy from matroska.h
copy to common/arm/quant.h
index be6f530..dcfed63 100644
--- a/matroska.h
+++ b/common/arm/quant.h
@@ -1,7 +1,9 @@
/*****************************************************************************
- * matroska.h:
+ * quant.h: h264 encoder library
*****************************************************************************
- * Copyright (C) 2005 Mike Matsnev
+ * Copyright (C) 2005-2008 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,24 +20,23 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifndef X264_MATROSKA_H
-#define X264_MATROSKA_H
+#ifndef X264_ARM_QUANT_H
+#define X264_ARM_QUANT_H
-typedef struct mk_Writer mk_Writer;
+int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
-mk_Writer *mk_createWriter( const char *filename );
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
-int mk_writeHeader( mk_Writer *w, const char *writingApp,
- const char *codecID,
- const void *codecPrivate, unsigned codecPrivateSize,
- int64_t default_frame_duration,
- int64_t timescale,
- unsigned width, unsigned height,
- unsigned d_width, unsigned d_height );
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-int mk_startFrame( mk_Writer *w );
-int mk_addFrameData( mk_Writer *w, const void *data, unsigned size );
-int mk_setFrameFlags( mk_Writer *w, int64_t timestamp, int keyframe );
-int mk_close( mk_Writer *w );
+int x264_coeff_last4_arm( int16_t * );
+int x264_coeff_last15_neon( int16_t * );
+int x264_coeff_last16_neon( int16_t * );
+int x264_coeff_last64_neon( int16_t * );
#endif
diff --git a/common/bs.h b/common/bs.h
index eafa8f8..0773de6 100644
--- a/common/bs.h
+++ b/common/bs.h
@@ -73,24 +73,37 @@ extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
- int offset = ((intptr_t)p_data & (WORD_SIZE-1));
+ int offset = ((intptr_t)p_data & 3);
s->p = s->p_start = (uint8_t*)p_data - offset;
s->p_end = (uint8_t*)p_data + i_data;
- s->i_left = offset ? 8*offset : (WORD_SIZE*8);
- s->cur_bits = endian_fix( *(intptr_t*)s->p );
+ s->i_left = (WORD_SIZE - offset)*8;
+ s->cur_bits = endian_fix32( M32(s->p) );
+ s->cur_bits >>= (4-offset)*8;
}
static inline int bs_pos( bs_t *s )
{
return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
}
-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32/64-bit aligned. */
+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
static inline void bs_flush( bs_t *s )
{
- *(intptr_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
s->p += WORD_SIZE - s->i_left / 8;
s->i_left = WORD_SIZE*8;
}
+/* The inverse of bs_flush: prepare the bitstream to be written to again. */
+static inline void bs_realign( bs_t *s )
+{
+ int offset = ((intptr_t)s->p & 3);
+ if( offset )
+ {
+ s->p = (uint8_t*)s->p - offset;
+ s->i_left = (WORD_SIZE - offset)*8;
+ s->cur_bits = endian_fix32( M32(s->p) );
+ s->cur_bits >>= (4-offset)*8;
+ }
+}
static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
{
@@ -101,9 +114,9 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
if( s->i_left <= 32 )
{
#ifdef WORDS_BIGENDIAN
- *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
#else
- *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
#endif
s->i_left += 32;
s->p += 4;
@@ -120,7 +133,7 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
{
i_count -= s->i_left;
s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
- *(uint32_t*)s->p = endian_fix( s->cur_bits );
+ M32( s->p ) = endian_fix( s->cur_bits );
s->p += 4;
s->cur_bits = i_bits;
s->i_left = 32 - i_count;
@@ -143,7 +156,7 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
s->i_left--;
if( s->i_left == WORD_SIZE*8-32 )
{
- *(uint32_t*)s->p = endian_fix32( s->cur_bits );
+ M32( s->p ) = endian_fix32( s->cur_bits );
s->p += 4;
s->i_left = WORD_SIZE*8;
}
@@ -151,23 +164,19 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
static inline void bs_align_0( bs_t *s )
{
- if( s->i_left&7 )
- {
- s->cur_bits <<= s->i_left&7;
- s->i_left &= ~7;
- }
+ bs_write( s, s->i_left&7, 0 );
bs_flush( s );
}
static inline void bs_align_1( bs_t *s )
{
- if( s->i_left&7 )
- {
- s->cur_bits <<= s->i_left&7;
- s->cur_bits |= (1 << (s->i_left&7)) - 1;
- s->i_left &= ~7;
- }
+ bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
bs_flush( s );
}
+static inline void bs_align_10( bs_t *s )
+{
+ if( s->i_left&7 )
+ bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
+}
/* golomb functions */
@@ -245,7 +254,7 @@ static inline void bs_write_te( bs_t *s, int x, int val )
static inline void bs_rbsp_trailing( bs_t *s )
{
bs_write1( s, 1 );
- bs_flush( s );
+ bs_write( s, s->i_left&7, 0 );
}
static inline int bs_size_ue( unsigned int val )
diff --git a/common/cabac.h b/common/cabac.h
index 9d0fddd..35871b4 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -39,7 +39,7 @@ typedef struct
uint8_t *p_end;
/* aligned for memcpy_aligned starting here */
- DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+ ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[460];
diff --git a/common/common.c b/common/common.c
index d7d45d3..6d1d7f0 100644
--- a/common/common.c
+++ b/common/common.c
@@ -21,6 +21,9 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
+#include "common.h"
+#include "cpu.h"
+
#include <stdarg.h>
#include <ctype.h>
@@ -28,9 +31,6 @@
#include <malloc.h>
#endif
-#include "common.h"
-#include "cpu.h"
-
static void x264_log_default( void *, int, const char *, va_list );
/****************************************************************************
@@ -43,8 +43,9 @@ void x264_param_default( x264_param_t *param )
/* CPU autodetect */
param->cpu = x264_cpu_detect();
- param->i_threads = 1;
+ param->i_threads = X264_THREADS_AUTO;
param->b_deterministic = 1;
+ param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
/* Video properties */
param->i_csp = X264_CSP_I420;
@@ -62,16 +63,21 @@ void x264_param_default( x264_param_t *param )
param->i_fps_num = 25;
param->i_fps_den = 1;
param->i_level_idc = -1;
+ param->i_slice_max_size = 0;
+ param->i_slice_max_mbs = 0;
+ param->i_slice_count = 0;
/* Encoder parameters */
- param->i_frame_reference = 1;
+ param->i_frame_reference = 3;
param->i_keyint_max = 250;
param->i_keyint_min = 25;
- param->i_bframe = 0;
+ param->i_bframe = 3;
param->i_scenecut_threshold = 40;
param->i_bframe_adaptive = X264_B_ADAPT_FAST;
param->i_bframe_bias = 0;
- param->b_bframe_pyramid = 0;
+ param->i_bframe_pyramid = 0;
+ param->b_interlaced = 0;
+ param->b_constrained_intra = 0;
param->b_deblocking_filter = 1;
param->i_deblocking_filter_alphac0 = 0;
@@ -80,14 +86,14 @@ void x264_param_default( x264_param_t *param )
param->b_cabac = 1;
param->i_cabac_init_idc = 0;
- param->rc.i_rc_method = X264_RC_NONE;
+ param->rc.i_rc_method = X264_RC_CRF;
param->rc.i_bitrate = 0;
param->rc.f_rate_tolerance = 1.0;
param->rc.i_vbv_max_bitrate = 0;
param->rc.i_vbv_buffer_size = 0;
param->rc.f_vbv_buffer_init = 0.9;
- param->rc.i_qp_constant = 26;
- param->rc.f_rf_constant = 0;
+ param->rc.i_qp_constant = 23;
+ param->rc.f_rf_constant = 23;
param->rc.i_qp_min = 10;
param->rc.i_qp_max = 51;
param->rc.i_qp_step = 4;
@@ -95,6 +101,7 @@ void x264_param_default( x264_param_t *param )
param->rc.f_pb_factor = 1.3;
param->rc.i_aq_mode = X264_AQ_VARIANCE;
param->rc.f_aq_strength = 1.0;
+ param->rc.i_lookahead = 40;
param->rc.b_stat_write = 0;
param->rc.psz_stat_out = "x264_2pass.log";
@@ -104,6 +111,7 @@ void x264_param_default( x264_param_t *param )
param->rc.f_qblur = 0.5;
param->rc.f_complexity_blur = 20;
param->rc.i_zones = 0;
+ param->rc.b_mb_tree = 1;
/* Log */
param->pf_log = x264_log_default;
@@ -117,19 +125,25 @@ void x264_param_default( x264_param_t *param )
param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
param->analyse.i_me_method = X264_ME_HEX;
param->analyse.f_psy_rd = 1.0;
+ param->analyse.b_psy = 1;
param->analyse.f_psy_trellis = 0;
param->analyse.i_me_range = 16;
- param->analyse.i_subpel_refine = 6;
+ param->analyse.i_subpel_refine = 7;
+ param->analyse.b_mixed_references = 1;
param->analyse.b_chroma_me = 1;
param->analyse.i_mv_range_thread = -1;
param->analyse.i_mv_range = -1; // set from level_idc
param->analyse.i_chroma_qp_offset = 0;
param->analyse.b_fast_pskip = 1;
+ param->analyse.b_weighted_bipred = 1;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
param->analyse.b_dct_decimate = 1;
+ param->analyse.b_transform_8x8 = 1;
+ param->analyse.i_trellis = 1;
param->analyse.i_luma_deadzone[0] = 21;
param->analyse.i_luma_deadzone[1] = 11;
- param->analyse.b_psnr = 1;
- param->analyse.b_ssim = 1;
+ param->analyse.b_psnr = 0;
+ param->analyse.b_ssim = 0;
param->i_cqm_preset = X264_CQM_FLAT;
memset( param->cqm_4iy, 16, 16 );
@@ -140,7 +154,10 @@ void x264_param_default( x264_param_t *param )
memset( param->cqm_8py, 16, 64 );
param->b_repeat_headers = 1;
+ param->b_annexb = 1;
param->b_aud = 0;
+ param->b_vfr_input = 1;
+ param->b_dts_compress = 0;
}
static int parse_enum( const char *arg, const char * const *names, int *dst )
@@ -246,7 +263,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
if( b_error )
{
char *buf = strdup(value);
- char *tok, UNUSED *saveptr, *init;
+ char *tok, UNUSED *saveptr=NULL, *init;
b_error = 0;
p->cpu = 0;
for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL )
@@ -262,10 +279,19 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
OPT("threads")
{
if( !strcmp(value, "auto") )
- p->i_threads = 0;
+ p->i_threads = X264_THREADS_AUTO;
else
p->i_threads = atoi(value);
}
+ OPT("sliced-threads")
+ p->b_sliced_threads = atobool(value);
+ OPT("sync-lookahead")
+ {
+ if( !strcmp(value, "auto") )
+ p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
+ else
+ p->i_sync_lookahead = atoi(value);
+ }
OPT2("deterministic", "n-deterministic")
p->b_deterministic = atobool(value);
OPT2("level", "level-idc")
@@ -331,6 +357,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->i_scenecut_threshold = atoi(value);
}
}
+ OPT("intra-refresh")
+ p->b_intra_refresh = atobool(value);
OPT("bframes")
p->i_bframe = atoi(value);
OPT("b-adapt")
@@ -345,7 +373,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
OPT("b-bias")
p->i_bframe_bias = atoi(value);
OPT("b-pyramid")
- p->b_bframe_pyramid = atobool(value);
+ b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid );
OPT("nf")
p->b_deblocking_filter = !atobool(value);
OPT2("filter", "deblock")
@@ -363,12 +391,20 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
else
p->b_deblocking_filter = atobool(value);
}
+ OPT("slice-max-size")
+ p->i_slice_max_size = atoi(value);
+ OPT("slice-max-mbs")
+ p->i_slice_max_mbs = atoi(value);
+ OPT("slices")
+ p->i_slice_count = atoi(value);
OPT("cabac")
p->b_cabac = atobool(value);
OPT("cabac-idc")
p->i_cabac_init_idc = atoi(value);
OPT("interlaced")
p->b_interlaced = atobool(value);
+ OPT("constrained-intra")
+ p->b_constrained_intra = atobool(value);
OPT("cqm")
{
if( strstr( value, "flat" ) )
@@ -438,7 +474,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
}
OPT("log")
p->i_log_level = atoi(value);
-#ifdef VISUALIZE
+#ifdef HAVE_VISUALIZE
OPT("visualize")
p->b_visualize = atobool(value);
#endif
@@ -460,6 +496,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->analyse.b_transform_8x8 = atobool(value);
OPT2("weightb", "weight-b")
p->analyse.b_weighted_bipred = atobool(value);
+ OPT("weightp")
+ p->analyse.i_weighted_pred = atoi(value);
OPT2("direct", "direct-pred")
b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
OPT("chroma-qp-offset")
@@ -489,6 +527,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->analyse.f_psy_trellis = 0;
}
}
+ OPT("psy")
+ p->analyse.b_psy = atobool(value);
OPT("chroma-me")
p->analyse.b_chroma_me = atobool(value);
OPT("mixed-refs")
@@ -520,6 +560,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->rc.f_rf_constant = atof(value);
p->rc.i_rc_method = X264_RC_CRF;
}
+ OPT("rc-lookahead")
+ p->rc.i_lookahead = atoi(value);
OPT2("qpmin", "qp-min")
p->rc.i_qp_min = atoi(value);
OPT2("qpmax", "qp-max")
@@ -555,6 +597,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
}
OPT("qcomp")
p->rc.f_qcompress = atof(value);
+ OPT("mbtree")
+ p->rc.b_mb_tree = atobool(value);
OPT("qblur")
p->rc.f_qblur = atof(value);
OPT2("cplxblur", "cplx-blur")
@@ -573,6 +617,10 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->b_repeat_headers = !atobool(value);
OPT("repeat-headers")
p->b_repeat_headers = atobool(value);
+ OPT("annexb")
+ p->b_annexb = atobool(value);
+ OPT("force-cfr")
+ p->b_vfr_input = !atobool(value);
else
return X264_PARAM_BAD_NAME;
#undef OPT
@@ -593,11 +641,14 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
****************************************************************************/
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
{
- if( i_level <= h->param.i_log_level )
+ if( !h || i_level <= h->param.i_log_level )
{
va_list arg;
va_start( arg, psz_fmt );
- h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
+ if( !h )
+ x264_log_default( NULL, i_level, psz_fmt, arg );
+ else
+ h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
va_end( arg );
}
}
@@ -630,18 +681,22 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
/****************************************************************************
* x264_picture_alloc:
****************************************************************************/
-void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
{
pic->i_type = X264_TYPE_AUTO;
pic->i_qpplus1 = 0;
pic->img.i_csp = i_csp;
pic->img.i_plane = 3;
pic->img.plane[0] = x264_malloc( 3 * i_width * i_height / 2 );
+ if( !pic->img.plane[0] )
+ return -1;
pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 4;
pic->img.i_stride[0] = i_width;
pic->img.i_stride[1] = i_width / 2;
pic->img.i_stride[2] = i_width / 2;
+ pic->param = NULL;
+ return 0;
}
/****************************************************************************
@@ -658,23 +713,23 @@ void x264_picture_clean( x264_picture_t *pic )
/****************************************************************************
* x264_nal_encode:
****************************************************************************/
-int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal )
+int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal )
{
- uint8_t *dst = p_data;
uint8_t *src = nal->p_payload;
- uint8_t *end = &nal->p_payload[nal->i_payload];
- int i_count = 0;
+ uint8_t *end = nal->p_payload + nal->i_payload;
+ uint8_t *orig_dst = dst;
+ int i_count = 0, size;
- /* FIXME this code doesn't check overflow */
-
- if( b_annexeb )
+ /* long nal start code (we always use long ones) */
+ if( b_annexb )
{
- /* long nal start code (we always use long ones)*/
*dst++ = 0x00;
*dst++ = 0x00;
*dst++ = 0x00;
*dst++ = 0x01;
}
+ else /* save room for size later */
+ dst += 4;
/* nal header */
*dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
@@ -692,9 +747,19 @@ int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal
i_count = 0;
*dst++ = *src++;
}
- *pi_data = dst - (uint8_t*)p_data;
+ size = (dst - orig_dst) - 4;
- return *pi_data;
+ /* Write the size header for mp4/etc */
+ if( !b_annexb )
+ {
+ /* Size doesn't include the size of the header we're writing now. */
+ orig_dst[0] = size>>24;
+ orig_dst[1] = size>>16;
+ orig_dst[2] = size>> 8;
+ orig_dst[3] = size>> 0;
+ }
+
+ return size+4;
}
@@ -704,22 +769,25 @@ int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal
****************************************************************************/
void *x264_malloc( int i_size )
{
+ uint8_t *align_buf = NULL;
#ifdef SYS_MACOSX
/* Mac OS X always returns 16 bytes aligned memory */
- return malloc( i_size );
+ align_buf = malloc( i_size );
#elif defined( HAVE_MALLOC_H )
- return memalign( 16, i_size );
+ align_buf = memalign( 16, i_size );
#else
- uint8_t * buf;
- uint8_t * align_buf;
- buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) +
- sizeof( int ) );
- align_buf = buf + 15 + sizeof( void ** ) + sizeof( int );
- align_buf -= (intptr_t) align_buf & 15;
- *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
- *( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
- return align_buf;
+ uint8_t *buf = malloc( i_size + 15 + sizeof(void **) + sizeof(int) );
+ if( buf )
+ {
+ align_buf = buf + 15 + sizeof(void **) + sizeof(int);
+ align_buf -= (intptr_t) align_buf & 15;
+ *( (void **) ( align_buf - sizeof(void **) ) ) = buf;
+ *( (int *) ( align_buf - sizeof(void **) - sizeof(int) ) ) = i_size;
+ }
#endif
+ if( !align_buf )
+ x264_log( NULL, X264_LOG_ERROR, "malloc of size %d failed\n", i_size );
+ return align_buf;
}
/****************************************************************************
@@ -738,31 +806,6 @@ void x264_free( void *p )
}
/****************************************************************************
- * x264_realloc:
- ****************************************************************************/
-void *x264_realloc( void *p, int i_size )
-{
-#ifdef HAVE_MALLOC_H
- return realloc( p, i_size );
-#else
- int i_old_size = 0;
- uint8_t * p_new;
- if( p )
- {
- i_old_size = *( (int*) ( (uint8_t*) p - sizeof( void ** ) -
- sizeof( int ) ) );
- }
- p_new = x264_malloc( i_size );
- if( i_old_size > 0 && i_size > 0 )
- {
- memcpy( p_new, p, ( i_old_size < i_size ) ? i_old_size : i_size );
- }
- x264_free( p );
- return p_new;
-#endif
-}
-
-/****************************************************************************
* x264_reduce_fraction:
****************************************************************************/
void x264_reduce_fraction( int *n, int *d )
@@ -775,9 +818,9 @@ void x264_reduce_fraction( int *n, int *d )
c = a % b;
while(c)
{
- a = b;
- b = c;
- c = a % b;
+ a = b;
+ b = c;
+ c = a % b;
}
*n /= b;
*d /= b;
@@ -825,6 +868,8 @@ char *x264_param2string( x264_param_t *p, int b_res )
if( p->rc.psz_zones )
len += strlen(p->rc.psz_zones);
buf = s = x264_malloc( len );
+ if( !buf )
+ return NULL;
if( b_res )
{
@@ -839,7 +884,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
- s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
+ s += sprintf( s, " psy=%d", p->analyse.b_psy );
+ if( p->analyse.b_psy )
+ s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
@@ -847,26 +894,39 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " 8x8dct=%d", p->analyse.b_transform_8x8 );
s += sprintf( s, " cqm=%d", p->i_cqm_preset );
s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
+ s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
s += sprintf( s, " threads=%d", p->i_threads );
+ s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
+ if( p->i_slice_count )
+ s += sprintf( s, " slices=%d", p->i_slice_count );
+ if( p->i_slice_max_size )
+ s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size );
+ if( p->i_slice_max_mbs )
+ s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs );
s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
s += sprintf( s, " mbaff=%d", p->b_interlaced );
+ s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
s += sprintf( s, " bframes=%d", p->i_bframe );
if( p->i_bframe )
{
s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d wpredb=%d",
- p->b_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
+ p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
}
+ s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
+
+ s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d intra_refresh=%d",
+ p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh );
- s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
- p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
+ if( p->rc.b_mb_tree || p->rc.i_vbv_buffer_size )
+ s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );
- s += sprintf( s, " rc=%s", p->rc.i_rc_method == X264_RC_ABR ?
+ s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size ? "cbr" : "abr" )
- : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp" );
+ : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
{
if( p->rc.i_rc_method == X264_RC_CRF )
@@ -888,7 +948,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
if( !(p->rc.i_rc_method == X264_RC_CQP && p->rc.i_qp_constant == 0) )
{
s += sprintf( s, " ip_ratio=%.2f", p->rc.f_ip_factor );
- if( p->i_bframe )
+ if( p->i_bframe && !p->rc.b_mb_tree )
s += sprintf( s, " pb_ratio=%.2f", p->rc.f_pb_factor );
s += sprintf( s, " aq=%d", p->rc.i_aq_mode );
if( p->rc.i_aq_mode )
diff --git a/common/common.h b/common/common.h
index 1e46ae8..950f48f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -34,28 +34,39 @@
#define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
#define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0)
+#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
#define FIX8(f) ((int)(f*(1<<8)+.5))
#define CHECKED_MALLOC( var, size )\
-{\
+do {\
var = x264_malloc( size );\
if( !var )\
- {\
- x264_log( h, X264_LOG_ERROR, "malloc failed\n" );\
goto fail;\
- }\
-}
+} while( 0 )
+#define CHECKED_MALLOCZERO( var, size )\
+do {\
+ CHECKED_MALLOC( var, size );\
+ memset( var, 0, size );\
+} while( 0 )
#define X264_BFRAME_MAX 16
#define X264_THREAD_MAX 128
-#define X264_SLICE_MAX 4
-#define X264_NAL_MAX (4 + X264_SLICE_MAX)
#define X264_PCM_COST (386*8)
+#define X264_LOOKAHEAD_MAX 250
+// arbitrary, but low because SATD scores are 1/4 normal
+#define X264_LOOKAHEAD_QP 12
// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
#define X264_THREAD_HEIGHT 24
+/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
+ * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
+ * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
+ * real weights are being used. */
+
+#define X264_WEIGHTP_FAKE (-1)
+
/****************************************************************************
* Includes
****************************************************************************/
@@ -65,6 +76,22 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
+#include <limits.h>
+
+/* Unions for type-punning.
+ * Mn: load or store n bits, aligned, native-endian
+ * CPn: copy n bits, aligned, native-endian
+ * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
+typedef union { uint16_t i; uint8_t c[2]; } MAY_ALIAS x264_union16_t;
+typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } MAY_ALIAS x264_union32_t;
+typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
+#define M16(src) (((x264_union16_t*)(src))->i)
+#define M32(src) (((x264_union32_t*)(src))->i)
+#define M64(src) (((x264_union64_t*)(src))->i)
+#define CP16(dst,src) M16(dst) = M16(src)
+#define CP32(dst,src) M32(dst) = M32(src)
+#define CP64(dst,src) M64(dst) = M64(src)
+
#include "x264.h"
#include "bs.h"
#include "set.h"
@@ -77,12 +104,11 @@
#include "quant.h"
/****************************************************************************
- * Generals functions
+ * General functions
****************************************************************************/
/* x264_malloc : will do or emulate a memalign
* you have to use x264_free for buffers allocated with x264_malloc */
void *x264_malloc( int );
-void *x264_realloc( void *p, int i_size );
void x264_free( void * );
/* x264_slurp_file: malloc space for the whole file and read it */
@@ -95,6 +121,8 @@ int64_t x264_mdate( void );
* the encoding options */
char *x264_param2string( x264_param_t *p, int b_res );
+int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal );
+
/* log */
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
@@ -152,6 +180,26 @@ static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
return amvd0 + (amvd1<<16);
}
+extern const uint8_t x264_exp2_lut[64];
+extern const float x264_log2_lut[128];
+extern const float x264_log2_lz_lut[32];
+
+/* Not a general-purpose function; multiplies input by -1/6 to convert
+ * qp to qscale. */
+static ALWAYS_INLINE int x264_exp2fix8( float x )
+{
+ int i = x*(-64.f/6.f) + 512.5f;
+ if( i < 0 ) return 0;
+ if( i > 1023 ) return 0xffff;
+ return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8;
+}
+
+static ALWAYS_INLINE float x264_log2( uint32_t x )
+{
+ int lz = x264_clz( x );
+ return x264_log2_lut[(x<<lz>>24)&0x7f] + x264_log2_lz_lut[lz];
+}
+
/****************************************************************************
*
****************************************************************************/
@@ -199,11 +247,23 @@ typedef struct
int b_ref_pic_list_reordering_l0;
int b_ref_pic_list_reordering_l1;
- struct {
+ struct
+ {
int idc;
int arg;
} ref_pic_list_order[2][16];
+ /* P-frame weighting */
+ x264_weight_t weight[32][3];
+
+ int i_mmco_remove_from_end;
+ int i_mmco_command_count;
+ struct /* struct for future expansion */
+ {
+ int i_difference_of_pic_nums;
+ int i_poc;
+ } mmco[16];
+
int i_cabac_init_idc;
int i_qp;
@@ -218,6 +278,19 @@ typedef struct
} x264_slice_header_t;
+typedef struct x264_lookahead_t
+{
+ volatile uint8_t b_exit_thread;
+ uint8_t b_thread_active;
+ uint8_t b_analyse_keyframe;
+ int i_last_keyframe;
+ int i_slicetype_length;
+ x264_frame_t *last_nonb;
+ x264_synch_frame_list_t ifbuf;
+ x264_synch_frame_list_t next;
+ x264_synch_frame_list_t ofbuf;
+} x264_lookahead_t;
+
/* From ffmpeg
*/
#define X264_SCAN8_SIZE (6*8)
@@ -262,36 +335,37 @@ struct x264_t
/* encoder parameters */
x264_param_t param;
- x264_t *thread[X264_THREAD_MAX];
+ x264_t *thread[X264_THREAD_MAX+1];
x264_pthread_t thread_handle;
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
+ int i_threadslice_start; /* first row in this thread slice */
+ int i_threadslice_end; /* row after the end of this thread slice */
/* bitstream output */
struct
{
int i_nal;
- x264_nal_t nal[X264_NAL_MAX];
+ int i_nals_allocated;
+ x264_nal_t *nal;
int i_bitstream; /* size of p_bitstream */
uint8_t *p_bitstream; /* will hold data for all nal */
bs_t bs;
- int i_frame_size;
} out;
+ uint8_t *nal_buffer;
+ int nal_buffer_size;
+
/**** thread synchronization starts here ****/
/* frame number/poc */
int i_frame;
+ int i_frame_num;
- int i_frame_offset; /* decoding only */
- int i_frame_num; /* decoding only */
- int i_poc_msb; /* decoding only */
- int i_poc_lsb; /* decoding only */
- int i_poc; /* decoding only */
-
- int i_thread_num; /* threads only */
- int i_nal_type; /* threads only */
- int i_nal_ref_idc; /* threads only */
+ int i_thread_frames; /* Number of different frames being encoded by threads;
+ * 1 when sliced-threads is on. */
+ int i_nal_type;
+ int i_nal_ref_idc;
/* We use only one SPS and one PPS */
x264_sps_t sps_array[1];
@@ -300,9 +374,12 @@ struct x264_t
x264_pps_t *pps;
int i_idr_pic_id;
- /* quantization matrix for decoding, [cqm][qp%6][coef_y][coef_x] */
- int (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
- int (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
+ /* Timebase multiplier for DTS compression */
+ int i_dts_compress_multiplier;
+
+ /* quantization matrix for decoding, [cqm][qp%6][coef] */
+ int (*dequant4_mf[4])[16]; /* [4][6][16] */
+ int (*dequant8_mf[2])[64]; /* [2][6][64] */
/* quantization matrix for trellis, [cqm][qp][coef] */
int (*unquant4_mf[4])[16]; /* [4][52][16] */
int (*unquant8_mf[2])[64]; /* [2][52][64] */
@@ -312,10 +389,16 @@ struct x264_t
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
+ /* mv/ref cost arrays. Indexed by lambda instead of
+ * qp because, due to rounding, some quantizers share
+ * lambdas. This saves memory. */
+ uint16_t *cost_mv[92];
+ uint16_t *cost_mv_fpel[92][4];
+
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
- DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
- DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
+ ALIGNED_16( uint32_t nr_residual_sum[2][64] );
+ ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
/* Slice header */
@@ -327,18 +410,17 @@ struct x264_t
struct
{
/* Frames to be encoded (whose types have been decided) */
- x264_frame_t *current[X264_BFRAME_MAX*4+3];
- /* Temporary buffer (frames types not yet decided) */
- x264_frame_t *next[X264_BFRAME_MAX*4+3];
- /* Unused frames */
- x264_frame_t *unused[X264_BFRAME_MAX*4 + X264_THREAD_MAX*2 + 16+4];
- /* For adaptive B decision */
- x264_frame_t *last_nonb;
+ x264_frame_t **current;
+ /* Unused frames: 0 = fenc, 1 = fdec */
+ x264_frame_t **unused[2];
+
+ /* Unused blank frames (for duplicates) */
+ x264_frame_t **blank_unused;
/* frames used for reference + sentinels */
x264_frame_t *reference[16+2];
- int i_last_idr; /* Frame number of the last IDR */
+ int i_last_keyframe; /* Frame number of the last keyframe */
int i_input; /* Number of input frames already accepted */
@@ -346,6 +428,10 @@ struct x264_t
int i_max_ref0;
int i_max_ref1;
int i_delay; /* Number of frames buffered for B reordering */
+ int i_bframe_delay;
+ int64_t i_bframe_delay_time;
+ int64_t i_init_delta;
+ int64_t i_prev_dts[2];
int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */
int b_have_sub8x8_esa;
} frames;
@@ -368,11 +454,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
- DECLARE_ALIGNED_16( int16_t luma16x16_dc[16] );
- DECLARE_ALIGNED_16( int16_t chroma_dc[2][4] );
+ ALIGNED_16( int16_t luma16x16_dc[16] );
+ ALIGNED_16( int16_t chroma_dc[2][4] );
// FIXME share memory?
- DECLARE_ALIGNED_16( int16_t luma8x8[4][64] );
- DECLARE_ALIGNED_16( int16_t luma4x4[16+8][16] );
+ ALIGNED_16( int16_t luma8x8[4][64] );
+ ALIGNED_16( int16_t luma4x4[16+8][16] );
} dct;
/* MB table and cache for current frame/mb */
@@ -418,6 +504,7 @@ struct x264_t
unsigned int i_neighbour;
unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */
unsigned int i_neighbour4[16]; /* at the time the block is coded */
+ unsigned int i_neighbour_intra; /* for constrained intra pred */
int i_mb_type_top;
int i_mb_type_left;
int i_mb_type_topleft;
@@ -444,12 +531,14 @@ struct x264_t
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
- uint8_t (*nnz_backup)[16]; /* when using cavlc + 8x8dct, the deblocker uses a modified nnz */
+
+ /* buffer for weighted versions of the reference frames */
+ uint8_t *p_weight_buf[16];
/* current value */
int i_type;
int i_partition;
- DECLARE_ALIGNED_4( uint8_t i_sub_partition[4] );
+ ALIGNED_4( uint8_t i_sub_partition[4] );
int b_transform_8x8;
int i_cbp_luma;
@@ -466,28 +555,31 @@ struct x264_t
/* skip flag for motion compensation */
/* if we've already done MC, we don't need to do it again */
int b_skip_mc;
+ /* set to true if we are re-encoding a macroblock. */
+ int b_reencode_mb;
+ int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
struct
{
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- DECLARE_ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
- DECLARE_ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
+ ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
+ ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
- DECLARE_ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
- DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
- DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
- DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+ ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
+ ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
+ ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
+ ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
/* Psy trellis DCT data */
- DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
- DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
+ ALIGNED_16( int16_t fenc_dct8[4][64] );
+ ALIGNED_16( int16_t fenc_dct4[16][16] );
/* Psy RD SATD scores */
int fenc_satd[4][4];
@@ -506,6 +598,7 @@ struct x264_t
/* pointer over mb of the references */
int i_fref[2];
uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+ uint8_t *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
/* fref stride */
@@ -516,24 +609,24 @@ struct x264_t
struct
{
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
- int8_t intra4x4_pred_mode[X264_SCAN8_SIZE];
+ ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_SIZE] );
/* i_non_zero_count if available else 0x80 */
- uint8_t non_zero_count[X264_SCAN8_SIZE];
+ ALIGNED_4( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* -1 if unused, -2 if unavailable */
- DECLARE_ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
+ ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
/* 0 if not available */
- DECLARE_ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
- DECLARE_ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
- DECLARE_ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
+ ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
- DECLARE_ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
- DECLARE_ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
- DECLARE_ALIGNED_4( int16_t pskip_mv[2] );
+ ALIGNED_4( int16_t direct_mv[2][4][2] );
+ ALIGNED_4( int8_t direct_ref[2][4] );
+ ALIGNED_4( int16_t pskip_mv[2] );
/* number of neighbors (top and left) that used 8x8 dct */
int i_neighbour_transform_size;
@@ -554,12 +647,19 @@ struct x264_t
int b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
int b_direct_auto_write; /* analyse direct modes, to use and/or save */
+ /* lambda values */
+ int i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
+ int i_psy_rd_lambda;
+ int i_chroma_lambda2_offset;
+
/* B_direct and weighted prediction */
int16_t dist_scale_factor[16][2];
- int16_t bipred_weight[32][4];
+ int8_t bipred_weight_buf[2][32][4];
+ int8_t (*bipred_weight)[4];
/* maps fref1[0]'s ref indices into the current list0 */
- int8_t map_col_to_list0_buf[2]; // for negative indices
- int8_t map_col_to_list0[16];
+#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
+ int8_t map_col_to_list0[18];
+ int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
} mb;
/* rate control encoding only */
@@ -586,6 +686,7 @@ struct x264_t
int i_mb_count_ref[2][32];
int i_mb_partition[17];
int i_mb_cbp[6];
+ int i_mb_pred_mode[3][13];
/* Adaptive direct mv pred */
int i_direct_score[2];
/* Metrics */
@@ -596,9 +697,9 @@ struct x264_t
/* Cumulated stats */
/* per slice info */
- int i_slice_count[5];
- int64_t i_slice_size[5];
- double f_slice_qp[5];
+ int i_frame_count[5];
+ int64_t i_frame_size[5];
+ double f_frame_qp[5];
int i_consecutive_bframes[X264_BFRAME_MAX+1];
/* */
int64_t i_ssd_global[5];
@@ -613,9 +714,12 @@ struct x264_t
int64_t i_mb_count_8x8dct[2];
int64_t i_mb_count_ref[2][2][32];
int64_t i_mb_cbp[6];
+ int64_t i_mb_pred_mode[3][13];
/* */
int i_direct_score[2];
int i_direct_frames[2];
+ /* num p-frames weighted */
+ int i_wpred[3];
} stat;
@@ -635,9 +739,10 @@ struct x264_t
x264_quant_function_t quantf;
x264_deblock_function_t loopf;
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
struct visualize_t *visualize;
#endif
+ x264_lookahead_t *lookahead;
};
// included at the end because it needs x264_t
diff --git a/common/cpu.c b/common/cpu.c
index 1cb7080..9f2d5a6 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -22,8 +22,11 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
+#define _GNU_SOURCE // for sched_getaffinity
+#include "common.h"
+#include "cpu.h"
+
#if defined(HAVE_PTHREAD) && defined(SYS_LINUX)
-#define _GNU_SOURCE
#include <sched.h>
#endif
#ifdef SYS_BEOS
@@ -39,9 +42,6 @@
#include <machine/cpu.h>
#endif
-#include "common.h"
-#include "cpu.h"
-
const x264_cpu_name_t x264_cpu_names[] = {
{"Altivec", X264_CPU_ALTIVEC},
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
@@ -61,9 +61,30 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
+ {"ARMv6", X264_CPU_ARMV6},
+ {"NEON", X264_CPU_NEON},
+ {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"", 0},
};
+#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
+#include <signal.h>
+#include <setjmp.h>
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler( int sig )
+{
+ if( !canjump )
+ {
+ signal( sig, SIG_DFL );
+ raise( sig );
+ }
+
+ canjump = 0;
+ siglongjmp( jmpbuf, 1 );
+}
+#endif
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
@@ -122,13 +143,17 @@ uint32_t x264_cpu_detect( void )
if( ecx&0x00000040 ) /* SSE4a */
{
cpu |= X264_CPU_SSE2_IS_FAST;
- cpu |= X264_CPU_SSE_MISALIGN;
cpu |= X264_CPU_LZCNT;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
- x264_cpu_mask_misalign_sse();
}
else
cpu |= X264_CPU_SSE2_IS_SLOW;
+
+ if( ecx&0x00000080 ) /* Misalign SSE */
+ {
+ cpu |= X264_CPU_SSE_MISALIGN;
+ x264_cpu_mask_misalign_sse();
+ }
}
}
@@ -224,22 +249,6 @@ uint32_t x264_cpu_detect( void )
}
#elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
- if( !canjump )
- {
- signal( sig, SIG_DFL );
- raise( sig );
- }
-
- canjump = 0;
- siglongjmp( jmpbuf, 1 );
-}
uint32_t x264_cpu_detect( void )
{
@@ -265,6 +274,48 @@ uint32_t x264_cpu_detect( void )
}
#endif
+#elif defined( ARCH_ARM )
+
+void x264_cpu_neon_test();
+int x264_cpu_fast_neon_mrc_test();
+
+uint32_t x264_cpu_detect( void )
+{
+ int flags = 0;
+#ifdef HAVE_ARMV6
+ flags |= X264_CPU_ARMV6;
+
+ // don't do this hack if compiled with -mfpu=neon
+#ifndef HAVE_NEON
+ static void (* oldsig)( int );
+ oldsig = signal( SIGILL, sigill_handler );
+ if( sigsetjmp( jmpbuf, 1 ) )
+ {
+ signal( SIGILL, oldsig );
+ return flags;
+ }
+
+ canjump = 1;
+ x264_cpu_neon_test();
+ canjump = 0;
+ signal( SIGILL, oldsig );
+#endif
+
+ flags |= X264_CPU_NEON;
+
+ // fast neon -> arm (Cortex-A9) detection relies on user access to the
+ // cycle counter; this assumes ARMv7 performance counters.
+ // NEON requires at least ARMv7, ARMv8 may require changes here, but
+ // hopefully this hacky detection method will have been replaced by then.
+ // Note that there is potential for a race condition if another program or
+ // x264 instance disables or reinits the counters while x264 is using them,
+ // which may result in incorrect detection and the counters stuck enabled.
+ flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+ // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#endif
+ return flags;
+}
+
#else
uint32_t x264_cpu_detect( void )
diff --git a/common/cpu.h b/common/cpu.h
index 4380a35..6901e1e 100644
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -33,12 +33,12 @@ void x264_cpu_mask_misalign_sse( void );
* gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
* problem, but I don't want to require such a new version.
* This applies only to x86_32, since other architectures that need alignment
- * also have ABIs that ensure aligned stack. */
+ * either have ABIs that ensure aligned stack, or don't support it at all. */
#if defined(ARCH_X86) && defined(HAVE_MMX)
-int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
-#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
+int x264_stack_align( void (*func)(), ... );
+#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
-#define x264_stack_align(func,arg) func(arg)
+#define x264_stack_align(func,...) func(__VA_ARGS__)
#endif
typedef struct {
diff --git a/common/dct.c b/common/dct.c
index 1f8f4b3..aa83ef4 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -28,79 +28,78 @@
#ifdef ARCH_PPC
# include "ppc/dct.h"
#endif
+#ifdef ARCH_ARM
+# include "arm/dct.h"
+#endif
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
-/*
- * XXX For all dct dc : input could be equal to output so ...
- */
-
-static void dct4x4dc( int16_t d[4][4] )
+static void dct4x4dc( int16_t d[16] )
{
- int16_t tmp[4][4];
+ int16_t tmp[16];
int s01, s23;
int d01, d23;
int i;
for( i = 0; i < 4; i++ )
{
- s01 = d[i][0] + d[i][1];
- d01 = d[i][0] - d[i][1];
- s23 = d[i][2] + d[i][3];
- d23 = d[i][2] - d[i][3];
-
- tmp[0][i] = s01 + s23;
- tmp[1][i] = s01 - s23;
- tmp[2][i] = d01 - d23;
- tmp[3][i] = d01 + d23;
+ s01 = d[i*4+0] + d[i*4+1];
+ d01 = d[i*4+0] - d[i*4+1];
+ s23 = d[i*4+2] + d[i*4+3];
+ d23 = d[i*4+2] - d[i*4+3];
+
+ tmp[0*4+i] = s01 + s23;
+ tmp[1*4+i] = s01 - s23;
+ tmp[2*4+i] = d01 - d23;
+ tmp[3*4+i] = d01 + d23;
}
for( i = 0; i < 4; i++ )
{
- s01 = tmp[i][0] + tmp[i][1];
- d01 = tmp[i][0] - tmp[i][1];
- s23 = tmp[i][2] + tmp[i][3];
- d23 = tmp[i][2] - tmp[i][3];
-
- d[i][0] = ( s01 + s23 + 1 ) >> 1;
- d[i][1] = ( s01 - s23 + 1 ) >> 1;
- d[i][2] = ( d01 - d23 + 1 ) >> 1;
- d[i][3] = ( d01 + d23 + 1 ) >> 1;
+ s01 = tmp[i*4+0] + tmp[i*4+1];
+ d01 = tmp[i*4+0] - tmp[i*4+1];
+ s23 = tmp[i*4+2] + tmp[i*4+3];
+ d23 = tmp[i*4+2] - tmp[i*4+3];
+
+ d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
+ d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
+ d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
+ d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
}
}
-static void idct4x4dc( int16_t d[4][4] )
+static void idct4x4dc( int16_t d[16] )
{
- int16_t tmp[4][4];
+ int16_t tmp[16];
int s01, s23;
int d01, d23;
int i;
for( i = 0; i < 4; i++ )
{
- s01 = d[i][0] + d[i][1];
- d01 = d[i][0] - d[i][1];
- s23 = d[i][2] + d[i][3];
- d23 = d[i][2] - d[i][3];
-
- tmp[0][i] = s01 + s23;
- tmp[1][i] = s01 - s23;
- tmp[2][i] = d01 - d23;
- tmp[3][i] = d01 + d23;
+ s01 = d[i*4+0] + d[i*4+1];
+ d01 = d[i*4+0] - d[i*4+1];
+ s23 = d[i*4+2] + d[i*4+3];
+ d23 = d[i*4+2] - d[i*4+3];
+
+ tmp[0*4+i] = s01 + s23;
+ tmp[1*4+i] = s01 - s23;
+ tmp[2*4+i] = d01 - d23;
+ tmp[3*4+i] = d01 + d23;
}
for( i = 0; i < 4; i++ )
{
- s01 = tmp[i][0] + tmp[i][1];
- d01 = tmp[i][0] - tmp[i][1];
- s23 = tmp[i][2] + tmp[i][3];
- d23 = tmp[i][2] - tmp[i][3];
-
- d[i][0] = s01 + s23;
- d[i][1] = s01 - s23;
- d[i][2] = d01 - d23;
- d[i][3] = d01 + d23;
+ s01 = tmp[i*4+0] + tmp[i*4+1];
+ d01 = tmp[i*4+0] - tmp[i*4+1];
+ s23 = tmp[i*4+2] + tmp[i*4+3];
+ d23 = tmp[i*4+2] - tmp[i*4+3];
+
+ d[i*4+0] = s01 + s23;
+ d[i*4+1] = s01 - s23;
+ d[i*4+2] = d01 - d23;
+ d[i*4+3] = d01 + d23;
}
}
@@ -119,42 +118,42 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
}
}
-static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
{
- int16_t d[4][4];
- int16_t tmp[4][4];
+ int16_t d[16];
+ int16_t tmp[16];
int i;
- pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+ pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
for( i = 0; i < 4; i++ )
{
- const int s03 = d[i][0] + d[i][3];
- const int s12 = d[i][1] + d[i][2];
- const int d03 = d[i][0] - d[i][3];
- const int d12 = d[i][1] - d[i][2];
-
- tmp[0][i] = s03 + s12;
- tmp[1][i] = 2*d03 + d12;
- tmp[2][i] = s03 - s12;
- tmp[3][i] = d03 - 2*d12;
+ const int s03 = d[i*4+0] + d[i*4+3];
+ const int s12 = d[i*4+1] + d[i*4+2];
+ const int d03 = d[i*4+0] - d[i*4+3];
+ const int d12 = d[i*4+1] - d[i*4+2];
+
+ tmp[0*4+i] = s03 + s12;
+ tmp[1*4+i] = 2*d03 + d12;
+ tmp[2*4+i] = s03 - s12;
+ tmp[3*4+i] = d03 - 2*d12;
}
for( i = 0; i < 4; i++ )
{
- const int s03 = tmp[i][0] + tmp[i][3];
- const int s12 = tmp[i][1] + tmp[i][2];
- const int d03 = tmp[i][0] - tmp[i][3];
- const int d12 = tmp[i][1] - tmp[i][2];
-
- dct[i][0] = s03 + s12;
- dct[i][1] = 2*d03 + d12;
- dct[i][2] = s03 - s12;
- dct[i][3] = d03 - 2*d12;
+ const int s03 = tmp[i*4+0] + tmp[i*4+3];
+ const int s12 = tmp[i*4+1] + tmp[i*4+2];
+ const int d03 = tmp[i*4+0] - tmp[i*4+3];
+ const int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+ dct[i*4+0] = s03 + s12;
+ dct[i*4+1] = 2*d03 + d12;
+ dct[i*4+2] = s03 - s12;
+ dct[i*4+3] = d03 - 2*d12;
}
}
-static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
{
sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
@@ -162,7 +161,7 @@ static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
-static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
{
sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
@@ -170,52 +169,70 @@ static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
+static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
+{
+ int16_t d[16];
+ int sum = 0;
+
+ pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+
+ sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
+ sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
+
+ return sum;
+}
+
+static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
+{
+ dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
+ dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
+ dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+ dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
+}
-static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
+static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
{
- int16_t d[4][4];
- int16_t tmp[4][4];
+ int16_t d[16];
+ int16_t tmp[16];
int x, y;
int i;
for( i = 0; i < 4; i++ )
{
- const int s02 = dct[0][i] + dct[2][i];
- const int d02 = dct[0][i] - dct[2][i];
- const int s13 = dct[1][i] + (dct[3][i]>>1);
- const int d13 = (dct[1][i]>>1) - dct[3][i];
-
- tmp[i][0] = s02 + s13;
- tmp[i][1] = d02 + d13;
- tmp[i][2] = d02 - d13;
- tmp[i][3] = s02 - s13;
+ const int s02 = dct[0*4+i] + dct[2*4+i];
+ const int d02 = dct[0*4+i] - dct[2*4+i];
+ const int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
+ const int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
+
+ tmp[i*4+0] = s02 + s13;
+ tmp[i*4+1] = d02 + d13;
+ tmp[i*4+2] = d02 - d13;
+ tmp[i*4+3] = s02 - s13;
}
for( i = 0; i < 4; i++ )
{
- const int s02 = tmp[0][i] + tmp[2][i];
- const int d02 = tmp[0][i] - tmp[2][i];
- const int s13 = tmp[1][i] + (tmp[3][i]>>1);
- const int d13 = (tmp[1][i]>>1) - tmp[3][i];
-
- d[0][i] = ( s02 + s13 + 32 ) >> 6;
- d[1][i] = ( d02 + d13 + 32 ) >> 6;
- d[2][i] = ( d02 - d13 + 32 ) >> 6;
- d[3][i] = ( s02 - s13 + 32 ) >> 6;
+ const int s02 = tmp[0*4+i] + tmp[2*4+i];
+ const int d02 = tmp[0*4+i] - tmp[2*4+i];
+ const int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
+ const int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
+
+ d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
+ d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
+ d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
+ d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
}
for( y = 0; y < 4; y++ )
{
for( x = 0; x < 4; x++ )
- {
- p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
- }
+ p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
p_dst += FDEC_STRIDE;
}
}
-static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
+static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
{
add4x4_idct( &p_dst[0], dct[0] );
add4x4_idct( &p_dst[4], dct[1] );
@@ -223,7 +240,7 @@ static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
-static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
+static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
{
add8x8_idct( &p_dst[0], &dct[0] );
add8x8_idct( &p_dst[8], &dct[4] );
@@ -262,29 +279,29 @@ static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
DST(7) = (a4>>2) - a7 ;\
}
-static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
{
int i;
- int16_t tmp[8][8];
+ int16_t tmp[64];
- pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+ pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
-#define SRC(x) tmp[x][i]
-#define DST(x) tmp[x][i]
+#define SRC(x) tmp[x*8+i]
+#define DST(x) tmp[x*8+i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
#undef DST
-#define SRC(x) tmp[i][x]
-#define DST(x) dct[x][i]
+#define SRC(x) tmp[i*8+x]
+#define DST(x) dct[x*8+i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
#undef DST
}
-static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
{
sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
@@ -319,20 +336,20 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
DST(7, b0 - b7);\
}
-static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
+static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
{
int i;
- dct[0][0] += 32; // rounding for the >>6 at the end
+ dct[0] += 32; // rounding for the >>6 at the end
-#define SRC(x) dct[x][i]
-#define DST(x,rhs) dct[x][i] = (rhs)
+#define SRC(x) dct[x*8+i]
+#define DST(x,rhs) dct[x*8+i] = (rhs)
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
-#define SRC(x) dct[i][x]
+#define SRC(x) dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
for( i = 0; i < 8; i++ )
IDCT8_1D
@@ -340,7 +357,7 @@ static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
#undef DST
}
-static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
+static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
{
add8x8_idct8( &dst[0], dct[0] );
add8x8_idct8( &dst[8], dct[1] );
@@ -361,23 +378,23 @@ static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
}
}
-static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
+static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
{
- add4x4_idct_dc( &p_dst[0], dct[0][0] );
- add4x4_idct_dc( &p_dst[4], dct[0][1] );
- add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
- add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
+ add4x4_idct_dc( &p_dst[0], dct[0] );
+ add4x4_idct_dc( &p_dst[4], dct[1] );
+ add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+ add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
-static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
+static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
{
int i;
- for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
+ for( i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
{
- add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
- add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
- add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
- add4x4_idct_dc( &p_dst[12], dct[i][3] );
+ add4x4_idct_dc( &p_dst[ 0], dct[0] );
+ add4x4_idct_dc( &p_dst[ 4], dct[1] );
+ add4x4_idct_dc( &p_dst[ 8], dct[2] );
+ add4x4_idct_dc( &p_dst[12], dct[3] );
}
}
@@ -391,6 +408,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = add4x4_idct;
dctf->sub8x8_dct = sub8x8_dct;
+ dctf->sub8x8_dct_dc = sub8x8_dct_dc;
dctf->add8x8_idct = add8x8_idct;
dctf->add8x8_idct_dc = add8x8_idct_dc;
@@ -416,6 +434,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
#ifndef ARCH_X86_64
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
@@ -434,6 +453,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
@@ -454,6 +474,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
+
+ if( cpu&X264_CPU_SSE4 )
+ dctf->add4x4_idct = x264_add4x4_idct_sse4;
+
#endif //HAVE_MMX
#ifdef ARCH_PPC
@@ -474,6 +498,30 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
}
#endif
+
+#ifdef HAVE_ARMV6
+ if( cpu&X264_CPU_NEON )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_neon;
+ dctf->sub8x8_dct = x264_sub8x8_dct_neon;
+ dctf->sub16x16_dct = x264_sub16x16_dct_neon;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
+ dctf->dct4x4dc = x264_dct4x4dc_neon;
+ dctf->idct4x4dc = x264_idct4x4dc_neon;
+
+ dctf->add4x4_idct = x264_add4x4_idct_neon;
+ dctf->add8x8_idct = x264_add8x8_idct_neon;
+ dctf->add16x16_idct = x264_add16x16_idct_neon;
+
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
+
+ dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
+ dctf->add16x16_idct8= x264_add16x16_idct8_neon;
+ }
+#endif
}
void x264_dct_init_weights( void )
@@ -489,8 +537,7 @@ void x264_dct_init_weights( void )
}
-// gcc pessimizes multi-dimensional arrays here, even with constant indices
-#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
+#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
@@ -528,42 +575,43 @@ void x264_dct_init_weights( void )
ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
#define ZIGZAG4_FRAME\
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
+ ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
#define ZIGZAG4_FIELD\
- ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
+ ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
-static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
{
ZIGZAG8_FRAME
}
-static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
{
ZIGZAG8_FIELD
}
#undef ZIG
-#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
+#define ZIG(i,y,x) level[i] = dct[x*4+y];
+#define ZIGDC(i,y,x) ZIG(i,y,x)
-static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
{
ZIGZAG4_FRAME
}
-static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
{
- *(uint32_t*)level = *(uint32_t*)dct;
+ CP32( level, dct );
ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
- *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
- *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
- *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
+ CP32( level+6, dct+6 );
+ CP64( level+8, dct+8 );
+ CP64( level+12, dct+12 );
}
#undef ZIG
@@ -571,43 +619,76 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\
+ nz |= level[i];\
}
#define COPY4x4\
- *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
- *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
- *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
- *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+ CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+ CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+ CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+ CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define COPY8x8\
- *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
- *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
- *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
- *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
- *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
- *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
- *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
- *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
+ CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+ CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+ CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+ CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
+ CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
+ CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
+ CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
+ CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
+
+static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+{
+ int nz = 0;
+ ZIGZAG4_FRAME
+ COPY4x4
+ return !!nz;
+}
-static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
+ int nz = 0;
+ ZIGZAG4_FIELD
+ COPY4x4
+ return !!nz;
+}
+
+#undef ZIGDC
+#define ZIGDC(i,y,x) {\
+ int oe = x+y*FENC_STRIDE;\
+ int od = x+y*FDEC_STRIDE;\
+ *dc = p_src[oe] - p_dst[od];\
+ level[0] = 0;\
+}
+
+static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
+{
+ int nz = 0;
ZIGZAG4_FRAME
COPY4x4
+ return !!nz;
}
-static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
{
+ int nz = 0;
ZIGZAG4_FIELD
COPY4x4
+ return !!nz;
}
-static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
+ int nz = 0;
ZIGZAG8_FRAME
COPY8x8
+ return !!nz;
}
-static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
+ int nz = 0;
ZIGZAG8_FIELD
COPY8x8
+ return !!nz;
}
#undef ZIG
@@ -636,9 +717,18 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = zigzag_scan_4x4_field;
pf->sub_8x8 = zigzag_sub_8x8_field;
pf->sub_4x4 = zigzag_sub_4x4_field;
+ pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
+ {
pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
+ pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
+ }
+ if( cpu&X264_CPU_SSSE3 )
+ {
+ pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
+ pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
+ }
#endif
#ifdef ARCH_PPC
@@ -652,6 +742,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = zigzag_scan_4x4_frame;
pf->sub_8x8 = zigzag_sub_8x8_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame;
+ pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
@@ -662,6 +753,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_SSSE3 )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+ pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
@@ -672,6 +764,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_ALTIVEC )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
#endif
+#ifdef HAVE_ARMV6
+ if( cpu&X264_CPU_NEON )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+#endif
}
pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
diff --git a/common/dct.h b/common/dct.h
index 3819ce1..6f282b9 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -91,34 +91,36 @@ typedef struct
// pix1 stride = FENC_STRIDE
// pix2 stride = FDEC_STRIDE
// p_dst stride = FDEC_STRIDE
- void (*sub4x4_dct) ( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
- void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[4][4] );
+ void (*sub4x4_dct) ( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+ void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[16] );
- void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
- void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
- void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
+ void (*sub8x8_dct) ( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+ void (*sub8x8_dct_dc)( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+ void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][16] );
+ void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[4] );
- void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
- void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
- void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] );
+ void (*sub16x16_dct) ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+ void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][16] );
+ void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[16] );
- void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
- void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
+ void (*sub8x8_dct8) ( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+ void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[64] );
- void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
- void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );
+ void (*sub16x16_dct8) ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+ void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][64] );
- void (*dct4x4dc) ( int16_t d[4][4] );
- void (*idct4x4dc)( int16_t d[4][4] );
+ void (*dct4x4dc) ( int16_t d[16] );
+ void (*idct4x4dc)( int16_t d[16] );
} x264_dct_function_t;
typedef struct
{
- void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
- void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
- void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
- void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
+ void (*scan_8x8)( int16_t level[64], int16_t dct[64] );
+ void (*scan_4x4)( int16_t level[16], int16_t dct[16] );
+ int (*sub_8x8) ( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
+ int (*sub_4x4) ( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
+ int (*sub_4x4ac)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t;
diff --git a/common/frame.c b/common/frame.c
index cc4b1b3..40cc78f 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -26,9 +26,9 @@
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
-x264_frame_t *x264_frame_new( x264_t *h )
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
{
- x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
+ x264_frame_t *frame;
int i, j;
int i_mb_count = h->mb.i_mb_count;
@@ -38,9 +38,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
int chroma_plane_size;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
- if( !frame ) return NULL;
-
- memset( frame, 0, sizeof(x264_frame_t) );
+ CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
/* allocate frame data (+64 for extra data for me) */
i_width = ALIGN( h->param.i_width, 16 );
@@ -50,60 +48,22 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
{
- frame->i_stride[i] = ALIGN( i_stride >> !!i, 16 );
+ frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
}
- luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
- chroma_plane_size = (frame->i_stride[1] * ( frame->i_lines[1] + 2*i_padv ));
+ luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
+ chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
for( i = 1; i < 3; i++ )
{
CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
- /* all 4 luma planes allocated together, since the cacheline split code
- * requires them to be in-phase wrt cacheline alignment. */
- if( h->param.analyse.i_subpel_refine )
- {
- CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
- for( i = 0; i < 4; i++ )
- frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
- frame->plane[0] = frame->filtered[0];
- }
- else
- {
- CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
- frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
- }
-
- if( h->frames.b_have_lowres )
- {
- frame->i_width_lowres = frame->i_width[0]/2;
- frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
- frame->i_lines_lowres = frame->i_lines[0]/2;
-
- luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
-
- CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
- for( i = 0; i < 4; i++ )
- frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
-
- for( j = 0; j <= !!h->param.i_bframe; j++ )
- for( i = 0; i <= h->param.i_bframe; i++ )
- {
- CHECKED_MALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
- memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
- CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
- }
- }
- if( h->param.analyse.i_me_method >= X264_ME_ESA )
- {
- CHECKED_MALLOC( frame->buffer[3],
- frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
- frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
- }
+ for( i = 0; i < h->param.i_bframe + 2; i++ )
+ for( j = 0; j < h->param.i_bframe + 2; j++ )
+ CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
frame->i_poc = -1;
frame->i_type = X264_TYPE_AUTO;
@@ -112,73 +72,142 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->i_frame = -1;
frame->i_frame_num = -1;
frame->i_lines_completed = -1;
+ frame->b_fdec = b_fdec;
+ frame->orig = frame;
- CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
- CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
- CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
- CHECKED_MALLOC( frame->i_intra_cost, i_mb_count * sizeof(uint16_t) );
- if( h->param.i_bframe )
+ /* all 4 luma planes allocated together, since the cacheline split code
+ * requires them to be in-phase wrt cacheline alignment. */
+ if( h->param.analyse.i_subpel_refine && b_fdec )
{
- CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
- CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+ CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
+ for( i = 0; i < 4; i++ )
+ frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+ frame->plane[0] = frame->filtered[0];
}
else
{
- frame->mv[1] = NULL;
- frame->ref[1] = NULL;
+ CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
+ frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
}
- CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
- CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
- for( i = 0; i < h->param.i_bframe + 2; i++ )
- for( j = 0; j < h->param.i_bframe + 2; j++ )
- CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+ frame->b_duplicate = 0;
- if( h->param.rc.i_aq_mode )
+ if( b_fdec ) /* fdec frame */
+ {
+ CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
+ CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+ CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+ if( h->param.i_bframe )
+ {
+ CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
+ CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+ }
+ else
+ {
+ frame->mv[1] = NULL;
+ frame->ref[1] = NULL;
+ }
+ CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
+ CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
+ if( h->param.analyse.i_me_method >= X264_ME_ESA )
+ {
+ CHECKED_MALLOC( frame->buffer[3],
+ frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
+ frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+ }
+ }
+ else /* fenc frame */
{
- CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
if( h->frames.b_have_lowres )
- CHECKED_MALLOC( frame->i_inv_qscale_factor, h->mb.i_mb_count * sizeof(uint16_t) );
+ {
+ frame->i_width_lowres = frame->i_width[0]/2;
+ frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
+ frame->i_lines_lowres = frame->i_lines[0]/2;
+
+ luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
+
+ CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
+ for( i = 0; i < 4; i++ )
+ frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+
+ for( j = 0; j <= !!h->param.i_bframe; j++ )
+ for( i = 0; i <= h->param.i_bframe; i++ )
+ {
+ CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+ CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+ }
+ CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
+ for( j = 0; j <= h->param.i_bframe+1; j++ )
+ for( i = 0; i <= h->param.i_bframe+1; i++ )
+ {
+ CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+ CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
+ }
+ frame->i_intra_cost = frame->lowres_costs[0][0];
+ memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+ }
+ if( h->param.rc.i_aq_mode )
+ {
+ CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+ CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
+ if( h->frames.b_have_lowres )
+ /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
+ CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+ }
}
- x264_pthread_mutex_init( &frame->mutex, NULL );
- x264_pthread_cond_init( &frame->cv, NULL );
+ if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
+ goto fail;
+ if( x264_pthread_cond_init( &frame->cv, NULL ) )
+ goto fail;
return frame;
fail:
- x264_frame_delete( frame );
+ x264_free( frame );
return NULL;
}
void x264_frame_delete( x264_frame_t *frame )
{
int i, j;
- for( i = 0; i < 4; i++ )
- x264_free( frame->buffer[i] );
- for( i = 0; i < 4; i++ )
- x264_free( frame->buffer_lowres[i] );
- for( i = 0; i < X264_BFRAME_MAX+2; i++ )
- for( j = 0; j < X264_BFRAME_MAX+2; j++ )
- x264_free( frame->i_row_satds[i][j] );
- for( j = 0; j < 2; j++ )
- for( i = 0; i <= X264_BFRAME_MAX; i++ )
- {
- x264_free( frame->lowres_mvs[j][i] );
- x264_free( frame->lowres_mv_costs[j][i] );
- }
- x264_free( frame->f_qp_offset );
- x264_free( frame->i_inv_qscale_factor );
- x264_free( frame->i_intra_cost );
- x264_free( frame->i_row_bits );
- x264_free( frame->i_row_qp );
- x264_free( frame->mb_type );
- x264_free( frame->mv[0] );
- x264_free( frame->mv[1] );
- x264_free( frame->ref[0] );
- x264_free( frame->ref[1] );
- x264_pthread_mutex_destroy( &frame->mutex );
- x264_pthread_cond_destroy( &frame->cv );
+ /* Duplicate frames are blank copies of real frames (including pointers),
+ * so freeing those pointers would cause a double free later. */
+ if( !frame->b_duplicate )
+ {
+ for( i = 0; i < 4; i++ )
+ x264_free( frame->buffer[i] );
+ for( i = 0; i < 4; i++ )
+ x264_free( frame->buffer_lowres[i] );
+ for( i = 0; i < X264_BFRAME_MAX+2; i++ )
+ for( j = 0; j < X264_BFRAME_MAX+2; j++ )
+ x264_free( frame->i_row_satds[i][j] );
+ for( j = 0; j < 2; j++ )
+ for( i = 0; i <= X264_BFRAME_MAX; i++ )
+ {
+ x264_free( frame->lowres_mvs[j][i] );
+ x264_free( frame->lowres_mv_costs[j][i] );
+ }
+ x264_free( frame->i_propagate_cost );
+ for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
+ for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
+ {
+ x264_free( frame->lowres_costs[j][i] );
+ x264_free( frame->lowres_inter_types[j][i] );
+ }
+ x264_free( frame->f_qp_offset );
+ x264_free( frame->f_qp_offset_aq );
+ x264_free( frame->i_inv_qscale_factor );
+ x264_free( frame->i_row_bits );
+ x264_free( frame->i_row_qp );
+ x264_free( frame->mb_type );
+ x264_free( frame->mv[0] );
+ x264_free( frame->mv[1] );
+ x264_free( frame->ref[0] );
+ x264_free( frame->ref[1] );
+ x264_pthread_mutex_destroy( &frame->mutex );
+ x264_pthread_cond_destroy( &frame->cv );
+ }
x264_free( frame );
}
@@ -194,7 +223,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
dst->i_type = src->i_type;
dst->i_qpplus1 = src->i_qpplus1;
- dst->i_pts = src->i_pts;
+ dst->i_pts = dst->i_reordered_pts = src->i_pts;
+ dst->param = src->param;
for( i=0; i<3; i++ )
{
@@ -298,7 +328,7 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
int i;
for( i = 0; i < 4; i++ )
- plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
+ plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -309,8 +339,8 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
int i_subsample = i ? 1 : 0;
int i_width = h->param.i_width >> i_subsample;
int i_height = h->param.i_height >> i_subsample;
- int i_padx = ( h->sps->i_mb_width * 16 - h->param.i_width ) >> i_subsample;
- int i_pady = ( h->sps->i_mb_height * 16 - h->param.i_height ) >> i_subsample;
+ int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
+ int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
if( i_padx )
{
@@ -631,9 +661,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int stride2y = stridey << b_interlaced;
int strideuv = h->fdec->i_stride[1];
int stride2uv = strideuv << b_interlaced;
+ uint8_t (*nnz_backup)[16] = h->scratch_buffer;
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
- munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
+ munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
{
@@ -698,10 +729,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
{\
/* *** Get bS for each 4px for the current edge *** */\
if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
- *(uint32_t*)bS = 0x03030303;\
+ M32( bS ) = 0x03030303;\
else\
{\
- *(uint32_t*)bS = 0x00000000;\
+ M32( bS ) = 0x00000000;\
for( i = 0; i < 4; i++ )\
{\
int x = i_dir == 0 ? i_edge : i;\
@@ -717,15 +748,20 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
bS[i] = bS[i-1];\
else\
{\
- /* FIXME: A given frame may occupy more than one position in\
- * the reference list. So we should compare the frame numbers,\
- * not the indices in the ref list.\
- * No harm yet, as we don't generate that case.*/\
int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
int i4p= mb_4x4+x+y*s4x4;\
int i4q= mbn_4x4+xn+yn*s4x4;\
- if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
+ int refs_equal;\
+ /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
+ if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
+ refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
+ else if( !h->mb.b_interlaced )\
+ refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
+ else\
+ refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
+ && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
+ if((!refs_equal ||\
abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
(h->sh.i_type == SLICE_TYPE_B &&\
@@ -747,7 +783,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
{\
int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
- DECLARE_ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
+ ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
if( i_edge )\
i_edge+= b_8x8_transform;\
else\
@@ -767,7 +803,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
goto end##i_dir;\
}\
DEBLOCK_STRENGTH(i_dir);\
- if( *(uint32_t*)bS )\
+ if( M32( bS ) )\
FILTER_DIR( , i_dir);\
end##i_dir:\
i_edge += b_8x8_transform+1;\
@@ -778,7 +814,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
{\
DEBLOCK_STRENGTH(i_dir);\
- if( *(uint32_t*)bS )\
+ if( M32( bS ) )\
FILTER_DIR( , i_dir);\
}\
}
@@ -788,7 +824,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
}
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
- munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
+ munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
}
void x264_frame_deblock( x264_t *h )
@@ -832,6 +868,13 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta,
void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC
+#ifdef HAVE_ARMV6
+void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
+#endif
+
void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
{
pf->deblock_v_luma = deblock_v_luma_c;
@@ -873,6 +916,16 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_h_luma = x264_deblock_h_luma_altivec;
}
#endif // ARCH_PPC
+
+#ifdef HAVE_ARMV6
+ if( cpu&X264_CPU_NEON )
+ {
+ pf->deblock_v_luma = x264_deblock_v_luma_neon;
+ pf->deblock_h_luma = x264_deblock_h_luma_neon;
+ pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
+ pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
+ }
+#endif
}
@@ -937,20 +990,49 @@ void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
assert( frame->i_reference_count > 0 );
frame->i_reference_count--;
if( frame->i_reference_count == 0 )
- x264_frame_push( h->frames.unused, frame );
- assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
+ x264_frame_push( h->frames.unused[frame->b_fdec], frame );
}
-x264_frame_t *x264_frame_pop_unused( x264_t *h )
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
{
x264_frame_t *frame;
- if( h->frames.unused[0] )
- frame = x264_frame_pop( h->frames.unused );
+ if( h->frames.unused[b_fdec][0] )
+ frame = x264_frame_pop( h->frames.unused[b_fdec] );
else
- frame = x264_frame_new( h );
- assert( frame->i_reference_count == 0 );
+ frame = x264_frame_new( h, b_fdec );
+ if( !frame )
+ return NULL;
+ frame->b_last_minigop_bframe = 0;
frame->i_reference_count = 1;
frame->b_intra_calculated = 0;
+ frame->b_scenecut = 1;
+ frame->b_keyframe = 0;
+
+ memset( frame->weight, 0, sizeof(frame->weight) );
+ memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
+
+ return frame;
+}
+
+void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
+{
+ assert( frame->i_reference_count > 0 );
+ frame->i_reference_count--;
+ if( frame->i_reference_count == 0 )
+ x264_frame_push( h->frames.blank_unused, frame );
+}
+
+x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
+{
+ x264_frame_t *frame;
+ if( h->frames.blank_unused[0] )
+ frame = x264_frame_pop( h->frames.blank_unused );
+ else
+ frame = x264_malloc( sizeof(x264_frame_t) );
+ if( !frame )
+ return NULL;
+ frame->b_duplicate = 1;
+ frame->i_reference_count = 1;
return frame;
}
@@ -973,3 +1055,63 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
}
} while( !b_ok );
}
+
+void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+ int i_width, int i_height, x264_weight_t *w )
+{
+ int x;
+ /* Weight horizontal strips of height 16. This was found to be the optimal height
+ * in terms of the cache loads. */
+ while( i_height > 0 )
+ {
+ for( x = 0; x < i_width; x += 16 )
+ w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
+ i_height -= 16;
+ dst += 16 * i_dst_stride;
+ src += 16 * i_src_stride;
+ }
+}
+
+void x264_frame_delete_list( x264_frame_t **list )
+{
+ int i = 0;
+ if( !list )
+ return;
+ while( list[i] )
+ x264_frame_delete( list[i++] );
+ x264_free( list );
+}
+
+int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
+{
+ if( max_size < 0 )
+ return -1;
+ slist->i_max_size = max_size;
+ slist->i_size = 0;
+ CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
+ if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
+ x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
+ x264_pthread_cond_init( &slist->cv_empty, NULL ) )
+ return -1;
+ return 0;
+fail:
+ return -1;
+}
+
+void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
+{
+ x264_pthread_mutex_destroy( &slist->mutex );
+ x264_pthread_cond_destroy( &slist->cv_fill );
+ x264_pthread_cond_destroy( &slist->cv_empty );
+ x264_frame_delete_list( slist->list );
+}
+
+void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
+{
+ x264_pthread_mutex_lock( &slist->mutex );
+ while( slist->i_size == slist->i_max_size )
+ x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
+ slist->list[ slist->i_size++ ] = frame;
+ x264_pthread_mutex_unlock( &slist->mutex );
+ x264_pthread_cond_broadcast( &slist->cv_fill );
+}
diff --git a/common/frame.h b/common/frame.h
index aad77f5..b1852b3 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -28,16 +28,24 @@
#define PADH 32
#define PADV 32
-typedef struct
+typedef struct x264_frame
{
/* */
int i_poc;
int i_type;
int i_qpplus1;
int64_t i_pts;
- int i_frame; /* Presentation frame number */
- int i_frame_num; /* Coded frame number */
+ int64_t i_reordered_pts;
+ x264_param_t *param;
+
+ int i_frame; /* Presentation frame number */
+ int i_coded; /* Coded frame number */
+ int i_frame_num; /* 7.4.3 frame_num */
int b_kept_as_ref;
+ int b_keyframe;
+ uint8_t b_fdec;
+ uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
+ uint8_t i_bframes; /* number of bframes following this nonb in coded order */
float f_qp_avg_rc; /* QPs as decided by ratecontrol */
float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
@@ -59,10 +67,18 @@ typedef struct
uint8_t *buffer[4];
uint8_t *buffer_lowres[4];
+ x264_weight_t weight[16][3]; /* [ref_index][plane] */
+ uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
+ int b_duplicate;
+ struct x264_frame *orig;
+
/* motion data */
int8_t *mb_type;
int16_t (*mv[2])[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+ uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+ /* Actually a width-2 bitfield with 4 values per uint8_t. */
+ uint8_t (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
@@ -81,18 +97,44 @@ typedef struct
int *i_row_bits;
int *i_row_qp;
float *f_qp_offset;
+ float *f_qp_offset_aq;
int b_intra_calculated;
uint16_t *i_intra_cost;
+ uint16_t *i_propagate_cost;
uint16_t *i_inv_qscale_factor;
+ int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
+ float f_weighted_cost_delta[X264_BFRAME_MAX+2];
+ uint32_t i_pixel_sum;
+ uint64_t i_pixel_ssd;
+
+ /* vbv */
+ uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
+ int i_planned_satd[X264_LOOKAHEAD_MAX+1];
/* threading */
int i_lines_completed; /* in pixels */
+ int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
+ /* periodic intra refresh */
+ float f_pir_position;
+ int i_pir_start_col;
+ int i_pir_end_col;
} x264_frame_t;
+/* synchronized frame list */
+typedef struct
+{
+ x264_frame_t **list;
+ int i_max_size;
+ int i_size;
+ x264_pthread_mutex_t mutex;
+ x264_pthread_cond_t cv_fill; /* event signaling that the list became fuller */
+ x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
+} x264_synch_frame_list_t;
+
typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
typedef struct
@@ -107,7 +149,7 @@ typedef struct
x264_deblock_intra_t deblock_h_chroma_intra;
} x264_deblock_function_t;
-x264_frame_t *x264_frame_new( x264_t *h );
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
void x264_frame_delete( x264_frame_t *frame );
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
@@ -133,8 +175,18 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_shift( x264_frame_t **list );
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
-x264_frame_t *x264_frame_pop_unused( x264_t *h );
+void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
+x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
+void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+ int i_width, int i_height, x264_weight_t *w );
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts );
+void x264_frame_delete_list( x264_frame_t **list );
+
+int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int nelem );
+void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist );
+void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame );
+
#define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
#define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
diff --git a/common/macroblock.c b/common/macroblock.c
index 836d203..10f09ac 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -33,7 +33,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
int16_t *mv_a = h->mb.cache.mv[i_list][i8 - 1];
int i_refb = h->mb.cache.ref[i_list][i8 - 8];
int16_t *mv_b = h->mb.cache.mv[i_list][i8 - 8];
- int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width ];
+ int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width];
int i_count = 0;
@@ -50,7 +50,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
{
if( i_refb == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_b;
+ CP32( mvp, mv_b );
return;
}
}
@@ -58,7 +58,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
{
if( i_refa == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
return;
}
}
@@ -69,7 +69,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
{
if( i_refa == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
return;
}
}
@@ -77,7 +77,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
{
if( i_refc == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_c;
+ CP32( mvp, mv_c );
return;
}
}
@@ -95,14 +95,14 @@ median:
else if( i_count == 1 )
{
if( i_refa == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else if( i_refb == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_b;
+ CP32( mvp, mv_b );
else
- *(uint32_t*)mvp = *(uint32_t*)mv_c;
+ CP32( mvp, mv_c );
}
else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else
goto median;
}
@@ -136,14 +136,14 @@ median:
else if( i_count == 1 )
{
if( i_refa == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else if( i_refb == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_b;
+ CP32( mvp, mv_b );
else
- *(uint32_t*)mvp = *(uint32_t*)mv_c;
+ CP32( mvp, mv_c );
}
else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else
goto median;
}
@@ -157,10 +157,10 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
if( i_refa == -2 || i_refb == -2 ||
- !( i_refa | *(uint32_t*)mv_a ) ||
- !( i_refb | *(uint32_t*)mv_b ) )
+ !( i_refa | M32( mv_a ) ) ||
+ !( i_refb | M32( mv_b ) ) )
{
- *(uint32_t*)mv = 0;
+ M32( mv ) = 0;
}
else
{
@@ -173,7 +173,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
int i8;
- const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
+ const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
@@ -190,7 +190,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
const int x8 = i8%2;
const int y8 = i8/2;
const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
- const int i_ref = h->mb.map_col_to_list0[ h->fref1[0]->ref[0][ i_part_8x8 ] ];
+ const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
if( i_ref >= 0 )
{
@@ -221,7 +221,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
{
int ref[2];
- DECLARE_ALIGNED_8( int16_t mv[2][2] );
+ ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
int i_list;
int i8;
const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
@@ -259,11 +259,12 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
if( ref[0] >= 0 )
x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
else
- *(uint32_t*)mv[0] = 0;
+ M32( mv[0] ) = 0;
+
if( ref[1] >= 0 )
x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
else
- *(uint32_t*)mv[1] = 0;
+ M32( mv[1] ) = 0;
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
@@ -325,56 +326,58 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
if( b_changed != NULL && b_available )
{
- int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
- if( IS_INTRA(type_col) || type_col == P_SKIP )
+ int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
+ int changed = 0;
+
+ if( IS_INTRA( type_col ) || type_col == P_SKIP )
{
- *b_changed = h->mb.cache.direct_ref[0][0] != h->mb.cache.ref[0][X264_SCAN8_0]
- || h->mb.cache.direct_ref[1][0] != h->mb.cache.ref[1][X264_SCAN8_0]
- || *(uint32_t*)h->mb.cache.direct_mv[0][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[0][X264_SCAN8_0]
- || *(uint32_t*)h->mb.cache.direct_mv[1][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[1][X264_SCAN8_0];
+ changed |= M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][X264_SCAN8_0] );
+ changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][X264_SCAN8_0] );
+ changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][X264_SCAN8_0];
+ changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][X264_SCAN8_0];
}
else
{
- int i, l;
- *b_changed = 0;
+ int l;
for( l = 0; l < 2; l++ )
- for( i = 0; i < 4; i++ )
- *b_changed |= h->mb.cache.direct_ref[l][i] != h->mb.cache.ref[l][x264_scan8[i*4]];
- *b_changed = *b_changed || memcmp(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+ {
+ changed |= M32( h->mb.cache.direct_mv[l][0] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 0]] );
+ if( changed ) break;
+ changed |= M32( h->mb.cache.direct_mv[l][1] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 4]] );
+ changed |= M32( h->mb.cache.direct_mv[l][2] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 8]] );
+ changed |= M32( h->mb.cache.direct_mv[l][3] ) ^ M32( h->mb.cache.mv[l][x264_scan8[12]] );
+ if( changed ) break;
+ changed |= h->mb.cache.direct_ref[l][0] ^ h->mb.cache.ref[l][x264_scan8[ 0]];
+ changed |= h->mb.cache.direct_ref[l][1] ^ h->mb.cache.ref[l][x264_scan8[ 4]];
+ changed |= h->mb.cache.direct_ref[l][2] ^ h->mb.cache.ref[l][x264_scan8[ 8]];
+ changed |= h->mb.cache.direct_ref[l][3] ^ h->mb.cache.ref[l][x264_scan8[12]];
+ }
}
- if( !*b_changed )
+ *b_changed = changed;
+ if( !changed )
return b_available;
}
/* cache ref & mv */
if( b_available )
{
- int i, l;
+ int l;
for( l = 0; l < 2; l++ )
- for( i = 0; i < 4; i++ )
- h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
- h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+ {
+ CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
+ CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
+ CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
+ CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
+ h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
+ h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
+ h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
+ h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
+ }
}
return b_available;
}
-void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
-{
- const int x = 2*(idx%2);
- const int y = 2*(idx/2);
- x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
- x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
- *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] =
- *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]];
- *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] =
- *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8];
- *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] =
- *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]];
- *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] =
- *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8];
-}
-
/* This just improves encoder performance, it's not part of the spec */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
{
@@ -382,7 +385,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
int i = 0;
#define SET_MVP(mvp) { \
- *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
+ CP32( mvc[i], mvp ); \
i++; \
}
@@ -397,7 +400,11 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
{
int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
: h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
- if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+ if( lowres_mv[0][0] != 0x7fff )
+ {
+ M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
+ i++;
+ }
}
/* spatial predictors */
@@ -462,72 +469,83 @@ static void setup_inverse_delta_pocs( x264_t *h )
}
}
-static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref = h->mb.cache.ref[0][i8];
- const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
- mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+ mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
// chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
+ if( h->sh.weight[i_ref][1].weightfn )
+ h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->sh.weight[i_ref][1], height*2 );
+
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
+
+ if( h->sh.weight[i_ref][2].weightfn )
+ h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->sh.weight[i_ref][2],height*2 );
+
}
-static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref = h->mb.cache.ref[1][i8];
- const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
- mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+ mvx, mvy, 4*width, 4*height, weight_none );
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
}
-static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
- const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
- int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
+ int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
- DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
- DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, tmp1,[16*16] );
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
- mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+ mvx0, mvy0, 4*width, 4*height, weight_none );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
- mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ mvx1, mvy1, 4*width, 4*height, weight_none );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
@@ -536,14 +554,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
- h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
- h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
}
@@ -683,7 +701,6 @@ int x264_macroblock_cache_init( x264_t *h )
/* all coeffs */
CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) );
- CHECKED_MALLOC( h->mb.nnz_backup, h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t) );
if( h->param.b_cabac )
{
@@ -694,15 +711,61 @@ int x264_macroblock_cache_init( x264_t *h )
for( i=0; i<2; i++ )
{
- int i_refs = X264_MIN(16, (i ? 1 : h->param.i_frame_reference) + h->param.b_bframe_pyramid) << h->param.b_interlaced;
+ int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ i_refs = X264_MIN(16, i_refs + 2); //smart weights add two duplicate frames
+ else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
+ i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
+
for( j=0; j < i_refs; j++ )
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
}
+ if( h->param.analyse.i_weighted_pred )
+ {
+ int i_padv = PADV << h->param.b_interlaced;
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+ int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
+ int i_stride, luma_plane_size;
+ int numweightbuf;
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
+ {
+ // only need buffer for lookahead
+ if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
+ {
+ // Fake analysis only works on lowres
+ i_stride = ALIGN( h->sps->i_mb_width*8 + 2*PADH, align );
+ luma_plane_size = i_stride * (h->sps->i_mb_height*8+2*i_padv);
+ // Only need 1 buffer for analysis
+ numweightbuf = 1;
+ }
+ else
+ numweightbuf = 0;
+ }
+ else
+ {
+ i_stride = ALIGN( h->sps->i_mb_width*16 + 2*PADH, align );
+ luma_plane_size = i_stride * (h->sps->i_mb_height*16+2*i_padv);
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ //SMART can weight one ref and one offset -1
+ numweightbuf = 2;
+ else
+ //blind only has one weighted copy (offset -1)
+ numweightbuf = 1;
+ }
+
+ for( i = 0; i < numweightbuf; i++ )
+ CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
+#undef ALIGN
+ }
+
for( i=0; i<=h->param.b_interlaced; i++ )
for( j=0; j<3; j++ )
{
- CHECKED_MALLOC( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
+ /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
+ CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
h->mb.intra_border_backup[i][j] += 8;
}
@@ -710,41 +773,6 @@ int x264_macroblock_cache_init( x264_t *h )
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
- /* fdec: fenc:
- * yyyyyyy
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * uuu vvv UUVV
- * uUU vVV UUVV
- * uUU vVV
- */
- h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
- h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
- h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-
- h->mb.i_neighbour4[6] =
- h->mb.i_neighbour4[9] =
- h->mb.i_neighbour4[12] =
- h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
- h->mb.i_neighbour4[3] =
- h->mb.i_neighbour4[7] =
- h->mb.i_neighbour4[11] =
- h->mb.i_neighbour4[13] =
- h->mb.i_neighbour4[15] =
- h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
-
- int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
- int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
- int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
- int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
- ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
- CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
-
return 0;
fail: return -1;
}
@@ -757,6 +785,9 @@ void x264_macroblock_cache_end( x264_t *h )
for( i=0; i<2; i++ )
for( j=0; j<32; j++ )
x264_free( h->mb.mvr[i][j] );
+ for( i=0; i<16; i++ )
+ x264_free( h->mb.p_weight_buf[i] );
+
if( h->param.b_cabac )
{
x264_free( h->mb.chroma_pred_mode );
@@ -765,12 +796,10 @@ void x264_macroblock_cache_end( x264_t *h )
}
x264_free( h->mb.intra4x4_pred_mode );
x264_free( h->mb.non_zero_count );
- x264_free( h->mb.nnz_backup );
x264_free( h->mb.mb_transform_size );
x264_free( h->mb.skipbp );
x264_free( h->mb.cbp );
x264_free( h->mb.qp );
- x264_free( h->scratch_buffer );
}
void x264_macroblock_slice_init( x264_t *h )
{
@@ -791,16 +820,16 @@ void x264_macroblock_slice_init( x264_t *h )
for( i = 0; i < h->i_ref1; i++ )
h->fdec->ref_poc[1][i] = h->fref1[i]->i_poc;
- h->mb.map_col_to_list0[-1] = -1;
- h->mb.map_col_to_list0[-2] = -2;
+ map_col_to_list0(-1) = -1;
+ map_col_to_list0(-2) = -2;
for( i = 0; i < h->fref1[0]->i_ref[0]; i++ )
{
int poc = h->fref1[0]->ref_poc[0][i];
- h->mb.map_col_to_list0[i] = -2;
+ map_col_to_list0(i) = -2;
for( j = 0; j < h->i_ref0; j++ )
if( h->fref0[j]->i_poc == poc )
{
- h->mb.map_col_to_list0[i] = j;
+ map_col_to_list0(i) = j;
break;
}
}
@@ -809,6 +838,37 @@ void x264_macroblock_slice_init( x264_t *h )
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
setup_inverse_delta_pocs( h );
+
+ h->mb.i_neighbour4[6] =
+ h->mb.i_neighbour4[9] =
+ h->mb.i_neighbour4[12] =
+ h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
+ h->mb.i_neighbour4[3] =
+ h->mb.i_neighbour4[7] =
+ h->mb.i_neighbour4[11] =
+ h->mb.i_neighbour4[13] =
+ h->mb.i_neighbour4[15] =
+ h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
+}
+
+void x264_macroblock_thread_init( x264_t *h )
+{
+ /* fdec: fenc:
+ * yyyyyyy
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * uuu vvv UUVV
+ * uUU vVV UUVV
+ * uUU vVV
+ */
+ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+ h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+ h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+ h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
}
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
@@ -837,8 +897,10 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
const int i_pix_offset = h->mb.b_interlaced
? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
: w * (i_mb_x + i_mb_y * i_stride);
+ const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
+ &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
- const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k;
if( h->mb.b_interlaced )
@@ -847,19 +909,25 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
h->mb.pic.p_fenc_plane[i], i_stride2, w );
- memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
- if( h->mb.b_interlaced )
- {
- const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ if( i_mb_y > 0 )
+ memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+ else
+ memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
+ if( h->mb.b_interlaced || h->mb.b_reencode_mb )
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
- }
for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
if( i == 0 )
+ {
for( k = 1; k < 4; k++ )
h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+ if( h->sh.weight[j][0].weightfn )
+ h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> h->mb.b_interlaced][ref_pix_offset[j&1]];
+ else
+ h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
+ }
}
if( h->sh.i_type == SLICE_TYPE_B )
for( j = 0; j < h->mb.pic.i_fref[1]; j++ )
@@ -894,24 +962,28 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.i_b4_xy = i_mb_4x4;
h->mb.i_mb_top_xy = i_top_xy;
h->mb.i_neighbour = 0;
+ h->mb.i_neighbour_intra = 0;
/* load cache */
if( i_top_xy >= h->sh.i_first_mb )
{
h->mb.i_mb_type_top =
- i_top_type= h->mb.type[i_top_xy];
+ i_top_type = h->mb.type[i_top_xy];
h->mb.cache.i_cbp_top = h->mb.cbp[i_top_xy];
h->mb.i_neighbour |= MB_TOP;
+ if( !h->param.b_constrained_intra || IS_INTRA( i_top_type ) )
+ h->mb.i_neighbour_intra |= MB_TOP;
+
/* load intra4x4 */
- *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
+ CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[i_top_xy][0] );
/* load non_zero_count */
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[i_top_xy][12] );
/* shift because x264_scan8[16] is misaligned */
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][18] ) << 8;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][22] ) << 8;
}
else
{
@@ -919,20 +991,12 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.cache.i_cbp_top = -1;
/* load intra4x4 */
- h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
- h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
- h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] =
- h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = -1;
+ M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
/* load non_zero_count */
- h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[1] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[4] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[5] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
- h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = 0x80808080U;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = 0x80808080U;
}
if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
@@ -944,6 +1008,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.i_neighbour |= MB_LEFT;
+ if( !h->param.b_constrained_intra || IS_INTRA( i_left_type ) )
+ h->mb.i_neighbour_intra |= MB_LEFT;
+
/* load intra4x4 */
h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][4];
h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][5];
@@ -987,6 +1054,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
h->mb.i_neighbour |= MB_TOPRIGHT;
h->mb.i_mb_type_topright = h->mb.type[ i_top_xy + 1 ];
+ if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) )
+ h->mb.i_neighbour_intra |= MB_TOPRIGHT;
}
else
h->mb.i_mb_type_topright = -1;
@@ -994,6 +1063,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
h->mb.i_neighbour |= MB_TOPLEFT;
h->mb.i_mb_type_topleft = h->mb.type[ i_top_xy - 1 ];
+ if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) )
+ h->mb.i_neighbour_intra |= MB_TOPLEFT;
}
else
h->mb.i_mb_type_topleft = -1;
@@ -1014,7 +1085,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
+ !!(h->mb.i_neighbour & MB_TOP);
}
- if( !h->mb.b_interlaced )
+ if( !h->mb.b_interlaced && !h->mb.b_reencode_mb )
{
copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
@@ -1060,13 +1131,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
const int ir = i_top_8x8 - 1;
const int iv = i_top_4x4 - 1;
h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+ CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
}
else
{
const int i8 = x264_scan8[0] - 1 - 1*8;
h->mb.cache.ref[i_list][i8] = -2;
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+ M32( h->mb.cache.mv[i_list][i8] ) = 0;
}
if( h->mb.i_neighbour & MB_TOP )
@@ -1078,15 +1149,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
h->mb.cache.ref[i_list][i8+2] =
h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
- *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
- *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
+ CP64( h->mb.cache.mv[i_list][i8+0], h->mb.mv[i_list][iv+0] );
+ CP64( h->mb.cache.mv[i_list][i8+2], h->mb.mv[i_list][iv+2] );
}
else
{
const int i8 = x264_scan8[0] - 8;
- *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
- *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
- *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
+ M64( h->mb.cache.mv[i_list][i8+0] ) = 0;
+ M64( h->mb.cache.mv[i_list][i8+2] ) = 0;
+ M32( &h->mb.cache.ref[i_list][i8] ) = (uint8_t)(-2) * 0x01010101U;
}
if( h->mb.i_neighbour & MB_TOPRIGHT )
@@ -1095,13 +1166,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
const int ir = i_top_8x8 + 2;
const int iv = i_top_4x4 + 4;
h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+ CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
}
else
{
const int i8 = x264_scan8[0] + 4 - 1*8;
h->mb.cache.ref[i_list][i8] = -2;
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+ M32( h->mb.cache.mv[i_list][i8] ) = 0;
}
if( h->mb.i_neighbour & MB_LEFT )
@@ -1114,10 +1185,10 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.cache.ref[i_list][i8+2*8] =
h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
+ CP32( h->mb.cache.mv[i_list][i8+0*8], h->mb.mv[i_list][iv + 0*s4x4] );
+ CP32( h->mb.cache.mv[i_list][i8+1*8], h->mb.mv[i_list][iv + 1*s4x4] );
+ CP32( h->mb.cache.mv[i_list][i8+2*8], h->mb.mv[i_list][iv + 2*s4x4] );
+ CP32( h->mb.cache.mv[i_list][i8+3*8], h->mb.mv[i_list][iv + 3*s4x4] );
}
else
{
@@ -1125,7 +1196,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
for( i = 0; i < 4; i++ )
{
h->mb.cache.ref[i_list][i8+i*8] = -2;
- *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
+ M32( h->mb.cache.mv[i_list][i8+i*8] ) = 0;
}
}
@@ -1135,45 +1206,49 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
const int i8 = x264_scan8[0] - 8;
const int iv = i_top_4x4;
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
+ CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
+ CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
}
else
{
const int i8 = x264_scan8[0] - 8;
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
+ M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
+ M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
}
if( i_left_type >= 0 )
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
+ CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+ CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+ CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+ CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
}
else
{
const int i8 = x264_scan8[0] - 1;
for( i = 0; i < 4; i++ )
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
+ M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
}
}
}
/* load skip */
- if( h->sh.i_type == SLICE_TYPE_B && h->param.b_cabac )
+ if( h->sh.i_type == SLICE_TYPE_B )
{
- uint8_t skipbp;
- x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
- skipbp = i_left_type >= 0 ? h->mb.skipbp[i_left_xy] : 0;
- h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
- h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
- skipbp = i_top_type >= 0 ? h->mb.skipbp[i_top_xy] : 0;
- h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
- h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
+ h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
+ if( h->param.b_cabac )
+ {
+ uint8_t skipbp;
+ x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
+ skipbp = i_left_type >= 0 ? h->mb.skipbp[i_left_xy] : 0;
+ h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
+ h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
+ skipbp = i_top_type >= 0 ? h->mb.skipbp[i_top_xy] : 0;
+ h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
+ h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
+ }
}
if( h->sh.i_type == SLICE_TYPE_P )
@@ -1181,20 +1256,20 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
}
h->mb.i_neighbour4[0] =
- h->mb.i_neighbour8[0] = (h->mb.i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT))
- | ((h->mb.i_neighbour & MB_TOP) ? MB_TOPRIGHT : 0);
+ h->mb.i_neighbour8[0] = (h->mb.i_neighbour_intra & (MB_TOP|MB_LEFT|MB_TOPLEFT))
+ | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOPRIGHT : 0);
h->mb.i_neighbour4[4] =
- h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0);
+ h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour_intra & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0);
h->mb.i_neighbour4[2] =
h->mb.i_neighbour4[8] =
h->mb.i_neighbour4[10] =
- h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
+ h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour_intra & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
h->mb.i_neighbour4[5] =
- h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour & MB_TOPRIGHT)
- | ((h->mb.i_neighbour & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
+ h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour_intra & MB_TOPRIGHT)
+ | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
}
-static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
{
int w = i ? 8 : 16;
int i_stride = h->fdec->i_stride[!!i];
@@ -1221,7 +1296,7 @@ void x264_macroblock_cache_save( x264_t *h )
int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
- int i, y;
+ int y;
x264_macroblock_store_pic( h, 0 );
x264_macroblock_store_pic( h, 1 );
@@ -1235,13 +1310,16 @@ void x264_macroblock_cache_save( x264_t *h )
/* save intra4x4 */
if( i_mb_type == I_4x4 )
{
- *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
- *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
- h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
- h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+ CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+ M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
}
+ else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
+ M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
else
- *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
+ M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
+
if( i_mb_type == I_PCM )
{
@@ -1251,20 +1329,19 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.i_cbp_luma = 0xf;
h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
h->mb.b_transform_8x8 = 0;
- for( i = 0; i < 16 + 2*4; i++ )
- non_zero_count[i] = 16;
+ memset( non_zero_count, 16, 24 );
}
else
{
/* save non zero count */
- *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
- *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
- *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
- *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
- *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
- *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
- *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
- *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+ CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+ CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+ CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+ CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+ M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+ M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+ M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+ M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
h->mb.i_qp = h->mb.i_last_qp;
@@ -1287,8 +1364,8 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
}
if( h->sh.i_type == SLICE_TYPE_B )
{
@@ -1298,8 +1375,8 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
}
}
}
@@ -1308,12 +1385,12 @@ void x264_macroblock_cache_save( x264_t *h )
int i_list;
for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
{
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+ M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+ M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+ M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
+ M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
}
}
}
@@ -1330,28 +1407,28 @@ void x264_macroblock_cache_save( x264_t *h )
{
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
}
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
}
}
else
{
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+ M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
+ M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
}
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
+ M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
+ M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
}
}
@@ -1373,45 +1450,50 @@ void x264_macroblock_cache_save( x264_t *h )
}
}
+
void x264_macroblock_bipred_init( x264_t *h )
{
- int i_ref0, i_ref1;
- for( i_ref0 = 0; i_ref0 < h->i_ref0; i_ref0++ )
- {
- int poc0 = h->fref0[i_ref0]->i_poc;
- for( i_ref1 = 0; i_ref1 < h->i_ref1; i_ref1++ )
+ int i_ref0, i_ref1, field;
+ for( field = 0; field <= h->sh.b_mbaff; field++ )
+ for( i_ref0 = 0; i_ref0 < (h->i_ref0<<h->sh.b_mbaff); i_ref0++ )
{
- int dist_scale_factor;
- int poc1 = h->fref1[i_ref1]->i_poc;
- int td = x264_clip3( poc1 - poc0, -128, 127 );
- if( td == 0 /* || pic0 is a long-term ref */ )
- dist_scale_factor = 256;
- else
+ int poc0 = h->fref0[i_ref0>>h->sh.b_mbaff]->i_poc;
+ if( h->sh.b_mbaff && field^(i_ref0&1) )
+ poc0 += h->sh.i_delta_poc_bottom;
+ for( i_ref1 = 0; i_ref1 < (h->i_ref1<<h->sh.b_mbaff); i_ref1++ )
{
- int tb = x264_clip3( h->fdec->i_poc - poc0, -128, 127 );
- int tx = (16384 + (abs(td) >> 1)) / td;
- dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
- }
- h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
+ int dist_scale_factor;
+ int poc1 = h->fref1[i_ref1>>h->sh.b_mbaff]->i_poc;
+ if( h->sh.b_mbaff && field^(i_ref1&1) )
+ poc1 += h->sh.i_delta_poc_bottom;
+ int cur_poc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
+ int td = x264_clip3( poc1 - poc0, -128, 127 );
+ if( td == 0 /* || pic0 is a long-term ref */ )
+ dist_scale_factor = 256;
+ else
+ {
+ int tb = x264_clip3( cur_poc - poc0, -128, 127 );
+ int tx = (16384 + (abs(td) >> 1)) / td;
+ dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
+ }
- dist_scale_factor >>= 2;
- if( h->param.analyse.b_weighted_bipred
- && dist_scale_factor >= -64
- && dist_scale_factor <= 128 )
- {
- h->mb.bipred_weight[i_ref0][i_ref1] = 64 - dist_scale_factor;
- // ssse3 implementation of biweight doesn't support the extrema.
- // if we ever generate them, we'll have to drop that optimization.
- assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+ // FIXME: will need this if we ever do temporal MV pred with interlaced
+ if( !h->sh.b_mbaff )
+ h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
+
+ dist_scale_factor >>= 2;
+ if( h->param.analyse.b_weighted_bipred
+ && dist_scale_factor >= -64
+ && dist_scale_factor <= 128 )
+ {
+ h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 64 - dist_scale_factor;
+ // ssse3 implementation of biweight doesn't support the extrema.
+ // if we ever generate them, we'll have to drop that optimization.
+ assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+ }
+ else
+ h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 32;
}
- else
- h->mb.bipred_weight[i_ref0][i_ref1] = 32;
}
- }
- if( h->sh.b_mbaff )
- {
- for( i_ref0 = 2*h->i_ref0-1; i_ref0 >= 0; i_ref0-- )
- for( i_ref1 = 2*h->i_ref1-1; i_ref1 >= 0; i_ref1-- )
- h->mb.bipred_weight[i_ref0][i_ref1] = h->mb.bipred_weight[i_ref0>>1][i_ref1>>1];
- }
}
+
diff --git a/common/macroblock.h b/common/macroblock.h
index d16b8de..48f3105 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -263,6 +263,7 @@ enum cabac_ctx_block_cat_e
int x264_macroblock_cache_init( x264_t *h );
void x264_macroblock_slice_init( x264_t *h );
+void x264_macroblock_thread_init( x264_t *h );
void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y );
void x264_macroblock_cache_save( x264_t *h );
void x264_macroblock_cache_end( x264_t *h );
@@ -291,10 +292,6 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
* if b_changed != NULL, set it to whether refs or mvs differ from
* before this functioncall. */
int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
-/* x264_mb_load_mv_direct8x8:
- * set h->mb.cache.mv and h->mb.cache.ref for B_DIRECT
- * must be called only after x264_mb_predict_mv_direct16x16 */
-void x264_mb_load_mv_direct8x8( x264_t *h, int idx );
/* x264_mb_predict_mv_ref16x16:
* set mvc with D_16x16 prediction.
* uses all neighbors, even those that didn't end up using this ref.
@@ -338,21 +335,22 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
}
static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
{
+ uint32_t *d = dst;
if( width == 4 )
{
uint32_t val2 = val * 0x01010101;
- ((uint32_t*)dst)[0] = val2;
- if( height >= 2 ) ((uint32_t*)dst)[2] = val2;
- if( height == 4 ) ((uint32_t*)dst)[4] = val2;
- if( height == 4 ) ((uint32_t*)dst)[6] = val2;
+ M32( d+0 ) = val2;
+ if( height >= 2 ) M32( d+2 ) = val2;
+ if( height == 4 ) M32( d+4 ) = val2;
+ if( height == 4 ) M32( d+6 ) = val2;
}
else // 2
{
uint32_t val2 = val * 0x0101;
- ((uint16_t*)dst)[ 0] = val2;
- if( height >= 2 ) ((uint16_t*)dst)[ 4] = val2;
- if( height == 4 ) ((uint16_t*)dst)[ 8] = val2;
- if( height == 4 ) ((uint16_t*)dst)[12] = val2;
+ M16( d+0 ) = val2;
+ if( height >= 2 ) M16( d+2 ) = val2;
+ if( height == 4 ) M16( d+4 ) = val2;
+ if( height == 4 ) M16( d+6 ) = val2;
}
}
static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
@@ -360,25 +358,27 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int
int dy;
if( width == 1 || WORD_SIZE < 8 )
{
+ uint32_t *d = dst;
for( dy = 0; dy < height; dy++ )
{
- ((uint32_t*)dst)[8*dy+0] = val;
- if( width >= 2 ) ((uint32_t*)dst)[8*dy+1] = val;
- if( width == 4 ) ((uint32_t*)dst)[8*dy+2] = val;
- if( width == 4 ) ((uint32_t*)dst)[8*dy+3] = val;
+ M32( d+8*dy+0 ) = val;
+ if( width >= 2 ) M32( d+8*dy+1 ) = val;
+ if( width == 4 ) M32( d+8*dy+2 ) = val;
+ if( width == 4 ) M32( d+8*dy+3 ) = val;
}
}
else
{
uint64_t val64 = val + ((uint64_t)val<<32);
+ uint64_t *d = dst;
for( dy = 0; dy < height; dy++ )
{
- ((uint64_t*)dst)[4*dy+0] = val64;
- if( width == 4 ) ((uint64_t*)dst)[4*dy+1] = val64;
+ M64( d+4*dy+0 ) = val64;
+ if( width == 4 ) M64( d+4*dy+1 ) = val64;
}
}
}
-#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
{
x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
@@ -401,22 +401,20 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
}
#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-#define array_non_zero_int array_non_zero_int_c
-static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
+#define array_non_zero_int array_non_zero_int
+static ALWAYS_INLINE int array_non_zero_int( int16_t *v, int i_count )
{
- union {uint16_t s[4]; uint64_t l;} *x = v;
if(i_count == 8)
- return !!x[0].l;
+ return !!M64( &v[0] );
else if(i_count == 16)
- return !!(x[0].l|x[1].l);
+ return !!(M64( &v[0] ) | M64( &v[4] ));
else if(i_count == 32)
- return !!(x[0].l|x[1].l|x[2].l|x[3].l);
+ return !!(M64( &v[0] ) | M64( &v[4] ) | M64( &v[8] ) | M64( &v[12] ));
else
{
int i;
- i_count /= sizeof(uint64_t);
- for( i = 0; i < i_count; i++ )
- if( x[i].l ) return 1;
+ for( i = 0; i < i_count; i+=4 )
+ if( M64( &v[i] ) ) return 1;
return 0;
}
}
@@ -462,7 +460,7 @@ static inline int x264_mb_transform_8x8_allowed( x264_t *h )
return 0;
if( h->mb.i_type != P_8x8 )
return partition_tab[h->mb.i_type];
- return *(uint32_t*)h->mb.i_sub_partition == D_L0_8x8*0x01010101;
+ return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
}
#endif
diff --git a/common/mc.c b/common/mc.c
index e5d6cc8..ac740cf 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -29,6 +29,9 @@
#ifdef ARCH_PPC
#include "ppc/mc.h"
#endif
+#ifdef ARCH_ARM
+#include "arm/mc.h"
+#endif
static inline void pixel_avg( uint8_t *dst, int i_dst_stride,
@@ -117,6 +120,67 @@ PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
+static void x264_weight_cache( x264_t *h, x264_weight_t *w )
+{
+ w->weightfn = h->mc.weight;
+}
+#define opscale(x) dst[x] = x264_clip_uint8( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
+#define opscale_noden(x) dst[x] = x264_clip_uint8( src[x] * weight->i_scale + weight->i_offset )
+static inline void mc_weight( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+{
+
+ int x, y;
+ if( weight->i_denom >= 1 )
+ {
+ for( y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
+ {
+ for( x = 0; x < i_width; x++ )
+ opscale( x );
+ }
+ }
+ else
+ {
+ for( y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
+ for( x = 0; x < i_width; x++ )
+ opscale_noden( x );
+ }
+}
+
+#define MC_WEIGHT_C( name, lx ) \
+ static void name( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int height ) \
+{ \
+ int x, y; \
+ if( weight->i_denom >= 1 ) \
+ { \
+ for( y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
+ for( x = 0; x < lx; x++ ) \
+ opscale( x ); \
+ } \
+ else \
+ { \
+ for( y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
+ for( x = 0; x < lx; x++ ) \
+ opscale_noden( x ); \
+ } \
+}
+
+MC_WEIGHT_C( mc_weight_w20, 20 )
+MC_WEIGHT_C( mc_weight_w16, 16 )
+MC_WEIGHT_C( mc_weight_w12, 12 )
+MC_WEIGHT_C( mc_weight_w8, 8 )
+MC_WEIGHT_C( mc_weight_w4, 4 )
+MC_WEIGHT_C( mc_weight_w2, 2 )
+
+static weight_fn_t x264_mc_weight_wtab[6] =
+{
+ mc_weight_w2,
+ mc_weight_w4,
+ mc_weight_w8,
+ mc_weight_w12,
+ mc_weight_w16,
+ mc_weight_w20,
+};
+const x264_weight_t weight_none[3] = { {{0}} };
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
int y;
@@ -160,7 +224,7 @@ static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
- int i_width, int i_height )
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -171,17 +235,19 @@ static void mc_luma( uint8_t *dst, int i_dst_stride,
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
+ if( weight->weightfn )
+ mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
}
+ else if( weight->weightfn )
+ mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
else
- {
mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
- }
}
static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
- int i_width, int i_height )
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -192,6 +258,13 @@ static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
+ if( weight->weightfn )
+ mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
+ return dst;
+ }
+ else if( weight->weightfn )
+ {
+ mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
return dst;
}
else
@@ -314,7 +387,7 @@ void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
// duplicate last row and column so that their interpolation doesn't have to be special-cased
for( y=0; y<i_height; y++ )
src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
- memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), i_width );
+ memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), i_width+1 );
h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
x264_frame_expand_border_lowres( frame );
@@ -356,6 +429,33 @@ static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
}
}
+#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
+// gcc isn't smart enough to use the "idiv" instruction
+static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y) {
+ int32_t quotient, remainder;
+ asm("idiv %4"
+ :"=a"(quotient), "=d"(remainder)
+ :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
+ );
+ return quotient;
+}
+#else
+#define div_64_32(x,y) ((x)/(y))
+#endif
+
+/* Estimate the total amount of influence on future quality that could be had if we
+ * were to improve the reference samples used to inter predict any given macroblock. */
+static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+{
+ int i;
+ for( i=0; i<len; i++ )
+ {
+ int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
+ dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
+ }
+}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
@@ -373,6 +473,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
+ pf->weight = x264_mc_weight_wtab;
+ pf->offsetadd = x264_mc_weight_wtab;
+ pf->offsetsub = x264_mc_weight_wtab;
+ pf->weight_cache = x264_weight_cache;
+
pf->copy_16x16_unaligned = mc_copy_w16;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
@@ -392,6 +497,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = integral_init4v;
pf->integral_init8v = integral_init8v;
+ pf->mbtree_propagate_cost = mbtree_propagate_cost;
+
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
@@ -399,6 +506,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
if( cpu&X264_CPU_ALTIVEC )
x264_mc_altivec_init( pf );
#endif
+#ifdef HAVE_ARMV6
+ x264_mc_init_arm( cpu, pf );
+#endif
}
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
diff --git a/common/mc.h b/common/mc.h
index 594940f..68bba48 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -21,6 +21,33 @@
#ifndef X264_MC_H
#define X264_MC_H
+struct x264_weight_t;
+typedef void (* weight_fn_t)( uint8_t *, int, uint8_t *,int, const struct x264_weight_t *, int );
+typedef struct x264_weight_t
+{
+ /* aligning the first member is a gcc hack to force the struct to be
+ * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
+ ALIGNED_16( int16_t cachea[8] );
+ int16_t cacheb[8];
+ int32_t i_denom;
+ int32_t i_scale;
+ int32_t i_offset;
+ weight_fn_t *weightfn;
+} ALIGNED_16( x264_weight_t );
+
+extern const x264_weight_t weight_none[3];
+
+#define SET_WEIGHT( w, b, s, d, o )\
+{\
+ (w).i_scale = (s);\
+ (w).i_denom = (d);\
+ (w).i_offset = (o);\
+ if( b )\
+ h->mc.weight_cache( h, &w );\
+ else\
+ w.weightfn = NULL;\
+}
+
/* Do the MC
* XXX: Only width = 4, 8 or 16 are valid
* width == 4 -> height == 4 or 8
@@ -32,12 +59,12 @@ typedef struct
{
void (*mc_luma)(uint8_t *dst, int i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
- int i_width, int i_height );
+ int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
uint8_t* (*get_ref)(uint8_t *dst, int *i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
- int i_width, int i_height );
+ int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
@@ -74,6 +101,13 @@ typedef struct
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
+ weight_fn_t *weight;
+ weight_fn_t *offsetadd;
+ weight_fn_t *offsetsub;
+ void (*weight_cache)( x264_t *, x264_weight_t * );
+
+ void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
diff --git a/common/mdate.c b/common/mdate.c
index 1a02cdf..7a1c8a5 100644
--- a/common/mdate.c
+++ b/common/mdate.c
@@ -18,7 +18,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#ifndef __MINGW32__
#include <sys/time.h>
#else
#include <sys/types.h>
@@ -31,9 +31,8 @@
int64_t x264_mdate( void )
{
-#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#ifndef __MINGW32__
struct timeval tv_date;
-
gettimeofday( &tv_date, NULL );
return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec );
#else
diff --git a/common/osdep.h b/common/osdep.h
index 168d6b2..7f680ed 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -27,6 +27,9 @@
#define _LARGEFILE_SOURCE 1
#define _FILE_OFFSET_BITS 64
#include <stdio.h>
+#include <sys/stat.h>
+
+#include "config.h"
#ifdef HAVE_STDINT_H
#include <stdint.h>
@@ -34,30 +37,18 @@
#include <inttypes.h>
#endif
+#ifndef HAVE_LOG2F
+#define log2f(x) (logf((x))/0.693147180559945f)
+#endif
+
#ifdef _WIN32
#include <io.h> // _setmode()
#include <fcntl.h> // _O_BINARY
#endif
-#ifdef _MSC_VER
-#define inline __inline
-#define strcasecmp stricmp
-#define strncasecmp strnicmp
-#define snprintf _snprintf
-#define fseek _fseeki64
-#define ftell _ftelli64
-#define isfinite _finite
-#define strtok_r strtok_s
-#define _CRT_SECURE_NO_DEPRECATE
-#define X264_VERSION "" // no configure script for msvc
-#endif
-
#if (defined(SYS_OPENBSD) && !defined(isfinite)) || defined(SYS_SunOS)
#define isfinite finite
#endif
-#if defined(_MSC_VER) || defined(SYS_SunOS) || defined(SYS_MACOSX)
-#define sqrtf sqrt
-#endif
#ifdef _WIN32
#define rename(src,dst) (unlink(dst), rename(src,dst)) // POSIX says that rename() removes the destination, but win32 doesn't.
#ifndef strtok_r
@@ -65,33 +56,63 @@
#endif
#endif
-#ifdef _MSC_VER
-#define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
-#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
+#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+
+// ARM compiliers don't reliably align stack variables
+// - EABI requires only 8 byte stack alignment to be maintained
+// - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function
+// - armcc can't either, but is nice enough to actually tell you so
+// - Apple gcc only maintains 4 byte alignment
+// - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...
+#if defined(ARCH_ARM) && defined(SYS_MACOSX)
+#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
+ uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 7]; \
+ type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+7) & ~7)
+#else
+#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
+ ALIGNED_8( type name sub1 __VA_ARGS__ )
+#endif
+
+#ifdef ARCH_ARM
+#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
+ uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 15];\
+ type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+15) & ~15)
+#else
+#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
+ ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif
-#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define DECLARE_ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
-#define DECLARE_ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNUSED __attribute__((unused))
#define ALWAYS_INLINE __attribute__((always_inline)) inline
#define NOINLINE __attribute__((noinline))
+#define MAY_ALIAS __attribute__((may_alias))
+#define x264_constant_p(x) __builtin_constant_p(x)
#else
#define UNUSED
#define ALWAYS_INLINE inline
#define NOINLINE
+#define MAY_ALIAS
+#define x264_constant_p(x) 0
#endif
/* threads */
#if defined(SYS_BEOS)
#include <kernel/OS.h>
#define x264_pthread_t thread_id
-#define x264_pthread_create(t,u,f,d) { *(t)=spawn_thread(f,"",10,d); \
- resume_thread(*(t)); }
+static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(void *), void *d )
+{
+ *t = spawn_thread( f, "", 10, d );
+ if( *t < B_NO_ERROR )
+ return -1;
+ resume_thread( *t );
+ return 0;
+}
#define x264_pthread_join(t,s) { long tmp; \
- wait_for_thread(t,(s)?(long*)(s):&tmp); }
+ wait_for_thread(t,(s)?(long*)(*(s)):&tmp); }
#ifndef usleep
#define usleep(t) snooze(t)
#endif
@@ -103,7 +124,7 @@
#else
#define x264_pthread_t int
-#define x264_pthread_create(t,u,f,d)
+#define x264_pthread_create(t,u,f,d) 0
#define x264_pthread_join(t,s)
#endif //SYS_*
@@ -121,39 +142,53 @@
#define x264_pthread_cond_destroy pthread_cond_destroy
#define x264_pthread_cond_broadcast pthread_cond_broadcast
#define x264_pthread_cond_wait pthread_cond_wait
+#define x264_pthread_attr_t pthread_attr_t
+#define x264_pthread_attr_init pthread_attr_init
+#define x264_pthread_attr_destroy pthread_attr_destroy
+#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#else
#define x264_pthread_mutex_t int
-#define x264_pthread_mutex_init(m,f)
+#define x264_pthread_mutex_init(m,f) 0
#define x264_pthread_mutex_destroy(m)
#define x264_pthread_mutex_lock(m)
#define x264_pthread_mutex_unlock(m)
#define x264_pthread_cond_t int
-#define x264_pthread_cond_init(c,f)
+#define x264_pthread_cond_init(c,f) 0
#define x264_pthread_cond_destroy(c)
#define x264_pthread_cond_broadcast(c)
#define x264_pthread_cond_wait(c,m)
+#define x264_pthread_attr_t int
+#define x264_pthread_attr_init(a) 0
+#define x264_pthread_attr_destroy(a)
+#define X264_PTHREAD_MUTEX_INITIALIZER 0
#endif
#define WORD_SIZE sizeof(void*)
+#define asm __asm__
+
#if !defined(_WIN64) && !defined(__LP64__)
-#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER)
#define BROKEN_STACK_ALIGNMENT /* define it if stack is not mod16 */
#endif
#endif
#ifdef WORDS_BIGENDIAN
#define endian_fix(x) (x)
+#define endian_fix64(x) (x)
#define endian_fix32(x) (x)
-#elif defined(__GNUC__) && defined(HAVE_MMX)
+#define endian_fix16(x) (x)
+#else
+#if defined(__GNUC__) && defined(HAVE_MMX)
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
asm("bswap %0":"+r"(x));
return x;
}
-static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+#elif defined(__GNUC__) && defined(HAVE_ARMV6)
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
- asm("bswap %0":"+r"(x));
+ asm("rev %0, %0":"+r"(x));
return x;
}
#else
@@ -161,12 +196,26 @@ static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24);
}
+#endif
+#if defined(__GNUC__) && defined(ARCH_X86_64)
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
+{
+ asm("bswap %0":"+r"(x));
+ return x;
+}
+#else
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
+{
+ return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32);
+}
+#endif
static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
{
- if( WORD_SIZE == 8 )
- return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32);
- else
- return endian_fix32(x);
+ return WORD_SIZE == 8 ? endian_fix64(x) : endian_fix32(x);
+}
+static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x )
+{
+ return (x<<8)|(x>>8);
}
#endif
@@ -176,7 +225,7 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
static int ALWAYS_INLINE x264_clz( uint32_t x )
{
static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
- int y, z = ((x - 0x10000) >> 27) & 16;
+ int y, z = (((x >> 16) - 1) >> 27) & 16;
x >>= z^16;
z += y = ((x - 0x100) >> 28) & 8;
x >>= y^8;
@@ -186,4 +235,31 @@ static int ALWAYS_INLINE x264_clz( uint32_t x )
}
#endif
+#ifdef USE_REAL_PTHREAD
+#ifdef SYS_MINGW
+#define x264_lower_thread_priority(p)\
+{\
+ x264_pthread_t handle = pthread_self();\
+ struct sched_param sp;\
+ int policy = SCHED_OTHER;\
+ pthread_getschedparam( handle, &policy, &sp );\
+ sp.sched_priority -= p;\
+ pthread_setschedparam( handle, policy, &sp );\
+}
+#else
+#include <unistd.h>
+#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); }
+#endif /* USE_REAL_PTHREAD */
+#else
+#define x264_lower_thread_priority(p)
+#endif
+
+static inline uint8_t x264_is_regular_file( FILE *filehandle )
+{
+ struct stat file_stat;
+ if( fstat( fileno( filehandle ), &file_stat ) )
+ return 0;
+ return S_ISREG( file_stat.st_mode );
+}
+
#endif /* X264_OSDEP_H */
diff --git a/common/pixel.c b/common/pixel.c
index 5932f07..7c60237 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -29,6 +29,9 @@
#ifdef ARCH_PPC
# include "ppc/pixel.h"
#endif
+#ifdef ARCH_ARM
+# include "arm/pixel.h"
+#endif
#ifdef ARCH_UltraSparc
# include "sparc/pixel.h"
#endif
@@ -139,10 +142,10 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
-#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride ) \
+#define PIXEL_VAR_C( name, w ) \
+static uint64_t name( uint8_t *pix, int i_stride ) \
{ \
- uint32_t var = 0, sum = 0, sqr = 0; \
+ uint32_t sum = 0, sqr = 0; \
int x, y; \
for( y = 0; y < w; y++ ) \
{ \
@@ -153,12 +156,35 @@ static int name( uint8_t *pix, int i_stride ) \
} \
pix += i_stride; \
} \
- var = sqr - (sum * sum >> shift); \
- return var; \
+ return sum + ((uint64_t)sqr << 32); \
}
-PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
-PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8 )
+
+/****************************************************************************
+ * pixel_var2_wxh
+ ****************************************************************************/
+static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
+{
+ uint32_t var = 0, sum = 0, sqr = 0;
+ int x, y;
+ for( y = 0; y < 8; y++ )
+ {
+ for( x = 0; x < 8; x++ )
+ {
+ int diff = pix1[x] - pix2[x];
+ sum += diff;
+ sqr += diff * diff;
+ }
+ pix1 += i_stride1;
+ pix2 += i_stride2;
+ }
+ sum = abs(sum);
+ var = sqr - (sum * sum >> 6);
+ *ssd = sqr;
+ return var;
+}
#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
@@ -429,6 +455,10 @@ SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
#endif
+#ifdef HAVE_ARMV6
+SATD_X_DECL7( _neon )
+#endif
+
/****************************************************************************
* structural similarity metric
****************************************************************************/
@@ -611,6 +641,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
+ pixf->var2_8x8 = pixel_var2_8x8;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
@@ -636,6 +667,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
+ pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
if( cpu&X264_CPU_CACHELINE_32 )
{
@@ -682,6 +714,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
+ pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -761,6 +794,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
+ pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
@@ -787,6 +821,47 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
#endif //HAVE_MMX
+#ifdef HAVE_ARMV6
+ if( cpu&X264_CPU_ARMV6 )
+ {
+ pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
+ pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
+ pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
+ pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
+ }
+ if( cpu&X264_CPU_NEON )
+ {
+ INIT5( sad, _neon );
+ INIT5( sad_aligned, _neon );
+ INIT7( sad_x3, _neon );
+ INIT7( sad_x4, _neon );
+ INIT7( ssd, _neon );
+ INIT7( satd, _neon );
+ INIT7( satd_x3, _neon );
+ INIT7( satd_x4, _neon );
+ INIT4( hadamard_ac, _neon );
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
+ pixf->var2_8x8 = x264_pixel_var2_8x8_neon;
+
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
+
+ if( cpu&X264_CPU_FAST_NEON_MRC )
+ {
+ pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
+ pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
+ pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
+ pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
+ }
+ else // really just scheduled for dual issue / A8
+ {
+ INIT5( sad_aligned, _neon_dual );
+ }
+ }
+#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
diff --git a/common/pixel.h b/common/pixel.h
index 207c74f..1102642 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -73,8 +73,9 @@ typedef struct
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+ int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
- int (*var[4])( uint8_t *pix, int stride );
+ uint64_t (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index 3531812..14171e6 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -41,7 +41,7 @@
static inline void write16x4(uint8_t *dst, int dst_stride,
register vec_u8_t r0, register vec_u8_t r1,
register vec_u8_t r2, register vec_u8_t r3) {
- DECLARE_ALIGNED_16(unsigned char result[64]);
+ ALIGNED_16(unsigned char result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4;
@@ -220,7 +220,7 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
}
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
- DECLARE_ALIGNED_16(unsigned char temp[16]); \
+ ALIGNED_16(unsigned char temp[16]); \
register vec_u8_t alphavec; \
register vec_u8_t betavec; \
register vec_u8_t mask; \
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 56ec9c1..a588d8f 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -181,7 +181,7 @@ static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst,
static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
- int i_width, int i_height )
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -201,8 +201,11 @@ static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
default:
x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
}
-
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
+ else if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
else
{
switch(i_width) {
@@ -224,7 +227,7 @@ static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
- int i_width, int i_height )
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -248,6 +251,13 @@ static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
break;
}
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+ return dst;
+ }
+ else if( weight->weightfn )
+ {
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
return dst;
}
else
@@ -303,7 +313,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
- DECLARE_ALIGNED_16( uint16_t coeff[4] );
+ ALIGNED_16( uint16_t coeff[4] );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
@@ -384,7 +394,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
- DECLARE_ALIGNED_16( uint16_t coeff[4] );
+ ALIGNED_16( uint16_t coeff[4] );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 360e71d..64d4c49 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -33,7 +33,7 @@ static int name( uint8_t *pix1, int i_pix1, \
uint8_t *pix2, int i_pix2 ) \
{ \
int y; \
- DECLARE_ALIGNED_16( int sum ); \
+ ALIGNED_16( int sum ); \
\
LOAD_ZERO; \
PREP_LOAD; \
@@ -118,7 +118,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
PREP_LOAD_SRC( pix1 );
@@ -163,7 +163,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v;
@@ -217,7 +217,7 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
@@ -271,7 +271,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
@@ -331,7 +331,7 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
@@ -415,7 +415,7 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
LOAD_ZERO;
PREP_LOAD;
@@ -499,7 +499,7 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
LOAD_ZERO;
PREP_LOAD;
@@ -630,10 +630,10 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
@@ -751,9 +751,9 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
@@ -846,10 +846,10 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
@@ -964,9 +964,9 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
@@ -1062,10 +1062,10 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
@@ -1183,9 +1183,9 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
@@ -1283,10 +1283,10 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
@@ -1404,9 +1404,9 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
@@ -1506,7 +1506,7 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
- DECLARE_ALIGNED_16( int sum );
+ ALIGNED_16( int sum );
int y;
LOAD_ZERO;
@@ -1586,7 +1586,7 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
- DECLARE_ALIGNED_16( int sum );
+ ALIGNED_16( int sum );
int y;
LOAD_ZERO;
@@ -1636,10 +1636,10 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
/****************************************************************************
* variance
****************************************************************************/
-static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
- DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
- DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
+ ALIGNED_16(uint32_t sum_tab[4]);
+ ALIGNED_16(uint32_t sqr_tab[4]);
LOAD_ZERO;
vec_u32_t sqr_v = zero_u32v;
@@ -1661,14 +1661,13 @@ static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
- uint32_t var = sqr - (sum * sum >> 8);
- return var;
+ return sum + ((uint64_t)sqr<<32);
}
-static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
- DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
- DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
+ ALIGNED_16(uint32_t sum_tab[4]);
+ ALIGNED_16(uint32_t sqr_tab[4]);
LOAD_ZERO;
vec_u32_t sqr_v = zero_u32v;
@@ -1700,8 +1699,7 @@ static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
- uint32_t var = sqr - (sum * sum >> 6);
- return var;
+ return sum + ((uint64_t)sqr<<32);
}
@@ -1870,8 +1868,8 @@ static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
{
- DECLARE_ALIGNED_16( int32_t sum4_tab[4] );
- DECLARE_ALIGNED_16( int32_t sum8_tab[4] );
+ ALIGNED_16( int32_t sum4_tab[4] );
+ ALIGNED_16( int32_t sum8_tab[4] );
LOAD_ZERO;
VEC_LOAD_HIGH( pix, 0 );
@@ -1937,7 +1935,7 @@ static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u
int sum8 = sum8_tab[3];
- DECLARE_ALIGNED_16( int16_t tmp0_4_tab[8] );
+ ALIGNED_16( int16_t tmp0_4_tab[8] );
vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);
sum4 -= tmp0_4_tab[0];
@@ -1997,7 +1995,7 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2,
int sums[2][4] )
{
- DECLARE_ALIGNED_16( int temp[4] );
+ ALIGNED_16( int temp[4] );
int y;
vec_u8_t pix1v, pix2v;
diff --git a/common/predict.c b/common/predict.c
index ce4b9bf..0718c81 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -33,6 +33,9 @@
#ifdef ARCH_PPC
# include "ppc/predict.h"
#endif
+#ifdef ARCH_ARM
+# include "arm/predict.h"
+#endif
/****************************************************************************
* 16x16 prediction for intra luma block
@@ -41,11 +44,10 @@
#define PREDICT_16x16_DC(v) \
for( i = 0; i < 16; i++ )\
{\
- uint32_t *p = (uint32_t*)src;\
- *p++ = v;\
- *p++ = v;\
- *p++ = v;\
- *p++ = v;\
+ M32( src+ 0 ) = v;\
+ M32( src+ 4 ) = v;\
+ M32( src+ 8 ) = v;\
+ M32( src+12 ) = v;\
src += FDEC_STRIDE;\
}
@@ -101,32 +103,28 @@ static void predict_16x16_h( uint8_t *src )
for( i = 0; i < 16; i++ )
{
const uint32_t v = 0x01010101 * src[-1];
- uint32_t *p = (uint32_t*)src;
-
- *p++ = v;
- *p++ = v;
- *p++ = v;
- *p++ = v;
-
+ M32( src+ 0 ) = v;
+ M32( src+ 4 ) = v;
+ M32( src+ 8 ) = v;
+ M32( src+12 ) = v;
src += FDEC_STRIDE;
}
}
static void predict_16x16_v( uint8_t *src )
{
- uint32_t v0 = *(uint32_t*)&src[ 0-FDEC_STRIDE];
- uint32_t v1 = *(uint32_t*)&src[ 4-FDEC_STRIDE];
- uint32_t v2 = *(uint32_t*)&src[ 8-FDEC_STRIDE];
- uint32_t v3 = *(uint32_t*)&src[12-FDEC_STRIDE];
+ uint32_t v0 = M32( &src[ 0-FDEC_STRIDE] );
+ uint32_t v1 = M32( &src[ 4-FDEC_STRIDE] );
+ uint32_t v2 = M32( &src[ 8-FDEC_STRIDE] );
+ uint32_t v3 = M32( &src[12-FDEC_STRIDE] );
int i;
for( i = 0; i < 16; i++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = v0;
- *p++ = v1;
- *p++ = v2;
- *p++ = v3;
+ M32( src+ 0 ) = v0;
+ M32( src+ 4 ) = v1;
+ M32( src+ 8 ) = v2;
+ M32( src+12 ) = v3;
src += FDEC_STRIDE;
}
}
@@ -175,9 +173,8 @@ static void predict_8x8c_dc_128( uint8_t *src )
for( y = 0; y < 8; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = 0x80808080;
- *p++ = 0x80808080;
+ M32( src+0 ) = 0x80808080;
+ M32( src+4 ) = 0x80808080;
src += FDEC_STRIDE;
}
}
@@ -196,16 +193,14 @@ static void predict_8x8c_dc_left( uint8_t *src )
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc0;
- *p++ = dc0;
+ M32( src+0 ) = dc0;
+ M32( src+4 ) = dc0;
src += FDEC_STRIDE;
}
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc1;
- *p++ = dc1;
+ M32( src+0 ) = dc1;
+ M32( src+4 ) = dc1;
src += FDEC_STRIDE;
}
@@ -225,9 +220,8 @@ static void predict_8x8c_dc_top( uint8_t *src )
for( y = 0; y < 8; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc0;
- *p++ = dc1;
+ M32( src+0 ) = dc0;
+ M32( src+4 ) = dc1;
src += FDEC_STRIDE;
}
}
@@ -261,17 +255,15 @@ static void predict_8x8c_dc( uint8_t *src )
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc0;
- *p++ = dc1;
+ M32( src+0 ) = dc0;
+ M32( src+4 ) = dc1;
src += FDEC_STRIDE;
}
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc2;
- *p++ = dc3;
+ M32( src+0 ) = dc2;
+ M32( src+4 ) = dc3;
src += FDEC_STRIDE;
}
}
@@ -282,23 +274,21 @@ static void predict_8x8c_h( uint8_t *src )
for( i = 0; i < 8; i++ )
{
uint32_t v = 0x01010101 * src[-1];
- uint32_t *p = (uint32_t*)src;
- *p++ = v;
- *p++ = v;
+ M32( src+0 ) = v;
+ M32( src+4 ) = v;
src += FDEC_STRIDE;
}
}
static void predict_8x8c_v( uint8_t *src )
{
- uint32_t v0 = *(uint32_t*)&src[0-FDEC_STRIDE];
- uint32_t v1 = *(uint32_t*)&src[4-FDEC_STRIDE];
+ uint32_t v0 = M32( src+0-FDEC_STRIDE );
+ uint32_t v1 = M32( src+4-FDEC_STRIDE );
int i;
for( i = 0; i < 8; i++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = v0;
- *p++ = v1;
+ M32( src+0 ) = v0;
+ M32( src+4 ) = v1;
src += FDEC_STRIDE;
}
}
@@ -340,7 +330,7 @@ static void predict_8x8c_p( uint8_t *src )
****************************************************************************/
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC32(x,y) *(uint32_t*)&SRC(x,y)
+#define SRC32(x,y) M32( &SRC(x,y) )
#define PREDICT_4x4_DC(v)\
SRC32(0,0) = SRC32(0,1) = SRC32(0,2) = SRC32(0,3) = v;
@@ -532,7 +522,7 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
}
else
{
- *(uint64_t*)(edge+24) = SRC(7,-1) * 0x0101010101010101ULL;
+ M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
edge[32] = SRC(7,-1);
}
}
@@ -558,8 +548,8 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
#define PREDICT_8x8_DC(v) \
int y; \
for( y = 0; y < 8; y++ ) { \
- ((uint32_t*)src)[0] = \
- ((uint32_t*)src)[1] = v; \
+ M32( src+0 ) = v; \
+ M32( src+4 ) = v; \
src += FDEC_STRIDE; \
}
@@ -590,17 +580,17 @@ static void predict_8x8_dc( uint8_t *src, uint8_t edge[33] )
static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
{
PREDICT_8x8_LOAD_LEFT
-#define ROW(y) ((uint32_t*)(src+y*FDEC_STRIDE))[0] =\
- ((uint32_t*)(src+y*FDEC_STRIDE))[1] = 0x01010101U * l##y
+#define ROW(y) M32( src+y*FDEC_STRIDE+0 ) =\
+ M32( src+y*FDEC_STRIDE+4 ) = 0x01010101U * l##y;
ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
{
- const uint64_t top = *(uint64_t*)(edge+16);
+ const uint64_t top = M64( edge+16 );
int y;
for( y = 0; y < 8; y++ )
- *(uint64_t*)(src+y*FDEC_STRIDE) = top;
+ M64( src+y*FDEC_STRIDE ) = top;
}
static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
{
@@ -770,6 +760,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
x264_predict_16x16_init_altivec( pf );
}
#endif
+
+#ifdef HAVE_ARMV6
+ x264_predict_16x16_init_arm( cpu, pf );
+#endif
}
void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -792,6 +786,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
x264_predict_8x8c_init_altivec( pf );
}
#endif
+
+#ifdef HAVE_ARMV6
+ x264_predict_8x8c_init_arm( cpu, pf );
+#endif
}
void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -813,6 +811,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_
#ifdef HAVE_MMX
x264_predict_8x8_init_mmx( cpu, pf, predict_filter );
#endif
+
+#ifdef HAVE_ARMV6
+ x264_predict_8x8_init_arm( cpu, pf, predict_filter );
+#endif
}
void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -833,5 +835,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
#ifdef HAVE_MMX
x264_predict_4x4_init_mmx( cpu, pf );
#endif
+
+#ifdef HAVE_ARMV6
+ x264_predict_4x4_init_arm( cpu, pf );
+#endif
}
diff --git a/common/quant.c b/common/quant.c
index daf2b5a..7434a3d 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -29,6 +29,9 @@
#ifdef ARCH_PPC
# include "ppc/quant.h"
#endif
+#ifdef ARCH_ARM
+# include "arm/quant.h"
+#endif
#define QUANT_ONE( coef, mf, f ) \
{ \
@@ -39,141 +42,101 @@
nz |= (coef); \
}
-static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+static int quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
{
int i, nz = 0;
for( i = 0; i < 64; i++ )
- QUANT_ONE( dct[0][i], mf[i], bias[i] );
+ QUANT_ONE( dct[i], mf[i], bias[i] );
return !!nz;
}
-static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+static int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
int i, nz = 0;
for( i = 0; i < 16; i++ )
- QUANT_ONE( dct[0][i], mf[i], bias[i] );
+ QUANT_ONE( dct[i], mf[i], bias[i] );
return !!nz;
}
-static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+static int quant_4x4_dc( int16_t dct[16], int mf, int bias )
{
int i, nz = 0;
for( i = 0; i < 16; i++ )
- QUANT_ONE( dct[0][i], mf, bias );
+ QUANT_ONE( dct[i], mf, bias );
return !!nz;
}
-static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+static int quant_2x2_dc( int16_t dct[4], int mf, int bias )
{
int nz = 0;
- QUANT_ONE( dct[0][0], mf, bias );
- QUANT_ONE( dct[0][1], mf, bias );
- QUANT_ONE( dct[0][2], mf, bias );
- QUANT_ONE( dct[0][3], mf, bias );
+ QUANT_ONE( dct[0], mf, bias );
+ QUANT_ONE( dct[1], mf, bias );
+ QUANT_ONE( dct[2], mf, bias );
+ QUANT_ONE( dct[3], mf, bias );
return !!nz;
}
#define DEQUANT_SHL( x ) \
- dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) << i_qbits
+ dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) << i_qbits
#define DEQUANT_SHR( x ) \
- dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits)
+ dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)
-static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
const int i_mf = i_qp%6;
const int i_qbits = i_qp/6 - 4;
- int y;
+ int i;
if( i_qbits >= 0 )
{
- for( y = 0; y < 4; y++ )
- {
- DEQUANT_SHL( 0 );
- DEQUANT_SHL( 1 );
- DEQUANT_SHL( 2 );
- DEQUANT_SHL( 3 );
- }
+ for( i = 0; i < 16; i++ )
+ DEQUANT_SHL( i );
}
else
{
const int f = 1 << (-i_qbits-1);
- for( y = 0; y < 4; y++ )
- {
- DEQUANT_SHR( 0 );
- DEQUANT_SHR( 1 );
- DEQUANT_SHR( 2 );
- DEQUANT_SHR( 3 );
- }
+ for( i = 0; i < 16; i++ )
+ DEQUANT_SHR( i );
}
}
-static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+static void dequant_8x8( int16_t dct[64], int dequant_mf[6][64], int i_qp )
{
const int i_mf = i_qp%6;
const int i_qbits = i_qp/6 - 6;
- int y;
+ int i;
if( i_qbits >= 0 )
{
- for( y = 0; y < 8; y++ )
- {
- DEQUANT_SHL( 0 );
- DEQUANT_SHL( 1 );
- DEQUANT_SHL( 2 );
- DEQUANT_SHL( 3 );
- DEQUANT_SHL( 4 );
- DEQUANT_SHL( 5 );
- DEQUANT_SHL( 6 );
- DEQUANT_SHL( 7 );
- }
+ for( i = 0; i < 64; i++ )
+ DEQUANT_SHL( i );
}
else
{
const int f = 1 << (-i_qbits-1);
- for( y = 0; y < 8; y++ )
- {
- DEQUANT_SHR( 0 );
- DEQUANT_SHR( 1 );
- DEQUANT_SHR( 2 );
- DEQUANT_SHR( 3 );
- DEQUANT_SHR( 4 );
- DEQUANT_SHR( 5 );
- DEQUANT_SHR( 6 );
- DEQUANT_SHR( 7 );
- }
+ for( i = 0; i < 64; i++ )
+ DEQUANT_SHR( i );
}
}
-static void dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
const int i_qbits = i_qp/6 - 6;
- int y;
+ int i;
if( i_qbits >= 0 )
{
- const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;
-
- for( y = 0; y < 4; y++ )
- {
- dct[y][0] *= i_dmf;
- dct[y][1] *= i_dmf;
- dct[y][2] *= i_dmf;
- dct[y][3] *= i_dmf;
- }
+ const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
+ for( i = 0; i < 16; i++ )
+ dct[i] *= i_dmf;
}
else
{
- const int i_dmf = dequant_mf[i_qp%6][0][0];
+ const int i_dmf = dequant_mf[i_qp%6][0];
const int f = 1 << (-i_qbits-1);
-
- for( y = 0; y < 4; y++ )
- {
- dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
- dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
- dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
- dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
- }
+ for( i = 0; i < 16; i++ )
+ dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
}
}
@@ -215,7 +178,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max )
int idx = i_max - 1;
/* Yes, dct[idx-1] is guaranteed to be 32-bit aligned. idx>=0 instead of 1 works correctly for the same reason */
- while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 )
+ while( idx >= 0 && M32( &dct[idx-1] ) == 0 )
idx -= 2;
if( idx >= 0 && dct[idx] == 0 )
idx--;
@@ -255,7 +218,7 @@ static int ALWAYS_INLINE x264_coeff_last_internal( int16_t *l, int i_count )
{
int i_last;
for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
- if( *(uint64_t*)(l+i_last-3) )
+ if( M64( l+i_last-3 ) )
break;
while( i_last >= 0 && l[i_last] == 0 )
i_last--;
@@ -428,6 +391,25 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_altivec;
}
#endif
+
+#ifdef HAVE_ARMV6
+ if( cpu&X264_CPU_ARMV6 )
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm;
+
+ if( cpu&X264_CPU_NEON )
+ {
+ pf->quant_2x2_dc = x264_quant_2x2_dc_neon;
+ pf->quant_4x4 = x264_quant_4x4_neon;
+ pf->quant_4x4_dc = x264_quant_4x4_dc_neon;
+ pf->quant_8x8 = x264_quant_8x8_neon;
+ pf->dequant_4x4 = x264_dequant_4x4_neon;
+ pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
+ pf->dequant_8x8 = x264_dequant_8x8_neon;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+ }
+#endif
pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
diff --git a/common/quant.h b/common/quant.h
index b8a7b98..1cfe95d 100644
--- a/common/quant.h
+++ b/common/quant.h
@@ -25,14 +25,14 @@
typedef struct
{
- int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
- int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
- int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
- int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
+ int (*quant_8x8)( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+ int (*quant_4x4)( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+ int (*quant_4x4_dc)( int16_t dct[16], int mf, int bias );
+ int (*quant_2x2_dc)( int16_t dct[4], int mf, int bias );
- void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
- void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
- void (*dequant_4x4_dc)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+ void (*dequant_8x8)( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+ void (*dequant_4x4)( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+ void (*dequant_4x4_dc)( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
diff --git a/common/set.c b/common/set.c
index 6c7ddc4..f9379f0 100644
--- a/common/set.c
+++ b/common/set.c
@@ -20,7 +20,7 @@
#include "common.h"
-#define SHIFT(x,s) ((s)<0 ? (x)<<-(s) : (s)==0 ? (x) : ((x)+(1<<((s)-1)))>>(s))
+#define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
#define DIV(n,d) (((n) + ((d)>>1)) / (d))
static const int dequant4_scale[6][3] =
@@ -71,13 +71,14 @@ int x264_cqm_init( x264_t *h )
int def_quant8[6][64];
int def_dequant4[6][16];
int def_dequant8[6][64];
- int quant4_mf[4][6][4][4];
- int quant8_mf[2][6][8][8];
+ int quant4_mf[4][6][16];
+ int quant8_mf[2][6][64];
int q, i, j, i_list;
int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
32 - h->param.analyse.i_luma_deadzone[0],
32 - 11, 32 - 21 };
int max_qp_err = -1;
+ int max_chroma_qp_err = -1;
for( i = 0; i < 6; i++ )
{
@@ -93,9 +94,9 @@ int x264_cqm_init( x264_t *h )
}
else
{
- h-> quant4_mf[i] = x264_malloc(52*size*sizeof(uint16_t) );
- h->dequant4_mf[i] = x264_malloc( 6*size*sizeof(int) );
- h->unquant4_mf[i] = x264_malloc(52*size*sizeof(int) );
+ CHECKED_MALLOC( h-> quant4_mf[i], 52*size*sizeof(uint16_t) );
+ CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) );
+ CHECKED_MALLOC( h->unquant4_mf[i], 52*size*sizeof(int) );
}
for( j = (i<4 ? 0 : 4); j < i; j++ )
@@ -105,7 +106,7 @@ int x264_cqm_init( x264_t *h )
if( j < i )
h->quant4_bias[i] = h->quant4_bias[j];
else
- h->quant4_bias[i] = x264_malloc(52*size*sizeof(uint16_t) );
+ CHECKED_MALLOC( h->quant4_bias[i], 52*size*sizeof(uint16_t) );
}
for( q = 0; q < 6; q++ )
@@ -129,14 +130,14 @@ int x264_cqm_init( x264_t *h )
for( i_list = 0; i_list < 4; i_list++ )
for( i = 0; i < 16; i++ )
{
- h->dequant4_mf[i_list][q][0][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
- quant4_mf[i_list][q][0][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
+ h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
+ quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
}
for( i_list = 0; i_list < 2; i_list++ )
for( i = 0; i < 64; i++ )
{
- h->dequant8_mf[i_list][q][0][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
- quant8_mf[i_list][q][0][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
+ h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
+ quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
}
}
for( q = 0; q < 52; q++ )
@@ -144,19 +145,21 @@ int x264_cqm_init( x264_t *h )
for( i_list = 0; i_list < 4; i_list++ )
for( i = 0; i < 16; i++ )
{
- h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][0][i];
- h-> quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][0][i], q/6 - 1);
+ h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
+ h-> quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
// round to nearest, unless that would cause the deadzone to be negative
h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
- if( j > 0xffff && q > max_qp_err )
+ if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
max_qp_err = q;
+ if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) )
+ max_chroma_qp_err = q;
}
if( h->param.analyse.b_transform_8x8 )
for( i_list = 0; i_list < 2; i_list++ )
for( i = 0; i < 64; i++ )
{
- h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][0][i];
- h-> quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][0][i], q/6);
+ h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
+ h-> quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
if( j > 0xffff && q > max_qp_err )
max_qp_err = q;
@@ -165,34 +168,46 @@ int x264_cqm_init( x264_t *h )
if( !h->mb.b_lossless && max_qp_err >= h->param.rc.i_qp_min )
{
- x264_log( h, X264_LOG_ERROR, "Quantization overflow.\n" );
- x264_log( h, X264_LOG_ERROR, "Your CQM is incompatible with QP < %d, but min QP is set to %d\n",
- max_qp_err+1, h->param.rc.i_qp_min );
+ x264_log( h, X264_LOG_ERROR, "Quantization overflow. Your CQM is incompatible with QP < %d,\n", max_qp_err+1 );
+ x264_log( h, X264_LOG_ERROR, "but min QP is set to %d.\n", h->param.rc.i_qp_min );
+ return -1;
+ }
+ if( !h->mb.b_lossless && max_chroma_qp_err >= h->chroma_qp_table[h->param.rc.i_qp_min] )
+ {
+ x264_log( h, X264_LOG_ERROR, "Quantization overflow. Your CQM is incompatible with QP < %d,\n", max_chroma_qp_err+1 );
+ x264_log( h, X264_LOG_ERROR, "but min chroma QP is implied to be %d.\n", h->chroma_qp_table[h->param.rc.i_qp_min] );
return -1;
}
return 0;
+fail:
+ x264_cqm_delete( h );
+ return -1;
}
+#define CQM_DELETE( n, max )\
+ for( i = 0; i < max; i++ )\
+ {\
+ for( j = 0; j < i; j++ )\
+ if( h->quant##n##_mf[i] == h->quant##n##_mf[j] )\
+ break;\
+ if( j == i )\
+ {\
+ x264_free( h-> quant##n##_mf[i] );\
+ x264_free( h->dequant##n##_mf[i] );\
+ x264_free( h->unquant##n##_mf[i] );\
+ }\
+ for( j = 0; j < i; j++ )\
+ if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\
+ break;\
+ if( j == i )\
+ x264_free( h->quant##n##_bias[i] );\
+ }
+
void x264_cqm_delete( x264_t *h )
{
int i, j;
- for( i = 0; i < 6; i++ )
- {
- for( j = 0; j < i; j++ )
- if( h->quant4_mf[i] == h->quant4_mf[j] )
- break;
- if( j == i )
- {
- x264_free( h-> quant4_mf[i] );
- x264_free( h->dequant4_mf[i] );
- x264_free( h->unquant4_mf[i] );
- }
- for( j = 0; j < i; j++ )
- if( h->quant4_bias[i] == h->quant4_bias[j] )
- break;
- if( j == i )
- x264_free( h->quant4_bias[i] );
- }
+ CQM_DELETE( 4, 4 );
+ CQM_DELETE( 8, 2 );
}
static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
diff --git a/common/visualize.c b/common/visualize.c
index f7100f0..1d3dd84 100644
--- a/common/visualize.c
+++ b/common/visualize.c
@@ -94,10 +94,13 @@ static void mv(int x0, int y0, int16_t dmv[2], int ref, int zoom, char *col)
/* }}} */
/* {{{ [fold] void x264_visualize_init( x264_t *h ) */
-void x264_visualize_init( x264_t *h )
+int x264_visualize_init( x264_t *h )
{
int mb = h->sps->i_mb_width * h->sps->i_mb_height;
- h->visualize = x264_malloc(mb * sizeof(visualize_t));
+ CHECKED_MALLOC( h->visualize, mb * sizeof(visualize_t) );
+ return 0;
+fail:
+ return -1;
}
/* }}} */
/* {{{ [fold] void x264_visualize_mb( x264_t *h ) */
diff --git a/common/visualize.h b/common/visualize.h
index b611f6c..f9753d7 100644
--- a/common/visualize.h
+++ b/common/visualize.h
@@ -23,7 +23,7 @@
#include "common/common.h"
-void x264_visualize_init( x264_t *h );
+int x264_visualize_init( x264_t *h );
void x264_visualize_mb( x264_t *h );
void x264_visualize_show( x264_t *h );
void x264_visualize_close( x264_t *h );
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm
index 20eb7b8..990f0ee 100644
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -96,11 +96,13 @@ cglobal x264_cpu_cpuid, 0,6
cglobal x264_stack_align
push ebp
mov ebp, esp
- sub esp, 4
+ sub esp, 8
and esp, ~15
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
+ mov edx, [ebp+16]
+ mov [esp+4], edx
call ecx
leave
ret
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 6e92df6..d4a0cae 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -26,16 +26,30 @@
%include "x86inc.asm"
%include "x86util.asm"
+%macro SHUFFLE_16BIT 8
+ %rep 8
+ db %1*2
+ db %1*2+1
+ %rotate 1
+ %endrep
+%endmacro
+
SECTION_RODATA
+pw_32_0: times 4 dw 32
+ times 4 dw 0
pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000
hsub_mul: times 8 db 1, -1
+
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
-pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
-pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
+pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
+pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
+pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
+pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 16 db 1
+pw_1: times 8 dw 1
SECTION .text
@@ -145,6 +159,59 @@ cglobal x264_add4x4_idct_mmx, 2,2
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
RET
+INIT_XMM
+cglobal x264_add4x4_idct_sse4, 2,2,6
+ mova m0, [r1+0x00] ; row1/row0
+ mova m2, [r1+0x10] ; row3/row2
+ mova m1, m0 ; row1/row0
+ psraw m0, 1 ; row1>>1/...
+ mova m3, m2 ; row3/row2
+ psraw m2, 1 ; row3>>1/...
+ movsd m0, m1 ; row1>>1/row0
+ movsd m2, m3 ; row3>>1/row2
+ psubw m0, m3 ; row1>>1-row3/row0-2
+ paddw m2, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 wd, 0, 2, 1
+ SUMSUB_BA m2, m0, m1
+ pshuflw m1, m2, 10110001b
+ pshufhw m2, m2, 10110001b
+ punpckldq m1, m0
+ punpckhdq m2, m0
+ SWAP 0, 1
+
+ mova m1, [pw_32_0 GLOBAL]
+ paddw m1, m0 ; row1/row0 corrected
+ psraw m0, 1 ; row1>>1/...
+ mova m3, m2 ; row3/row2
+ psraw m2, 1 ; row3>>1/...
+ movsd m0, m1 ; row1>>1/row0
+ movsd m2, m3 ; row3>>1/row2
+ psubw m0, m3 ; row1>>1-row3/row0-2
+ paddw m2, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 qdq, 0, 2, 1
+ SUMSUB_BA m2, m0, m1
+
+ movd m4, [r0+FDEC_STRIDE*0]
+ movd m1, [r0+FDEC_STRIDE*1]
+ movd m3, [r0+FDEC_STRIDE*2]
+ movd m5, [r0+FDEC_STRIDE*3]
+ punpckldq m1, m4 ; row0/row1
+ pxor m4, m4
+ punpckldq m3, m5 ; row3/row2
+ punpcklbw m1, m4
+ psraw m2, 6
+ punpcklbw m3, m4
+ psraw m0, 6
+ paddsw m2, m1
+ paddsw m0, m3
+ packuswb m0, m2 ; row0/row1/row3/row2
+ pextrd [r0+FDEC_STRIDE*0], m0, 3
+ pextrd [r0+FDEC_STRIDE*1], m0, 2
+ movd [r0+FDEC_STRIDE*2], m0
+ pextrd [r0+FDEC_STRIDE*3], m0, 1
+ RET
+
+INIT_MMX
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
@@ -428,6 +495,79 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
ret
;-----------------------------------------------------------------------------
+; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+
+%macro DCTDC_2ROW_MMX 3
+ movq %1, [r1+FENC_STRIDE*(0+%3)]
+ movq m1, [r1+FENC_STRIDE*(1+%3)]
+ movq m2, [r2+FDEC_STRIDE*(0+%3)]
+ movq m3, [r2+FDEC_STRIDE*(1+%3)]
+ movq %2, %1
+ punpckldq %1, m1
+ punpckhdq %2, m1
+ movq m1, m2
+ punpckldq m2, m3
+ punpckhdq m1, m3
+ psadbw %1, m7
+ psadbw %2, m7
+ psadbw m2, m7
+ psadbw m1, m7
+ psubw %1, m2
+ psubw %2, m1
+%endmacro
+
+INIT_MMX
+cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+ pxor m7, m7
+ call .loop
+ add r1, FENC_STRIDE*4
+ add r2, FDEC_STRIDE*4
+ add r0, 4
+.loop:
+ DCTDC_2ROW_MMX m0, m4, 0
+ DCTDC_2ROW_MMX m5, m6, 2
+ paddw m0, m5
+ paddw m4, m6
+ punpcklwd m0, m4
+ movd [r0], m0
+ ret
+
+INIT_XMM
+%macro DCTDC_2ROW_SSE2 3
+ movq m0, [r1+FENC_STRIDE*(0+%1)]
+ movq m1, [r1+FENC_STRIDE*(1+%1)]
+ movq m2, [r2+FDEC_STRIDE*(0+%1)]
+ movq m3, [r2+FDEC_STRIDE*(1+%1)]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ psadbw m0, m7
+ psadbw m2, m7
+%if %2
+ paddw %3, m0
+ paddw m6, m2
+%else
+ SWAP %3, m0
+ SWAP m6, m2
+%endif
+%endmacro
+
+cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+ pxor m7, m7
+ DCTDC_2ROW_SSE2 0, 0, m4
+ DCTDC_2ROW_SSE2 2, 1, m4
+ add r1, FENC_STRIDE*4
+ add r2, FDEC_STRIDE*4
+ psubq m4, m6
+ DCTDC_2ROW_SSE2 0, 0, m5
+ DCTDC_2ROW_SSE2 2, 1, m5
+ psubq m5, m6
+ packssdw m4, m5
+ packssdw m4, m4
+ movq [r0], m4
+ RET
+
+;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
@@ -704,9 +844,106 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
RET
;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+
+; Output order:
+; 0 1 2 8 9 3 4 10
+; 16 11 5 6 7 12 17 24
+; 18 13 14 15 19 25 32 26
+; 20 21 22 23 27 33 40 34
+; 28 29 30 31 35 41 48 42
+; 36 37 38 39 43 49 50 44
+; 45 46 47 51 56 57 52 53
+; 54 55 58 59 60 61 62 63
+
+cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
+ movq mm0, [r1+2*0] ; 03 02 01 00
+ movq mm1, [r1+2*4] ; 07 06 05 04
+ movq mm2, [r1+2*8] ; 11 10 09 08
+ pshufw mm3, mm0, 011111111b ; 03 03 03 03
+ movd r2, mm2 ; 09 08
+ pshufw mm2, mm2, 000111001b ; 08 11 10 09
+ punpcklwd mm3, mm1 ; 05 03 04 03
+ pinsrw mm0, r2, 3 ; 08 02 01 00
+ movq mm4, mm2
+ punpcklwd mm2, mm3 ; 04 10 03 09
+ pshufw mm2, mm2, 010110100b ; 10 04 03 09
+ movq [r0+2*0], mm0 ; 08 02 01 00
+ movq [r0+2*4], mm2 ; 10 04 03 09
+ movq mm3, [r1+2*12] ; 15 14 13 12
+ movq mm5, [r1+2*16] ; 19 18 17 16
+ punpckldq mm6, mm5 ; 17 16 XX XX
+ psrlq mm1, 16 ; XX 07 06 05
+ punpckhwd mm6, mm4 ; 08 17 11 16
+ punpckldq mm6, mm1 ; 06 05 11 16
+ movq [r0+2*8], mm6 ; 06 05 11 16
+ psrlq mm1, 16 ; XX XX 07 06
+ punpcklwd mm1, mm5 ; 17 07 16 06
+ movq mm0, [r1+2*20] ; 23 22 21 20
+ movq mm2, [r1+2*24] ; 27 26 25 24
+ movq mm6, mm3
+ punpckhdq mm1, mm1 ; 17 07 17 07
+ punpcklwd mm6, mm2 ; 25 13 24 12
+ pextrw r2, mm5, 2
+ movq [r0+2*24], mm0 ; 23 22 21 20
+ punpcklwd mm1, mm6 ; 24 17 12 07
+ movq [r0+2*12], mm1
+ pinsrw mm3, r2, 0 ; 15 14 13 18
+ movq [r0+2*16], mm3 ; 15 14 13 18
+ movq mm7, [r1+2*28]
+ movq mm0, [r1+2*32] ; 35 34 33 32
+ psrlq mm5, 48 ; XX XX XX 19
+ pshufw mm1, mm2, 011111001b ; 27 27 26 25
+ punpcklwd mm5, mm0 ; 33 XX 32 19
+ psrlq mm2, 48 ; XX XX XX 27
+ punpcklwd mm5, mm1 ; 26 32 25 19
+ movq [r0+2*32], mm7
+ movq [r0+2*20], mm5 ; 26 32 25 19
+ movq mm7, [r1+2*36]
+ movq mm1, [r1+2*40] ; 43 42 41 40
+ pshufw mm3, mm0, 011111001b ; 35 35 34 33
+ punpcklwd mm2, mm1 ; 41 XX 40 27
+ movq [r0+2*40], mm7
+ punpcklwd mm2, mm3 ; 34 40 33 27
+ movq [r0+2*28], mm2
+ movq mm7, [r1+2*44] ; 47 46 45 44
+ movq mm2, [r1+2*48] ; 51 50 49 48
+ psrlq mm0, 48 ; XX XX XX 35
+ punpcklwd mm0, mm2 ; 49 XX 48 35
+ pshufw mm3, mm1, 011111001b ; 43 43 42 41
+ punpcklwd mm0, mm3 ; 42 48 41 35
+ movq [r0+2*36], mm0
+ pextrw r2, mm2, 3 ; 51
+ psrlq mm1, 48 ; XX XX XX 43
+ punpcklwd mm1, mm7 ; 45 XX 44 43
+ psrlq mm2, 16 ; XX 51 50 49
+ punpcklwd mm1, mm2 ; 50 44 49 43
+ pshufw mm1, mm1, 010110100b ; 44 50 49 43
+ movq [r0+2*44], mm1
+ psrlq mm7, 16 ; XX 47 46 45
+ pinsrw mm7, r2, 3 ; 51 47 46 45
+ movq [r0+2*48], mm7
+ movq mm0, [r1+2*56] ; 59 58 57 56
+ movq mm1, [r1+2*52] ; 55 54 53 52
+ movq mm2, mm0
+ movq mm7, [r1+2*60]
+ punpckldq mm2, mm1 ; 53 52 57 56
+ punpckhdq mm1, mm0 ; 59 58 55 54
+ movq [r0+2*52], mm2
+ movq [r0+2*56], mm1
+ movq [r0+2*60], mm7
+ RET
+
+;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
+%macro ZIGZAG_SUB_4x4 2
+%ifidn %1, ac
+cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
+%else
+cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+%endif
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE]
@@ -725,7 +962,11 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
punpckldq xmm6, xmm7
punpcklqdq xmm0, xmm2
punpcklqdq xmm4, xmm6
+%ifidn %2, frame
movdqa xmm7, [pb_sub4frame GLOBAL]
+%else
+ movdqa xmm7, [pb_sub4field GLOBAL]
+%endif
pshufb xmm0, xmm7
pshufb xmm4, xmm7
pxor xmm6, xmm6
@@ -737,9 +978,28 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
punpckhbw xmm5, xmm6
psubw xmm0, xmm4
psubw xmm1, xmm5
+%ifidn %1, ac
+ movd r2d, xmm0
+ pand xmm0, [pb_subacmask GLOBAL]
+%endif
movdqa [r0], xmm0
+ pxor xmm2, xmm2
movdqa [r0+16], xmm1
+ por xmm0, xmm1
+ pcmpeqb xmm0, xmm2
+ pmovmskb eax, xmm0
+%ifidn %1, ac
+ mov [r3], r2w
+%endif
+ sub eax, 0xffff
+ shr eax, 31
RET
+%endmacro
+
+ZIGZAG_SUB_4x4 , frame
+ZIGZAG_SUB_4x4 ac, frame
+ZIGZAG_SUB_4x4 , field
+ZIGZAG_SUB_4x4 ac, field
;-----------------------------------------------------------------------------
; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 4451821..a8f46ca 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -24,50 +24,56 @@
#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H
-void x264_sub4x4_dct_mmx ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_mmx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_mmx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_ssse3 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
+void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [ 4] );
+void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][16] );
+void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_add16x16_idct_dc_sse2 ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
+void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
-void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
-void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
-void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
-void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_dct4x4dc_mmx ( int16_t d[16] );
+void x264_idct4x4dc_mmx ( int16_t d[16] );
-void x264_dct4x4dc_mmx ( int16_t d[4][4] );
-void x264_idct4x4dc_mmx ( int16_t d[4][4] );
+void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_mmx ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_mmx ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_ssse3 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_ssse3( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
+void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
+void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
+void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
-void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
-
-void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
+int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
+int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 6b435d8..f486a8d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -6,6 +6,7 @@
;* Authors: Loren Merritt <lorenm at u.washington.edu>
;* Jason Garrett-Glaser <darkshikari at gmail.com>
;* Laurent Aimar <fenrir at via.ecp.fr>
+;* Dylan Yudaken <dyudaken at gmail.com>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
@@ -25,8 +26,10 @@
%include "x86inc.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
+pw_1: times 8 dw 1
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_32: times 8 dw 32
@@ -36,9 +39,8 @@ sw_64: dd 64
SECTION .text
;=============================================================================
-; weighted prediction
+; implicit weighted biprediction
;=============================================================================
-; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
@@ -63,12 +65,12 @@ SECTION .text
%endmacro
%endif
-%macro SPLATW 2
+%macro SPLATW 2-3 0
%if mmsize==16
- pshuflw %1, %2, 0
+ pshuflw %1, %2, %3*0x55
punpcklqdq %1, %1
%else
- pshufw %1, %2, 0
+ pshufw %1, %2, %3*0x55
%endif
%endmacro
@@ -174,6 +176,225 @@ INIT_XMM
AVG_WEIGHT ssse3, 8, 7
AVG_WEIGHT ssse3, 16, 7
+;=============================================================================
+; P frame explicit weighted prediction
+;=============================================================================
+
+%macro WEIGHT_START 1
+ mova m3, [r4]
+ mova m6, [r4+16]
+ movd m5, [r4+32]
+ pxor m2, m2
+%if (%1 == 20 || %1 == 12) && mmsize == 16
+ movdq2q mm3, xmm3
+ movdq2q mm4, xmm4
+ movdq2q mm5, xmm5
+ movdq2q mm6, xmm6
+ pxor mm2, mm2
+%endif
+%endmacro
+
+%macro WEIGHT_START_SSSE3 1
+ mova m3, [r4]
+ mova m4, [r4+16]
+ pxor m2, m2
+%if %1 == 20 || %1 == 12
+ movdq2q mm3, xmm3
+ movdq2q mm4, xmm4
+ pxor mm2, mm2
+%endif
+%endmacro
+
+;; macro to weight mmsize bytes taking half from %1 and half from %2
+%macro WEIGHT 2 ; (src1,src2)
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2 ;setup
+ punpcklbw m1, m2 ;setup
+ pmullw m0, m3 ;scale
+ pmullw m1, m3 ;scale
+ paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
+ psraw m0, m5 ;denom
+ psraw m1, m5 ;denom
+%endmacro
+
+%macro WEIGHT_SSSE3 2
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ psllw m0, 7
+ psllw m1, 7
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+%endmacro
+
+%macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)
+%if %3 == 16
+ mova [%2], %1
+%elif %3 == 8
+ movq [%2], %1
+%else
+ movd [%2], %1 ; width 2 can write garbage for last 2 bytes
+%endif
+%endmacro
+
+%macro WEIGHT_ROW 3 ; (src,dst,width)
+ ;; load weights
+ WEIGHT %1, (%1+(mmsize/2))
+ packuswb m0, m1 ;put bytes into m0
+ WEIGHT_SAVE_ROW m0, %2, %3
+%endmacro
+
+%macro WEIGHT_SAVE_COL 2 ;(dst,size)
+%if %2 == 8
+ packuswb m0, m1
+ movq [%1], m0
+ movhps [%1+r1], m0
+%else
+ packuswb m0, m0
+ packuswb m1, m1
+ movd [%1], m0 ; width 2 can write garbage for last 2 bytes
+ movd [%1+r1], m1
+%endif
+%endmacro
+
+%macro WEIGHT_COL 3 ; (src,dst,width)
+%if %3 <= 4 && mmsize == 16
+ INIT_MMX
+ ;; load weights
+ WEIGHT %1, (%1+r3)
+ WEIGHT_SAVE_COL %2, %3
+ INIT_XMM
+%else
+ WEIGHT %1, (%1+r3)
+ WEIGHT_SAVE_COL %2, %3
+%endif
+
+%endmacro
+
+%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+ WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize
+ WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize
+ %assign x (x+mmsize)
+%else
+ WEIGHT_COL (%1+x),(%2+x),(%3-x)
+ %exitrep
+%endif
+%if x >= %3
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+
+;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src,int i_src_stride, x264_weight_t *weight,int h)
+
+%ifdef ARCH_X86_64
+%define NUMREGS 6
+%define LOAD_HEIGHT
+%define HEIGHT_REG r5d
+%else
+%define NUMREGS 5
+%define LOAD_HEIGHT mov r4d, r5m
+%define HEIGHT_REG r4d
+%endif
+
+%macro WEIGHTER 2
+ cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+ WEIGHT_START %1
+ LOAD_HEIGHT
+.loop:
+ WEIGHT_TWO_ROW r2, r0, %1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub HEIGHT_REG, 2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+WEIGHTER 4, mmxext
+WEIGHTER 8, mmxext
+WEIGHTER 12, mmxext
+WEIGHTER 16, mmxext
+WEIGHTER 20, mmxext
+INIT_XMM
+WEIGHTER 8, sse2
+WEIGHTER 16, sse2
+WEIGHTER 20, sse2
+%define WEIGHT WEIGHT_SSSE3
+%define WEIGHT_START WEIGHT_START_SSSE3
+INIT_MMX
+WEIGHTER 4, ssse3
+INIT_XMM
+WEIGHTER 8, ssse3
+WEIGHTER 16, ssse3
+WEIGHTER 20, ssse3
+
+%macro OFFSET_OP 7
+ mov%6 m0, [%1]
+ mov%6 m1, [%2]
+ p%5usb m0, m2
+ p%5usb m1, m2
+ mov%7 [%3], m0
+ mov%7 [%4], m1
+%endmacro
+
+%macro OFFSET_TWO_ROW 4
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
+ %assign x (x+mmsize)
+%else
+ OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+ %exitrep
+%endif
+%if x >= %3
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+;void x264_mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, x264_weight_t *w, int h )
+%macro OFFSET 3
+ cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+ mova m2, [r4]
+ LOAD_HEIGHT
+.loop:
+ OFFSET_TWO_ROW r2, r0, %1, %3
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub HEIGHT_REG, 2
+ jg .loop
+ REP_RET
+%endmacro
+
+%macro OFFSETPN 2
+ OFFSET %1, %2, add
+ OFFSET %1, %2, sub
+%endmacro
+INIT_MMX
+OFFSETPN 4, mmxext
+OFFSETPN 8, mmxext
+OFFSETPN 12, mmxext
+OFFSETPN 16, mmxext
+OFFSETPN 20, mmxext
+INIT_XMM
+OFFSETPN 12, sse2
+OFFSETPN 16, sse2
+OFFSETPN 20, sse2
+%undef LOAD_HEIGHT
+%undef HEIGHT_REG
+%undef NUMREGS
+
;=============================================================================
@@ -510,6 +731,66 @@ AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %2&15==0
+ movdqa xmm1, [r2+16]
+ palignr xmm1, [r2], %1
+ pavgb xmm1, [r2+r4]
+%else
+ movdqa xmm1, [r2+16]
+ movdqa xmm2, [r2+r4+16]
+ palignr xmm1, [r2], %1
+ palignr xmm2, [r2+r4], %2
+ pavgb xmm1, xmm2
+%endif
+ movdqa [r0], xmm1
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg avg_w16_align%1_%2_ssse3
+ rep ret
+%endmacro
+
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
+
+cglobal x264_pixel_avg2_w16_cache64_ssse3
+ mov eax, r2m
+ and eax, 0x3f
+ cmp eax, 0x30
+ jle x264_pixel_avg2_w16_sse2
+ PROLOGUE 6,7
+ lea r6, [r4+r2]
+ and r4, ~0xf
+ and r6, 0x1f
+ and r2, ~0xf
+ lea r6, [r6*3] ;(offset + align*2)*3
+ sub r4, r2
+ shl r6, 4 ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+ lea r11, [avg_w16_addr GLOBAL]
+ add r6, r11
+%else
+ lea r6, [avg_w16_addr + r6 GLOBAL]
+%endif
+%ifdef UNIX64
+ jmp r6
+%else
+ call r6
+ RET
+%endif
+
+
;=============================================================================
; pixel copy
;=============================================================================
@@ -869,8 +1150,9 @@ MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2, 8
+%macro MC_CHROMA_SSSE3 2
INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6,8
+cglobal x264_mc_chroma_ssse3%1, 0,6,%2
MC_CHROMA_START
and r4d, 7
and r5d, 7
@@ -887,19 +1169,27 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
- movifnidn r0, r0mp
+ movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
SPLATW m7, m7
- movh m0, [r2]
- punpcklbw m0, [r2+1]
- add r2, r3
+ mov r5, r2
+ and r2, ~3
+ and r5, 3
+%ifdef PIC
+ lea r11, [ch_shuffle GLOBAL]
+ movu m5, [r11 + r5*2]
+%else
+ movu m5, [ch_shuffle + r5*2 GLOBAL]
+%endif
+ movu m0, [r2]
+ pshufb m0, m5
.loop4:
- movh m1, [r2]
- movh m3, [r2+r3]
- punpcklbw m1, [r2+1]
- punpcklbw m3, [r2+r3+1]
+ movu m1, [r2+r3]
+ pshufb m1, m5
+ movu m3, [r2+2*r3]
+ pshufb m3, m5
lea r2, [r2+2*r3]
mova m2, m1
mova m4, m3
@@ -907,8 +1197,8 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
- paddw m0, m5
- paddw m2, m5
+ paddw m0, [pw_32 GLOBAL]
+ paddw m2, [pw_32 GLOBAL]
paddw m1, m0
paddw m3, m2
mova m0, m4
@@ -925,23 +1215,28 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
INIT_XMM
.width8:
- mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
- movifnidn r0, r0mp
+ movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
SPLATW m7, m7
+%ifidn %1, _cache64
+ mov r5, r2
+ and r5, 0x3f
+ cmp r5, 0x38
+ jge .split
+%endif
+ mova m5, [pw_32 GLOBAL]
movh m0, [r2]
movh m1, [r2+1]
punpcklbw m0, m1
- add r2, r3
.loop8:
- movh m1, [r2]
- movh m2, [r2+1]
- movh m3, [r2+r3]
- movh m4, [r2+r3+1]
+ movh m1, [r2+1*r3]
+ movh m2, [r2+1*r3+1]
+ movh m3, [r2+2*r3]
+ movh m4, [r2+2*r3+1]
punpcklbw m1, m2
punpcklbw m3, m4
lea r2, [r2+2*r3]
@@ -965,6 +1260,53 @@ INIT_XMM
lea r0, [r0+2*r1]
jg .loop8
REP_RET
-
+%ifidn %1, _cache64
+.split:
+ and r2, ~7
+ and r5, 7
+%ifdef PIC
+ lea r11, [ch_shuffle GLOBAL]
+ movu m5, [r11 + r5*2]
+%else
+ movu m5, [ch_shuffle + r5*2 GLOBAL]
+%endif
+ movu m0, [r2]
+ pshufb m0, m5
+%ifdef ARCH_X86_64
+ mova m8, [pw_32 GLOBAL]
+ %define round m8
+%else
+ %define round [pw_32 GLOBAL]
+%endif
+.splitloop8:
+ movu m1, [r2+r3]
+ pshufb m1, m5
+ movu m3, [r2+2*r3]
+ pshufb m3, m5
+ lea r2, [r2+2*r3]
+ mova m2, m1
+ mova m4, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m3, m6
+ paddw m0, round
+ paddw m2, round
+ paddw m1, m0
+ paddw m3, m2
+ mova m0, m4
+ psrlw m1, 6
+ psrlw m3, 6
+ packuswb m1, m3
+ movh [r0], m1
+ movhps [r0+r1], m1
+ sub r4d, 2
+ lea r0, [r0+2*r1]
+ jg .splitloop8
+ REP_RET
+%endif
; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
+%endmacro
+MC_CHROMA_SSSE3 , 8
+MC_CHROMA_SSSE3 _cache64, 9
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 9745ac6..245c09f 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -34,6 +34,7 @@ filt_mul51: times 8 db 1, -5
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32
+pd_128: times 4 dd 128
SECTION .text
@@ -1081,3 +1082,43 @@ INIT_XMM
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12
+
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+;-----------------------------------------------------------------------------
+cglobal x264_mbtree_propagate_cost_sse2, 6,6
+ shl r5d, 1
+ lea r0, [r0+r5*2]
+ add r1, r5
+ add r2, r5
+ add r3, r5
+ add r4, r5
+ neg r5
+ pxor xmm5, xmm5
+ movdqa xmm4, [pd_128 GLOBAL]
+.loop:
+ movq xmm2, [r2+r5] ; intra
+ movq xmm0, [r4+r5] ; invq
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm0, xmm5
+ pmaddwd xmm0, xmm2
+ paddd xmm0, xmm4
+ psrld xmm0, 8 ; intra*invq>>8
+ movq xmm1, [r1+r5] ; prop
+ movq xmm3, [r3+r5] ; inter
+ punpcklwd xmm1, xmm5
+ punpcklwd xmm3, xmm5
+ paddd xmm0, xmm1 ; prop + (intra*invq>>8)
+ cvtdq2ps xmm1, xmm2 ; intra
+ psubd xmm2, xmm3 ; intra - inter
+ cvtdq2ps xmm0, xmm0
+ cvtdq2ps xmm2, xmm2
+ mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
+ divps xmm0, xmm1 ; / intra
+ cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
+ movdqa [r0+r5*2], xmm0
+ add r5, 8
+ jl .loop
+ REP_RET
+
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index aede5b8..b3683a3 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -42,6 +42,32 @@ DECL_SUF( x264_pixel_avg_8x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int
DECL_SUF( x264_pixel_avg_4x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+
+#define MC_WEIGHT(w,type) \
+ extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+
+#define MC_WEIGHT_OFFSET(w,type) \
+ extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ MC_WEIGHT(w,type)
+
+MC_WEIGHT_OFFSET( 4, mmxext )
+MC_WEIGHT_OFFSET( 8, mmxext )
+MC_WEIGHT_OFFSET( 12, mmxext )
+MC_WEIGHT_OFFSET( 16, mmxext )
+MC_WEIGHT_OFFSET( 20, mmxext )
+MC_WEIGHT_OFFSET( 12, sse2 )
+MC_WEIGHT_OFFSET( 16, sse2 )
+MC_WEIGHT_OFFSET( 20, sse2 )
+MC_WEIGHT( 8, sse2 )
+MC_WEIGHT( 4, ssse3 )
+MC_WEIGHT( 8, ssse3 )
+MC_WEIGHT( 12, ssse3 )
+MC_WEIGHT( 16, ssse3 )
+MC_WEIGHT( 20, ssse3 )
+#undef MC_OFFSET
+#undef MC_WEIGHT
+
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
@@ -59,6 +85,9 @@ extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height );
+extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
@@ -71,6 +100,8 @@ extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int strid
extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
@@ -91,6 +122,7 @@ PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
+PIXEL_AVG_WALL(cache64_ssse3)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -116,6 +148,7 @@ PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_m
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -130,6 +163,70 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
+ static void (* x264_mc_##function##_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
+{\
+ x264_mc_##function##_w4_##name1,\
+ x264_mc_##function##_w4_##name1,\
+ x264_mc_##function##_w8_##name2,\
+ x264_mc_##function##_w##w12version##_##instr,\
+ x264_mc_##function##_w16_##instr,\
+ x264_mc_##function##_w20_##instr,\
+};
+
+MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,16)
+MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,mmxext,16)
+MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,mmxext,16)
+MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
+
+static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
+{
+ int i;
+ int16_t den1;
+
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ w->weightfn = h->mc.offsetsub;
+ else
+ w->weightfn = h->mc.offsetadd;
+ memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
+ return;
+ }
+ w->weightfn = h->mc.weight;
+ den1 = 1 << (w->i_denom - 1) | w->i_offset << w->i_denom;
+ for( i = 0; i < 8; i++ )
+ {
+ w->cachea[i] = w->i_scale;
+ w->cacheb[i] = den1;
+ }
+}
+
+static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
+{
+ int i, den1;
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ w->weightfn = h->mc.offsetsub;
+ else
+ w->weightfn = h->mc.offsetadd;
+
+ memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
+ return;
+ }
+ w->weightfn = h->mc.weight;
+ den1 = w->i_scale << (8 - w->i_denom);
+ for(i = 0;i<8;i++)
+ {
+ w->cachea[i] = den1;
+ w->cacheb[i] = w->i_offset;
+ }
+}
+
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
@@ -137,7 +234,7 @@ static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
- int i_width, int i_height )\
+ int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -148,12 +245,13 @@ static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
x264_pixel_avg_wtab_##instr1[i_width>>2](\
dst, i_dst_stride, src1, i_src_stride,\
src2, i_height );\
+ if( weight->weightfn )\
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
}\
+ else if( weight->weightfn )\
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
else\
- {\
- x264_mc_copy_wtab_##instr2[i_width>>2](\
- dst, i_dst_stride, src1, i_src_stride, i_height );\
- }\
+ x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
}
MC_LUMA(mmxext,mmxext,mmx)
@@ -163,12 +261,13 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
#define GET_REF(name)\
static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
- int i_width, int i_height )\
+ int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -179,6 +278,13 @@ static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
x264_pixel_avg_wtab_##name[i_width>>2](\
dst, *i_dst_stride, src1, i_src_stride,\
src2, i_height );\
+ if( weight->weightfn )\
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
+ return dst;\
+ }\
+ else if( weight->weightfn )\
+ {\
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
return dst;\
}\
else\
@@ -196,6 +302,7 @@ GET_REF(cache64_mmxext)
GET_REF(sse2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
+GET_REF(cache64_ssse3)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -257,6 +364,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->get_ref = get_ref_mmxext;
pf->mc_chroma = x264_mc_chroma_mmxext;
+ pf->weight = x264_mc_weight_wtab_mmxext;
+ pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
+ pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
+ pf->weight_cache = x264_weight_cache_mmxext;
+
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
@@ -296,10 +408,15 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
+ pf->weight = x264_mc_weight_wtab_sse2;
+ pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
@@ -340,6 +457,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
+ if( cpu&X264_CPU_CACHELINE_64 )
+ {
+ pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+ pf->mc_luma = mc_luma_cache64_ssse3;
+ pf->get_ref = get_ref_cache64_ssse3;
+
+ /* ssse3 weight is slower on Nehalem, so only assign here. */
+ pf->weight_cache = x264_weight_cache_ssse3;
+ pf->weight = x264_mc_weight_wtab_ssse3;
+ }
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->integral_init4v = x264_integral_init4v_ssse3;
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6a19c40..d94daaf 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -77,13 +77,16 @@ SECTION .text
;=============================================================================
%macro SSD_LOAD_FULL 5
- mova m1, [r0+%1]
- mova m2, [r2+%2]
- mova m3, [r0+%3]
- mova m4, [r2+%4]
-%if %5
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ mova m1, [t0+%1]
+ mova m2, [t2+%2]
+ mova m3, [t0+%3]
+ mova m4, [t2+%4]
+%if %5==1
+ add t0, t1
+ add t2, t3
+%elif %5==2
+ lea t0, [t0+2*t1]
+ lea t2, [t2+2*t3]
%endif
%endmacro
@@ -91,7 +94,7 @@ SECTION .text
movh m%1, %3
movh m%2, %4
%if %5
- lea r0, [r0+2*r1]
+ lea t0, [t0+2*t1]
%endif
%endmacro
@@ -99,7 +102,7 @@ SECTION .text
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklbw m%1, m7
punpcklbw m%3, m7
@@ -113,7 +116,7 @@ SECTION .text
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklqdq m%1, m%2
punpcklqdq m%3, m%4
@@ -126,17 +129,17 @@ SECTION .text
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklbw m%1, m%3
punpcklbw m%2, m%4
%endmacro
%macro SSD_LOAD_HALF 5
- LOAD 1, 2, [r0+%1], [r0+%3], 1
- JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1
- LOAD 3, 4, [r0+%1], [r0+%3], %5
- JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5
+ LOAD 1, 2, [t0+%1], [t0+%3], 1
+ JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
+ LOAD 3, 4, [t0+%1], [t0+%3], %5
+ JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro
%macro SSD_CORE 7-8
@@ -152,8 +155,8 @@ SECTION .text
mova m%2, m%1
mova m%4, m%3
punpckhbw m%1, m%5
- punpckhbw m%3, m%5
punpcklbw m%2, m%5
+ punpckhbw m%3, m%5
punpcklbw m%4, m%5
%endif
pmaddwd m%1, m%1
@@ -167,11 +170,11 @@ SECTION .text
DEINTB %6, %1, %7, %2, %5
psubw m%6, m%7
psubw m%1, m%2
- SWAP %2, %6
+ SWAP %6, %2, %1
DEINTB %6, %3, %7, %4, %5
psubw m%6, m%7
psubw m%3, m%4
- SWAP %4, %6
+ SWAP %6, %4, %3
%endif
pmaddwd m%1, m%1
pmaddwd m%2, m%2
@@ -187,7 +190,7 @@ SECTION .text
punpcklbw m%3, m%4
punpckhbw m%6, m%2
punpckhbw m%7, m%4
- SWAP %6, %2
+ SWAP %6, %2, %3
SWAP %7, %4
%endif
pmaddubsw m%1, m%5
@@ -200,28 +203,46 @@ SECTION .text
pmaddwd m%4, m%4
%endmacro
-%macro SSD_END 1
+%macro SSD_ITER 6
+ SSD_LOAD_%1 %2,%3,%4,%5,%6
+ SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
paddd m1, m2
paddd m3, m4
-%if %1
paddd m0, m1
-%else
- SWAP 0, 1
-%endif
paddd m0, m3
%endmacro
-%macro SSD_ITER 7
- SSD_LOAD_%1 %2,%3,%4,%5,%7
- SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
- SSD_END %6
-%endmacro
-
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
-cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
+%if %1 != %2
+ %assign function_align 8
+%else
+ %assign function_align 16
+%endif
+cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+ mov al, %1*%2/mmsize/2
+
+%if %1 != %2
+ jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
+%else
+
+.startloop:
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3
+%ifnidn %3, mmx
+ PROLOGUE 0,0,8
+%endif
+%else
+ PROLOGUE 0,5
+ DECLARE_REG_TMP 1,2,3,4
+ mov t0, r0m
+ mov t1, r1m
+ mov t2, r2m
+ mov t3, r3m
+%endif
+
%ifidn %3, ssse3
mova m7, [hsub_mul GLOBAL]
%elifidn %3, sse2
@@ -229,57 +250,57 @@ cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
%elif %1 >= mmsize
pxor m7, m7
%endif
-%assign i 0
-%rep %2/4
+ pxor m0, m0
+
+ALIGN 16
+.loop:
%if %1 > mmsize
- SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0
- SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
- SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0
- SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
+ SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
- SSD_ITER FULL, 0, 0, r1, r3, i, 1
- SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
+ SSD_ITER FULL, 0, 0, t1, t3, 2
%else
- SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
+ SSD_ITER HALF, 0, 0, t1, t3, 2
%endif
-%assign i i+1
-%endrep
+ dec al
+ jg .loop
HADDD m0, m1
movd eax, m0
RET
+%endif
%endmacro
INIT_MMX
SSD 16, 16, mmx
SSD 16, 8, mmx
-SSD 8, 16, mmx
SSD 8, 8, mmx
+SSD 8, 16, mmx
+SSD 4, 4, mmx
SSD 8, 4, mmx
SSD 4, 8, mmx
-SSD 4, 4, mmx
INIT_XMM
SSD 16, 16, sse2slow, 8
+SSD 8, 8, sse2slow, 8
SSD 16, 8, sse2slow, 8
SSD 8, 16, sse2slow, 8
-SSD 8, 8, sse2slow, 8
SSD 8, 4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
SSD 16, 16, sse2, 8
+SSD 8, 8, sse2, 8
SSD 16, 8, sse2, 8
SSD 8, 16, sse2, 8
-SSD 8, 8, sse2, 8
SSD 8, 4, sse2, 8
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
SSD 16, 16, ssse3, 8
+SSD 8, 8, ssse3, 8
SSD 16, 8, ssse3, 8
SSD 8, 16, ssse3, 8
-SSD 8, 8, ssse3, 8
SSD 8, 4, ssse3, 8
INIT_MMX
-SSD 4, 8, ssse3
SSD 4, 4, ssse3
+SSD 4, 8, ssse3
+%assign function_align 16
;=============================================================================
; variance
@@ -295,14 +316,15 @@ SSD 4, 4, ssse3
%endif
%endmacro
-%macro VAR_END 1
+%macro VAR_END 0
HADDW m5, m7
- movd r1d, m5
- imul r1d, r1d
+ movd eax, m5
HADDD m6, m1
- shr r1d, %1
- movd eax, m6
- sub eax, r1d ; sqr - (sum * sum >> shift)
+ movd edx, m6
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
RET
%endmacro
@@ -349,12 +371,12 @@ INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
VAR_START 0
VAR_2ROW 8, 16
- VAR_END 8
+ VAR_END
cglobal x264_pixel_var_8x8_mmxext, 2,3
VAR_START 0
VAR_2ROW r1, 4
- VAR_END 6
+ VAR_END
INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3,8
@@ -368,7 +390,7 @@ cglobal x264_pixel_var_16x16_sse2, 2,3,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 8
+ VAR_END
cglobal x264_pixel_var_8x8_sse2, 2,4,8
VAR_START 1
@@ -384,8 +406,121 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 6
+ VAR_END
+%macro VAR2_END 0
+ HADDW m5, m7
+ movd r1d, m5
+ imul r1d, r1d
+ HADDD m6, m1
+ shr r1d, 6
+ movd eax, m6
+ mov [r4], eax
+ sub eax, r1d ; sqr - (sum * sum >> shift)
+ RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
+INIT_MMX
+cglobal x264_pixel_var2_8x8_mmxext, 5,6
+ VAR_START 0
+ mov r5d, 8
+.loop:
+ movq m0, [r0]
+ movq m1, m0
+ movq m4, m0
+ movq m2, [r2]
+ movq m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ psubw m0, m2
+ psubw m1, m3
+ paddw m5, m0
+ paddw m5, m1
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ paddd m6, m0
+ paddd m6, m1
+ add r0, r1
+ add r2, r3
+ dec r5d
+ jg .loop
+ VAR2_END
+ RET
+%endif
+
+INIT_XMM
+cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+ VAR_START 1
+ mov r5d, 4
+.loop:
+ movq m1, [r0]
+ movhps m1, [r0+r1]
+ movq m3, [r2]
+ movhps m3, [r2+r3]
+ DEINTB 0, 1, 2, 3, 7
+ psubw m0, m2
+ psubw m1, m3
+ paddw m5, m0
+ paddw m5, m1
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ paddd m6, m0
+ paddd m6, m1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ dec r5d
+ jg .loop
+ VAR2_END
+ RET
+
+cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+ pxor m5, m5 ; sum
+ pxor m6, m6 ; sum squared
+ mova m7, [hsub_mul GLOBAL]
+ mov r5d, 2
+.loop:
+ movq m0, [r0]
+ movq m2, [r2]
+ movq m1, [r0+r1]
+ movq m3, [r2+r3]
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ punpcklbw m0, m2
+ punpcklbw m1, m3
+ movq m2, [r0]
+ movq m3, [r2]
+ punpcklbw m2, m3
+ movq m3, [r0+r1]
+ movq m4, [r2+r3]
+ punpcklbw m3, m4
+ pmaddubsw m0, m7
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
+ paddw m5, m0
+ paddw m5, m1
+ paddw m5, m2
+ paddw m5, m3
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m6, m0
+ paddd m6, m1
+ paddd m6, m2
+ paddd m6, m3
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ dec r5d
+ jg .loop
+ VAR2_END
+ RET
;=============================================================================
; SATD
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 312aca8..9bba683 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -69,8 +69,8 @@ DECL_X4( sad, cache64_mmxext );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
-DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
@@ -102,6 +102,9 @@ void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
+int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x8_sse2( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 0248038..602ddcd 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -75,9 +75,9 @@ extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_v_sse2( uint8_t *src );
extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
-DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
-DECLARE_ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-DECLARE_ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
@@ -266,12 +266,12 @@ static void predict_8x8c_dc_left( uint8_t *src )
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)src = dc0;
+ M64( src ) = dc0;
src += FDEC_STRIDE;
}
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)src = dc1;
+ M64( src ) = dc1;
src += FDEC_STRIDE;
}
@@ -296,8 +296,8 @@ static void predict_8x8c_dc_left( uint8_t *src )
#define PREDICT_8x8_DC(v) \
int y; \
for( y = 0; y < 8; y++ ) { \
- ((uint32_t*)src)[0] = \
- ((uint32_t*)src)[1] = v; \
+ M32( src+0 ) = v; \
+ M32( src+4 ) = v; \
src += FDEC_STRIDE; \
}
@@ -332,7 +332,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )
PREDICT_8x8_LOAD_TOP\
PREDICT_8x8_LOAD_LEFT\
int t;\
- DECLARE_ALIGNED_16( int16_t sa8d_1d[2][8] );\
+ ALIGNED_16( int16_t sa8d_1d[2][8] );\
SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 284cbbb..52e121a 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -305,6 +305,7 @@ QUANT_AC x264_quant_8x8_sse4, 8
%macro DEQUANT16_FLAT 2-5
mova m0, %1
+ psllw m0, m4
%assign i %0-2
%rep %0-1
%if i
@@ -313,7 +314,6 @@ QUANT_AC x264_quant_8x8_sse4, 8
%else
pmullw m0, [r0+%2]
%endif
- psllw m %+ i, m4
mova [r0+%2], m %+ i
%assign i i-1
%rotate 1
diff --git a/common/x86/quant.h b/common/x86/quant.h
index dff60a8..4e42b81 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -24,30 +24,30 @@
#ifndef X264_I386_QUANT_H
#define X264_I386_QUANT_H
-int x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_4x4_dc_sse4( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_sse4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_sse4( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4dc_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+int x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_ssse3( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_ssse3( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_ssse3( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_ssse3( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse4( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_sse4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse4( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 68d8584..342a984 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -28,9 +28,8 @@
SECTION_RODATA
pb_3: times 16 db 3
+pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
pw_8: times 4 dw 8
-pb_shuf8x8c0: db 0,0,0,0,2,2,2,2
-pb_shuf8x8c1: db 4,4,4,4,6,6,6,6
sw_64: dd 64
SECTION .text
@@ -450,16 +449,32 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
%ifidn %1, ssse3
- movq m1, m0
- pshufb m0, [pb_shuf8x8c0 GLOBAL]
- pshufb m1, [pb_shuf8x8c1 GLOBAL]
+ movq2dq xmm0, m0
+ pshufb xmm0, [pb_shuf8x8c GLOBAL]
+ movq xmm1, [r0+FENC_STRIDE*0]
+ movq xmm2, [r0+FENC_STRIDE*1]
+ movq xmm3, [r0+FENC_STRIDE*2]
+ movq xmm4, [r0+FENC_STRIDE*3]
+ movhps xmm1, [r0+FENC_STRIDE*4]
+ movhps xmm2, [r0+FENC_STRIDE*5]
+ movhps xmm3, [r0+FENC_STRIDE*6]
+ movhps xmm4, [r0+FENC_STRIDE*7]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movd [r2], xmm1
%else
packuswb m0, m0
punpcklbw m0, m0
movq m1, m0
punpcklbw m0, m0 ; 4x dc0 4x dc1
punpckhbw m1, m1 ; 4x dc2 4x dc3
-%endif
movq m2, [r0+FENC_STRIDE*0]
movq m3, [r0+FENC_STRIDE*1]
movq m4, [r0+FENC_STRIDE*2]
@@ -483,6 +498,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
paddw m6, m0
paddw m2, m6
movd [r2], m2
+%endif
RET
%endmacro
diff --git a/common/x86/util.h b/common/x86/util.h
index ab1e208..efc700a 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -38,8 +38,8 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
"pminsw %%mm2, %%mm0 \n"
"pmaxsw %%mm1, %%mm0 \n"
"movd %%mm0, %0 \n"
- :"=m"(*(uint32_t*)dst)
- :"m"(*(uint32_t*)a), "m"(*(uint32_t*)b), "m"(*(uint32_t*)c)
+ :"=m"(*(x264_union32_t*)dst)
+ :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
);
}
#define x264_predictor_difference x264_predictor_difference_mmxext
@@ -69,44 +69,11 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
"jg 1b \n"
"movq %%mm4, %0 \n"
:"=m"(output), "+r"(i_mvc)
- :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
+ :"r"(mvc), "m"(M64( mvc ))
);
sum += output[0] + output[1] + output[2] + output[3];
return sum;
}
-#undef array_non_zero_int
-#define array_non_zero_int array_non_zero_int_mmx
-static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
-{
- if(i_count == 128)
- {
- int nonzero = 0;
- asm(
- "movq (%1), %%mm0 \n"
- "por 8(%1), %%mm0 \n"
- "por 16(%1), %%mm0 \n"
- "por 24(%1), %%mm0 \n"
- "por 32(%1), %%mm0 \n"
- "por 40(%1), %%mm0 \n"
- "por 48(%1), %%mm0 \n"
- "por 56(%1), %%mm0 \n"
- "por 64(%1), %%mm0 \n"
- "por 72(%1), %%mm0 \n"
- "por 80(%1), %%mm0 \n"
- "por 88(%1), %%mm0 \n"
- "por 96(%1), %%mm0 \n"
- "por 104(%1), %%mm0 \n"
- "por 112(%1), %%mm0 \n"
- "por 120(%1), %%mm0 \n"
- "packsswb %%mm0, %%mm0 \n"
- "movd %%mm0, %0 \n"
- :"=r"(nonzero)
- :"r"(v), "m"(*(struct {int16_t x[64];} *)v)
- );
- return !!nonzero;
- }
- else return array_non_zero_int_c( v, i_count );
-}
#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
{
@@ -131,7 +98,7 @@ static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16
"pminsw %5, %%mm0 \n"
"movd %%mm0, %0 \n"
:"=r"(amvd)
- :"m"(*(uint32_t*)mvdleft),"m"(*(uint32_t*)mvdtop),
+ :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
"m"(pw_28),"m"(pw_2184),"m"(pw_2)
);
return amvd;
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index fced5c6..2a91084 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -6,21 +6,32 @@
;* Authors: Loren Merritt <lorenm at u.washington.edu>
;* Anton Mitrofanov <BugMaster at narod.ru>
;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel at videolan.org .
+
%ifdef ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64
@@ -29,6 +40,12 @@
%endif
%endif
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -37,14 +54,14 @@
; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,macho64
- SECTION .text align=16
+ SECTION .text align=%1
%elifidn __OUTPUT_FORMAT__,macho
- SECTION .text align=16
+ SECTION .text align=%1
fakegot:
%else
- SECTION .rodata align=16
+ SECTION .rodata align=%1
%endif
%endmacro
@@ -85,7 +102,7 @@
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
-; cglobal foo, 2,3, dst, src, tmp
+; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
; TODO Some functions can use some args directly from the stack. If they're the
@@ -222,6 +239,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
CAT_UNDEF arg_name %+ %%i, d
CAT_UNDEF arg_name %+ %%i, w
CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
CAT_UNDEF arg_name, %%i
%assign %%i %%i+1
%endrep
@@ -233,6 +251,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
%xdefine %1d r %+ %%i %+ d
%xdefine %1w r %+ %%i %+ w
%xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
CAT_XDEFINE arg_name, %%i, %1
%assign %%i %%i+1
%rotate 1
@@ -258,15 +277,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%endif
%endmacro
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
- %if %0 > 2
- %assign xmm_regs_used %3
- %else
- %assign xmm_regs_used 0
- %endif
+ %assign xmm_regs_used %3
ASSERT xmm_regs_used <= 16
%if regs_used > 4
push r4
@@ -387,7 +402,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif
%endmacro
-%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
@@ -433,10 +448,8 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
; Symbol prefix for C linkage
%macro cglobal 1-2+
- %ifdef PREFIX
- %xdefine %1.skip_prologue _%1.skip_prologue
- %xdefine %1 _%1
- %endif
+ %xdefine %1 mangle(%1)
+ %xdefine %1.skip_prologue %1 %+ .skip_prologue
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%else
@@ -452,9 +465,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endmacro
%macro cextern 1
- %ifdef PREFIX
- %xdefine %1 _%1
- %endif
+ %xdefine %1 mangle(%1)
extern %1
%endmacro
@@ -464,9 +475,6 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
-%assign FENC_STRIDE 16
-%assign FDEC_STRIDE 32
-
; merge mmx and sse*
%macro CAT_XDEFINE 3
@@ -573,7 +581,10 @@ INIT_MMX
%endrep
%endmacro
-%macro SAVE_MM_PERMUTATION 1
+; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
+; function name, then any later calls to that function will automatically
+; load the permutation, so values can be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 1 ; name to save as
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE %1_m, %%i, m %+ %%i
@@ -581,7 +592,7 @@ INIT_MMX
%endrep
%endmacro
-%macro LOAD_MM_PERMUTATION 1
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
@@ -597,7 +608,7 @@ INIT_MMX
%endif
%endmacro
-;Substitutions that reduce instruction size but are functionally equivalent
+; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
%ifnum %2
%if %2==128
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index cfd7767..b822688 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -21,6 +21,9 @@
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
+%assign FENC_STRIDE 16
+%assign FDEC_STRIDE 32
+
%macro SBUTTERFLY 4
mova m%4, m%2
punpckl%1 m%2, m%3
@@ -28,6 +31,13 @@
SWAP %3, %4
%endmacro
+%macro SBUTTERFLY2 4
+ mova m%4, m%2
+ punpckh%1 m%2, m%3
+ punpckl%1 m%4, m%3
+ SWAP %2, %4, %3
+%endmacro
+
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -386,10 +396,10 @@
%macro SUMSUBD2_AB 4
mova %4, %1
mova %3, %2
- psraw %2, 1
- psraw %1, 1
- paddw %2, %4
- psubw %1, %3
+ psraw %2, 1 ; %2: %2>>1
+ psraw %1, 1 ; %1: %1>>1
+ paddw %2, %4 ; %2: %2>>1+%1
+ psubw %1, %3 ; %1: %1>>1-%2
%endmacro
%macro DCT4_1D 5
@@ -410,14 +420,24 @@
%macro IDCT4_1D 5-6
%ifnum %5
SUMSUBD2_AB m%2, m%4, m%6, m%5
+ ; %2: %2>>1-%4 %4: %2+%4>>1
SUMSUB_BA m%3, m%1, m%6
+ ; %3: %1+%3 %1: %1-%3
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+ ; %4: %1+%3 + (%2+%4>>1)
+ ; %3: %1+%3 - (%2+%4>>1)
+ ; %2: %1-%3 + (%2>>1-%4)
+ ; %1: %1-%3 - (%2>>1-%4)
%else
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
SUMSUB_BA m%3, m%1
SUMSUB_BADC m%4, m%3, m%2, m%1
%endif
SWAP %1, %4, %3
+ ; %1: %1+%3 + (%2+%4>>1) row0
+ ; %2: %1-%3 + (%2>>1-%4) row1
+ ; %3: %1-%3 - (%2>>1-%4) row2
+ ; %4: %1+%3 - (%2+%4>>1) row3
%endmacro
diff --git a/config.guess b/config.guess
index 0f0fe71..e792aac 100755
--- a/config.guess
+++ b/config.guess
@@ -1,10 +1,10 @@
#! /bin/sh
# Attempt to guess a canonical system name.
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
-# Inc.
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+# Free Software Foundation, Inc.
-timestamp='2007-03-06'
+timestamp='2009-09-18'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -27,16 +27,16 @@ timestamp='2007-03-06'
# the same distribution terms that you use for the rest of that program.
-# Originally written by Per Bothner <per at bothner.com>.
-# Please send patches to <config-patches at gnu.org>. Submit a context
-# diff and a properly formatted ChangeLog entry.
+# Originally written by Per Bothner. Please send patches (context
+# diff format) to <config-patches at gnu.org> and include a ChangeLog
+# entry.
#
# This script attempts to guess a canonical system name similar to
# config.sub. If it succeeds, it prints the system name on stdout, and
# exits with 0. Otherwise, it exits with 1.
#
-# The plan is that this can be called by configure scripts if you
-# don't specify an explicit build system type.
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
me=`echo "$0" | sed -e 's,.*/,,'`
@@ -56,8 +56,8 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
-Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -170,7 +170,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
arm*|i386|m68k|ns32k|sh3*|sparc|vax)
eval $set_cc_for_build
if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
- | grep __ELF__ >/dev/null
+ | grep -q __ELF__
then
# Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
# Return netbsd for either. FIX?
@@ -324,14 +324,30 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
case `/usr/bin/uname -p` in
sparc) echo sparc-icl-nx7; exit ;;
esac ;;
+ s390x:SunOS:*:*)
+ echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit ;;
sun4H:SunOS:5.*:*)
echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
exit ;;
sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
exit ;;
- i86pc:SunOS:5.*:*)
- echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+ eval $set_cc_for_build
+ SUN_ARCH="i386"
+ # If there is a compiler, see if it is configured for 64-bit objects.
+ # Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+ # This test works for both compilers.
+ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_64BIT_ARCH >/dev/null
+ then
+ SUN_ARCH="x86_64"
+ fi
+ fi
+ echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
exit ;;
sun4*:SunOS:6*:*)
# According to config.sub, this is the proper way to canonicalize
@@ -532,7 +548,7 @@ EOF
echo rs6000-ibm-aix3.2
fi
exit ;;
- *:AIX:*:[45])
+ *:AIX:*:[456])
IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
IBM_ARCH=rs6000
@@ -640,7 +656,7 @@ EOF
# => hppa64-hp-hpux11.23
if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
- grep __LP64__ >/dev/null
+ grep -q __LP64__
then
HP_ARCH="hppa2.0w"
else
@@ -793,16 +809,22 @@ EOF
exit ;;
*:Interix*:[3456]*)
case ${UNAME_MACHINE} in
- x86)
+ x86)
echo i586-pc-interix${UNAME_RELEASE}
exit ;;
- EM64T | authenticamd)
+ EM64T | authenticamd | genuineintel)
echo x86_64-unknown-interix${UNAME_RELEASE}
exit ;;
+ IA64)
+ echo ia64-unknown-interix${UNAME_RELEASE}
+ exit ;;
esac ;;
[345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
echo i${UNAME_MACHINE}-pc-mks
exit ;;
+ 8664:Windows_NT:*)
+ echo x86_64-pc-mks
+ exit ;;
i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
# How do we know it's Interix rather than the generic POSIX subsystem?
# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
@@ -832,8 +854,29 @@ EOF
i*86:Minix:*:*)
echo ${UNAME_MACHINE}-pc-minix
exit ;;
+ alpha:Linux:*:*)
+ case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+ EV5) UNAME_MACHINE=alphaev5 ;;
+ EV56) UNAME_MACHINE=alphaev56 ;;
+ PCA56) UNAME_MACHINE=alphapca56 ;;
+ PCA57) UNAME_MACHINE=alphapca56 ;;
+ EV6) UNAME_MACHINE=alphaev6 ;;
+ EV67) UNAME_MACHINE=alphaev67 ;;
+ EV68*) UNAME_MACHINE=alphaev68 ;;
+ esac
+ objdump --private-headers /bin/sh | grep -q ld.so.1
+ if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+ echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+ exit ;;
arm*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-gnu
+ eval $set_cc_for_build
+ if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+ | grep -q __ARM_EABI__
+ then
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ else
+ echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+ fi
exit ;;
avr32*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
@@ -847,6 +890,9 @@ EOF
frv:Linux:*:*)
echo frv-unknown-linux-gnu
exit ;;
+ i*86:Linux:*:*)
+ echo ${UNAME_MACHINE}-pc-linux-gnu
+ exit ;;
ia64:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
@@ -856,40 +902,17 @@ EOF
m68*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
- mips:Linux:*:*)
+ mips:Linux:*:* | mips64:Linux:*:*)
eval $set_cc_for_build
sed 's/^ //' << EOF >$dummy.c
#undef CPU
- #undef mips
- #undef mipsel
+ #undef ${UNAME_MACHINE}
+ #undef ${UNAME_MACHINE}el
#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
- CPU=mipsel
+ CPU=${UNAME_MACHINE}el
#else
#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
- CPU=mips
- #else
- CPU=
- #endif
- #endif
-EOF
- eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
- /^CPU/{
- s: ::g
- p
- }'`"
- test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
- ;;
- mips64:Linux:*:*)
- eval $set_cc_for_build
- sed 's/^ //' << EOF >$dummy.c
- #undef CPU
- #undef mips64
- #undef mips64el
- #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
- CPU=mips64el
- #else
- #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
- CPU=mips64
+ CPU=${UNAME_MACHINE}
#else
CPU=
#endif
@@ -905,25 +928,11 @@ EOF
or32:Linux:*:*)
echo or32-unknown-linux-gnu
exit ;;
- ppc:Linux:*:*)
- echo powerpc-unknown-linux-gnu
- exit ;;
- ppc64:Linux:*:*)
- echo powerpc64-unknown-linux-gnu
+ padre:Linux:*:*)
+ echo sparc-unknown-linux-gnu
exit ;;
- alpha:Linux:*:*)
- case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
- EV5) UNAME_MACHINE=alphaev5 ;;
- EV56) UNAME_MACHINE=alphaev56 ;;
- PCA56) UNAME_MACHINE=alphapca56 ;;
- PCA57) UNAME_MACHINE=alphapca56 ;;
- EV6) UNAME_MACHINE=alphaev6 ;;
- EV67) UNAME_MACHINE=alphaev67 ;;
- EV68*) UNAME_MACHINE=alphaev68 ;;
- esac
- objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
- if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
- echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+ parisc64:Linux:*:* | hppa64:Linux:*:*)
+ echo hppa64-unknown-linux-gnu
exit ;;
parisc:Linux:*:* | hppa:Linux:*:*)
# Look for CPU level
@@ -933,8 +942,11 @@ EOF
*) echo hppa-unknown-linux-gnu ;;
esac
exit ;;
- parisc64:Linux:*:* | hppa64:Linux:*:*)
- echo hppa64-unknown-linux-gnu
+ ppc64:Linux:*:*)
+ echo powerpc64-unknown-linux-gnu
+ exit ;;
+ ppc:Linux:*:*)
+ echo powerpc-unknown-linux-gnu
exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
echo ${UNAME_MACHINE}-ibm-linux
@@ -954,72 +966,9 @@ EOF
x86_64:Linux:*:*)
echo x86_64-unknown-linux-gnu
exit ;;
- xtensa:Linux:*:*)
- echo xtensa-unknown-linux-gnu
+ xtensa*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
- i*86:Linux:*:*)
- # The BFD linker knows what the default object file format is, so
- # first see if it will tell us. cd to the root directory to prevent
- # problems with other programs or directories called `ld' in the path.
- # Set LC_ALL=C to ensure ld outputs messages in English.
- ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
- | sed -ne '/supported targets:/!d
- s/[ ][ ]*/ /g
- s/.*supported targets: *//
- s/ .*//
- p'`
- case "$ld_supported_targets" in
- elf32-i386)
- TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
- ;;
- a.out-i386-linux)
- echo "${UNAME_MACHINE}-pc-linux-gnuaout"
- exit ;;
- coff-i386)
- echo "${UNAME_MACHINE}-pc-linux-gnucoff"
- exit ;;
- "")
- # Either a pre-BFD a.out linker (linux-gnuoldld) or
- # one that does not give us useful --help.
- echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
- exit ;;
- esac
- # Determine whether the default compiler is a.out or elf
- eval $set_cc_for_build
- sed 's/^ //' << EOF >$dummy.c
- #include <features.h>
- #ifdef __ELF__
- # ifdef __GLIBC__
- # if __GLIBC__ >= 2
- LIBC=gnu
- # else
- LIBC=gnulibc1
- # endif
- # else
- LIBC=gnulibc1
- # endif
- #else
- #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
- LIBC=gnu
- #else
- LIBC=gnuaout
- #endif
- #endif
- #ifdef __dietlibc__
- LIBC=dietlibc
- #endif
-EOF
- eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
- /^LIBC/{
- s: ::g
- p
- }'`"
- test x"${LIBC}" != x && {
- echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
- exit
- }
- test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
- ;;
i*86:DYNIX/ptx:4*:*)
# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
# earlier versions are messed up and put the nodename in both
@@ -1048,7 +997,7 @@ EOF
i*86:syllable:*:*)
echo ${UNAME_MACHINE}-pc-syllable
exit ;;
- i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+ i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
echo i386-unknown-lynxos${UNAME_RELEASE}
exit ;;
i*86:*DOS:*:*)
@@ -1092,8 +1041,11 @@ EOF
pc:*:*:*)
# Left here for compatibility:
# uname -m prints for DJGPP always 'pc', but it prints nothing about
- # the processor, so we play safe by assuming i386.
- echo i386-pc-msdosdjgpp
+ # the processor, so we play safe by assuming i586.
+ # Note: whatever this is, it MUST be the same as what config.sub
+ # prints for the "djgpp" host, or else GDB configury will decide that
+ # this is a cross-build.
+ echo i586-pc-msdosdjgpp
exit ;;
Intel:Mach:3*:*)
echo i386-pc-mach3
@@ -1131,6 +1083,16 @@ EOF
3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
&& { echo i486-ncr-sysv4; exit; } ;;
+ NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+ OS_REL='.3'
+ test -r /etc/.relid \
+ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+ /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+ && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
+ /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
echo m68k-unknown-lynxos${UNAME_RELEASE}
exit ;;
@@ -1143,7 +1105,7 @@ EOF
rs6000:LynxOS:2.*:*)
echo rs6000-unknown-lynxos${UNAME_RELEASE}
exit ;;
- PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+ PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
echo powerpc-unknown-lynxos${UNAME_RELEASE}
exit ;;
SM[BE]S:UNIX_SV:*:*)
@@ -1206,6 +1168,9 @@ EOF
BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
echo i586-pc-beos
exit ;;
+ BePC:Haiku:*:*) # Haiku running on Intel PC compatible.
+ echo i586-pc-haiku
+ exit ;;
SX-4:SUPER-UX:*:*)
echo sx4-nec-superux${UNAME_RELEASE}
exit ;;
@@ -1233,6 +1198,16 @@ EOF
*:Darwin:*:*)
UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
case $UNAME_PROCESSOR in
+ i386)
+ eval $set_cc_for_build
+ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_64BIT_ARCH >/dev/null
+ then
+ UNAME_PROCESSOR="x86_64"
+ fi
+ fi ;;
unknown) UNAME_PROCESSOR=powerpc ;;
esac
echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
@@ -1314,6 +1289,9 @@ EOF
i*86:rdos:*:*)
echo ${UNAME_MACHINE}-pc-rdos
exit ;;
+ i*86:AROS:*:*)
+ echo ${UNAME_MACHINE}-pc-aros
+ exit ;;
esac
#echo '(No uname command or uname output not recognized.)' 1>&2
@@ -1474,9 +1452,9 @@ This script, last modified $timestamp, has failed to recognize
the operating system you are using. It is advised that you
download the most up to date version of the config scripts from
- http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.guess
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
and
- http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.sub
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
If the version you run ($0) is already up to date, please
send the following data and any information you think might be
diff --git a/config.sub b/config.sub
index 5defff6..8ca084b 100755
--- a/config.sub
+++ b/config.sub
@@ -1,10 +1,10 @@
#! /bin/sh
# Configuration validation subroutine script.
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
-# Inc.
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+# Free Software Foundation, Inc.
-timestamp='2007-01-18'
+timestamp='2009-08-19'
# This file is (in principle) common to ALL GNU software.
# The presence of a machine in this file suggests that SOME GNU software
@@ -32,13 +32,16 @@ timestamp='2007-01-18'
# Please send patches to <config-patches at gnu.org>. Submit a context
-# diff and a properly formatted ChangeLog entry.
+# diff and a properly formatted GNU ChangeLog entry.
#
# Configuration subroutine to validate and canonicalize a configuration type.
# Supply the specified configuration type as an argument.
# If it is invalid, we print an error message on stderr and exit with code 1.
# Otherwise, we print the canonical config type on stdout and succeed.
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
# that are meaningful with *any* GNU software.
@@ -72,8 +75,8 @@ Report bugs and patches to <config-patches at gnu.org>."
version="\
GNU config.sub ($timestamp)
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
-Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -122,6 +125,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
case $maybe_os in
nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \
uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \
+ kopensolaris*-gnu* | \
storm-chaos* | os2-emx* | rtmk-nova*)
os=-$maybe_os
basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
@@ -148,10 +152,13 @@ case $os in
-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
- -apple | -axis | -knuth | -cray)
+ -apple | -axis | -knuth | -cray | -microblaze)
os=
basic_machine=$1
;;
+ -bluegene*)
+ os=-cnk
+ ;;
-sim | -cisco | -oki | -wec | -winbond)
os=
basic_machine=$1
@@ -249,13 +256,16 @@ case $basic_machine in
| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
| i370 | i860 | i960 | ia64 \
| ip2k | iq2000 \
+ | lm32 \
| m32c | m32r | m32rle | m68000 | m68k | m88k \
- | maxq | mb | microblaze | mcore | mep \
+ | maxq | mb | microblaze | mcore | mep | metag \
| mips | mipsbe | mipseb | mipsel | mipsle \
| mips16 \
| mips64 | mips64el \
- | mips64vr | mips64vrel \
+ | mips64octeon | mips64octeonel \
| mips64orion | mips64orionel \
+ | mips64r5900 | mips64r5900el \
+ | mips64vr | mips64vrel \
| mips64vr4100 | mips64vr4100el \
| mips64vr4300 | mips64vr4300el \
| mips64vr5000 | mips64vr5000el \
@@ -268,6 +278,7 @@ case $basic_machine in
| mipsisa64sr71k | mipsisa64sr71kel \
| mipstx39 | mipstx39el \
| mn10200 | mn10300 \
+ | moxie \
| mt \
| msp430 \
| nios | nios2 \
@@ -277,7 +288,7 @@ case $basic_machine in
| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
| pyramid \
| score \
- | sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
| sh64 | sh64le \
| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -286,7 +297,7 @@ case $basic_machine in
| v850 | v850e \
| we32k \
| x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \
- | z8k)
+ | z8k | z80)
basic_machine=$basic_machine-unknown
;;
m6811 | m68hc11 | m6812 | m68hc12)
@@ -329,14 +340,17 @@ case $basic_machine in
| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
| i*86-* | i860-* | i960-* | ia64-* \
| ip2k-* | iq2000-* \
+ | lm32-* \
| m32c-* | m32r-* | m32rle-* \
| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
- | m88110-* | m88k-* | maxq-* | mcore-* \
+ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
| mips16-* \
| mips64-* | mips64el-* \
- | mips64vr-* | mips64vrel-* \
+ | mips64octeon-* | mips64octeonel-* \
| mips64orion-* | mips64orionel-* \
+ | mips64r5900-* | mips64r5900el-* \
+ | mips64vr-* | mips64vrel-* \
| mips64vr4100-* | mips64vr4100el-* \
| mips64vr4300-* | mips64vr4300el-* \
| mips64vr5000-* | mips64vr5000el-* \
@@ -358,20 +372,24 @@ case $basic_machine in
| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
| pyramid-* \
| romp-* | rs6000-* \
- | sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
+ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
| sparclite-* \
| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \
| tahoe-* | thumb-* \
- | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* | tile-* \
| tron-* \
| v850-* | v850e-* | vax-* \
| we32k-* \
| x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \
- | xstormy16-* | xtensa-* \
+ | xstormy16-* | xtensa*-* \
| ymp-* \
- | z8k-*)
+ | z8k-* | z80-*)
+ ;;
+ # Recognize the basic CPU types without company name, with glob match.
+ xtensa*)
+ basic_machine=$basic_machine-unknown
;;
# Recognize the various machine names and aliases which stand
# for a CPU type and a company and sometimes even an OS.
@@ -435,6 +453,10 @@ case $basic_machine in
basic_machine=m68k-apollo
os=-bsd
;;
+ aros)
+ basic_machine=i386-pc
+ os=-aros
+ ;;
aux)
basic_machine=m68k-apple
os=-aux
@@ -443,10 +465,26 @@ case $basic_machine in
basic_machine=ns32k-sequent
os=-dynix
;;
+ blackfin)
+ basic_machine=bfin-unknown
+ os=-linux
+ ;;
+ blackfin-*)
+ basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=-linux
+ ;;
+ bluegene*)
+ basic_machine=powerpc-ibm
+ os=-cnk
+ ;;
c90)
basic_machine=c90-cray
os=-unicos
;;
+ cegcc)
+ basic_machine=arm-unknown
+ os=-cegcc
+ ;;
convex-c1)
basic_machine=c1-convex
os=-bsd
@@ -475,8 +513,8 @@ case $basic_machine in
basic_machine=craynv-cray
os=-unicosmp
;;
- cr16c)
- basic_machine=cr16c-unknown
+ cr16)
+ basic_machine=cr16-unknown
os=-elf
;;
crds | unos)
@@ -514,6 +552,10 @@ case $basic_machine in
basic_machine=m88k-motorola
os=-sysv3
;;
+ dicos)
+ basic_machine=i686-pc
+ os=-dicos
+ ;;
djgpp)
basic_machine=i586-pc
os=-msdosdjgpp
@@ -668,6 +710,14 @@ case $basic_machine in
basic_machine=m68k-isi
os=-sysv
;;
+ m68knommu)
+ basic_machine=m68k-unknown
+ os=-linux
+ ;;
+ m68knommu-*)
+ basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=-linux
+ ;;
m88k-omron*)
basic_machine=m88k-omron
;;
@@ -679,10 +729,17 @@ case $basic_machine in
basic_machine=ns32k-utek
os=-sysv
;;
+ microblaze)
+ basic_machine=microblaze-xilinx
+ ;;
mingw32)
basic_machine=i386-pc
os=-mingw32
;;
+ mingw32ce)
+ basic_machine=arm-unknown
+ os=-mingw32ce
+ ;;
miniframe)
basic_machine=m68000-convergent
;;
@@ -809,6 +866,14 @@ case $basic_machine in
basic_machine=i860-intel
os=-osf
;;
+ parisc)
+ basic_machine=hppa-unknown
+ os=-linux
+ ;;
+ parisc-*)
+ basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=-linux
+ ;;
pbd)
basic_machine=sparc-tti
;;
@@ -1017,6 +1082,10 @@ case $basic_machine in
basic_machine=tic6x-unknown
os=-coff
;;
+ tile*)
+ basic_machine=tile-unknown
+ os=-linux-gnu
+ ;;
tx39)
basic_machine=mipstx39-unknown
;;
@@ -1092,6 +1161,10 @@ case $basic_machine in
basic_machine=z8k-unknown
os=-sim
;;
+ z80-*-coff)
+ basic_machine=z80-unknown
+ os=-sim
+ ;;
none)
basic_machine=none-none
os=-none
@@ -1130,7 +1203,7 @@ case $basic_machine in
we32k)
basic_machine=we32k-att
;;
- sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele)
+ sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
basic_machine=sh-unknown
;;
sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
@@ -1200,10 +1273,11 @@ case $os in
# Each alternative MUST END IN A *, to match a version number.
# -sysv* is not here because it comes later, after sysvr4.
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
- | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
| -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+ | -kopensolaris* \
| -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
- | -aos* \
+ | -aos* | -aros* \
| -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
| -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
| -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
@@ -1212,7 +1286,7 @@ case $os in
| -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
| -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
| -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
- | -chorusos* | -chorusrdb* \
+ | -chorusos* | -chorusrdb* | -cegcc* \
| -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
| -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \
| -uxpv* | -beos* | -mpeix* | -udk* \
@@ -1352,6 +1426,9 @@ case $os in
-zvmoe)
os=-zvmoe
;;
+ -dicos*)
+ os=-dicos
+ ;;
-none)
;;
*)
@@ -1549,7 +1626,7 @@ case $basic_machine in
-sunos*)
vendor=sun
;;
- -aix*)
+ -cnk*|-aix*)
vendor=ibm
;;
-beos*)
diff --git a/configure b/configure
index 53198e8..b254383 100755
--- a/configure
+++ b/configure
@@ -7,10 +7,12 @@ echo ""
echo "available options:"
echo ""
echo " --help print this message"
-echo " --disable-avis-input disables avisynth input (win32 only)"
+echo " --disable-avs-input disables avisynth input (win32 only)"
+echo " --disable-lavf-input disables libavformat input"
+echo " --disable-ffms-input disables ffmpegsource input"
echo " --disable-mp4-output disables mp4 output (using gpac)"
echo " --disable-pthread disables multithreaded encoding"
-echo " --disable-asm disables assembly optimizations on x86"
+echo " --disable-asm disables assembly optimizations on x86 and arm"
echo " --enable-debug adds -g, doesn't strip"
echo " --enable-gprof adds -pg, doesn't strip"
echo " --enable-visualize enables visualization (X11 only)"
@@ -25,24 +27,83 @@ echo ""
exit 1
fi
+log_check() {
+ echo -n "checking $1... " >> config.log
+}
+
+log_ok() {
+ echo "yes" >> config.log
+}
+
+log_fail() {
+ echo "no" >> config.log
+}
+
+log_msg() {
+ echo "$1" >> config.log
+}
+
cc_check() {
+ if [ -z "$3" ]; then
+ if [ -z "$1$2" ]; then
+ log_check "whether $CC works"
+ elif [ -z "$1" ]; then
+ log_check "for $2"
+ else
+ log_check "for $1"
+ fi
+ elif [ -z "$1" ]; then
+ log_check "whether $CC supports $3"
+ else
+ log_check "for $3 in $1";
+ fi
rm -f conftest.c
[ -n "$1" ] && echo "#include <$1>" > conftest.c
echo "int main () { $3 return 0; }" >> conftest.c
- $CC conftest.c $CFLAGS $LDFLAGS $2 -o conftest 2>$DEVNULL
+ if $CC conftest.c $CFLAGS $LDFLAGS $LDFLAGSCLI $2 -o conftest >conftest.log 2>&1; then
+ res=$?
+ log_ok
+ else
+ res=$?
+ log_fail
+ log_msg "Failed commandline was:"
+ log_msg "--------------------------------------------------"
+ log_msg "$CC conftest.c $CFLAGS $LDFLAGS $LDFLAGSCLI $2"
+ cat conftest.log >> config.log
+ log_msg "--------------------------------------------------"
+ fi
+ return $res
}
as_check() {
+ log_check "whether $AS supports $1"
echo "$1" > conftest.asm
- $AS conftest.asm $ASFLAGS $2 -o conftest.o 2>$DEVNULL
+ if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then
+ res=$?
+ log_ok
+ else
+ res=$?
+ log_fail
+ log_msg "Failed commandline was:"
+ log_msg "--------------------------------------------------"
+ log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o"
+ cat conftest.log >> config.log
+ log_msg "--------------------------------------------------"
+ fi
+ return $res
+}
+
+define() {
+ echo "#define $1$([ -n "$2" ] && echo " $2")" >> config.h
}
die() {
+ log_msg "DIED: $@"
echo "$@"
exit 1
}
-rm -f config.h config.mak x264.pc conftest*
+rm -f config.h config.mak config.log x264.pc conftest*
prefix='/usr/local'
exec_prefix='${prefix}'
@@ -51,7 +112,9 @@ libdir='${exec_prefix}/lib'
includedir='${prefix}/include'
DEVNULL='/dev/null'
-avis_input="auto"
+avs_input="auto"
+lavf_input="auto"
+ffms_input="auto"
mp4_output="auto"
pthread="auto"
asm="yes"
@@ -63,6 +126,7 @@ shared="no"
CFLAGS="$CFLAGS -Wall -I."
LDFLAGS="$LDFLAGS"
+LDFLAGSCLI="$LDFLAGSCLI"
ASFLAGS="$ASFLAGS"
HAVE_GETOPT_LONG=1
cross_prefix=""
@@ -95,11 +159,23 @@ for opt do
--disable-asm)
asm="no"
;;
- --enable-avis-input)
- avis_input="yes"
+ --enable-avs-input)
+ avs_input="auto"
+ ;;
+ --disable-avs-input)
+ avs_input="no"
+ ;;
+ --enable-lavf-input)
+ lavf_input="auto"
+ ;;
+ --disable-lavf-input)
+ lavf_input="no"
;;
- --disable-avis-input)
- avis_input="no"
+ --enable-ffms-input)
+ ffms_input="auto"
+ ;;
+ --disable-ffms-input)
+ ffms_input="no"
;;
--enable-mp4-output)
mp4_output="yes"
@@ -138,7 +214,7 @@ for opt do
;;
--enable-visualize)
LDFLAGS="$LDFLAGS -L/usr/X11R6/lib -lX11"
- CFLAGS="$CFLAGS -DVISUALIZE=1"
+ define HAVE_VISUALIZE
vis="yes"
;;
--host=*)
@@ -157,7 +233,6 @@ CC="${CC-${cross_prefix}gcc}"
AR="${AR-${cross_prefix}ar}"
RANLIB="${RANLIB-${cross_prefix}ranlib}"
STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
if [ "x$host" = x ]; then
host=`./config.guess`
@@ -174,14 +249,14 @@ host_os="${host#*-}"
case $host_os in
beos*)
SYS="BEOS"
- CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+ define HAVE_MALLOC_H
;;
darwin*)
SYS="MACOSX"
CFLAGS="$CFLAGS -falign-loops=16"
- LDFLAGS="$LDFLAGS -lm -lmx"
+ LDFLAGS="$LDFLAGS -lm"
if [ "$pic" = "no" ]; then
- CFLAGS="$CFLAGS -mdynamic-no-pic"
+ cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic"
fi
;;
freebsd*)
@@ -190,7 +265,7 @@ case $host_os in
;;
kfreebsd*-gnu)
SYS="FREEBSD"
- CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+ define HAVE_MALLOC_H
LDFLAGS="$LDFLAGS -lm"
;;
netbsd*)
@@ -204,7 +279,7 @@ case $host_os in
;;
*linux*)
SYS="LINUX"
- CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+ define HAVE_MALLOC_H
LDFLAGS="$LDFLAGS -lm"
;;
cygwin*)
@@ -223,7 +298,7 @@ case $host_os in
;;
sunos*|solaris*)
SYS="SunOS"
- CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+ define HAVE_MALLOC_H
LDFLAGS="$LDFLAGS -lm"
HAVE_GETOPT_LONG=0
;;
@@ -240,6 +315,9 @@ case $host_cpu in
if [[ "$asm" == yes && "$CFLAGS" != *-march* ]]; then
CFLAGS="$CFLAGS -march=i686"
fi
+ if [[ "$asm" == yes && "$CFLAGS" != *-mfpmath* ]]; then
+ CFLAGS="$CFLAGS -mfpmath=sse -msse"
+ fi
if [ "$SYS" = MACOSX ]; then
ASFLAGS="$ASFLAGS -f macho -DPREFIX"
elif [ "$SYS" = MINGW ]; then
@@ -253,8 +331,10 @@ case $host_cpu in
AS="yasm"
if [ "$SYS" = MACOSX ];then
ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX"
- CFLAGS="$CFLAGS -arch x86_64"
- LDFLAGS="$LDFLAGS -arch x86_64"
+ if cc_check '' "-arch x86_64"; then
+ CFLAGS="$CFLAGS -arch x86_64"
+ LDFLAGS="$LDFLAGS -arch x86_64"
+ fi
elif [ "$SYS" = MINGW ]; then
ASFLAGS="$ASFLAGS -f win32 -m amd64 -DPREFIX"
else
@@ -265,9 +345,10 @@ case $host_cpu in
ARCH="PPC"
if [ $SYS = MACOSX ]
then
- ALTIVECFLAGS="$ALTIVECFLAGS -faltivec -fastf -mcpu=G4"
+ CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4"
else
- ALTIVECFLAGS="$ALTIVECFLAGS -maltivec -mabi=altivec -DHAVE_ALTIVEC_H"
+ CFLAGS="$CFLAGS -maltivec -mabi=altivec"
+ define HAVE_ALTIVEC_H
fi
;;
sparc)
@@ -286,6 +367,7 @@ case $host_cpu in
;;
arm*)
ARCH="ARM"
+ AS="${AS-${cross_prefix}gcc}"
;;
s390|s390x)
ARCH="S390"
@@ -298,23 +380,39 @@ case $host_cpu in
;;
esac
+log_msg "x264 configure script"
+if [ -n "$*" ]; then
+ msg="Command line options:"
+ for i in $@; do
+ msg="$msg \"$i\""
+ done
+ log_msg "$msg"
+fi
+log_msg ""
+
# check requirements
cc_check || die "No working C compiler found."
-if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" \) ] ; then
+if cc_check '' -std=gnu99 ; then
+ CFLAGS="$CFLAGS -std=gnu99"
+elif cc_check '' -std=c99 ; then
+ CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE"
+fi
+
+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" \) ] ; then
pic="yes"
fi
if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "pinsrd xmm0, [esp], 0" ; then
+ if ! as_check "lzcnt eax, eax" ; then
VER=`($AS --version || echo no assembler) 2>$DEVNULL | head -n 1`
echo "Found $VER"
- echo "Minimum version is yasm-0.6.1"
+ echo "Minimum version is yasm-0.6.2"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
- if ! cc_check '' '' 'asm("pabsw %xmm0, %xmm0");' ; then
+ if ! cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' ; then
VER=`(as --version || echo no gnu as) 2>$DEVNULL | head -n 1`
echo "Found $VER"
echo "Minimum version is binutils-2.17"
@@ -322,16 +420,37 @@ if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
- CFLAGS="$CFLAGS -DHAVE_MMX"
+ define HAVE_MMX
fi
+
+if [ $asm = yes -a $ARCH = ARM ] ; then
+ # set flags so neon is built by default
+ echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-mfloat-abi)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp"
+
+ if cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6 && ASFLAGS="$ASFLAGS -DHAVE_ARMV6"
+ cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2 && ASFLAGS="$ASFLAGS -DHAVE_ARMV6T2"
+ cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON && ASFLAGS="$ASFLAGS -DHAVE_NEON"
+ ASFLAGS="$ASFLAGS -c"
+ else
+ echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS."
+ echo "If you really want to run on such a CPU, configure with --disable-asm."
+ exit 1
+ fi
+fi
+
[ $asm = no ] && AS=""
[ "x$AS" = x ] && asm="no"
-CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
+define ARCH_$ARCH
+define SYS_$SYS
-echo "unsigned int endian = 'B' << 24 | 'I' << 16 | 'G' << 8 | 'E';" > conftest.c
+echo "int i = 0x42494745; double f = 0x1.0656e6469616ep+102;" > conftest.c
$CC $CFLAGS conftest.c -c -o conftest.o 2>$DEVNULL || die "endian test failed"
-grep -q BIGE conftest.o && CFLAGS="$CFLAGS -DWORDS_BIGENDIAN"
+if grep -q BIGE conftest.o && grep -q FPendian conftest.o ; then
+ define WORDS_BIGENDIAN
+elif !(grep -q EGIB conftest.o && grep -q naidnePF conftest.o) ; then
+ die "endian test failed"
+fi
# autodetect options that weren't forced nor disabled
@@ -352,11 +471,11 @@ if test "$pthread" = "auto" ; then
elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
pthread="yes"
libpthread="-lpthreadGC2 -lwsock32"
- CFLAGS="$CFLAGS -DPTW32_STATIC_LIB"
+ define PTW32_STATIC_LIB
elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
pthread="yes"
libpthread="-lpthreadGC2 -lws2_32"
- CFLAGS="$CFLAGS -DPTW32_STATIC_LIB"
+ define PTW32_STATIC_LIB
fi
;;
OPENBSD)
@@ -368,10 +487,82 @@ if test "$pthread" = "auto" ; then
esac
fi
if test "$pthread" = "yes" ; then
- CFLAGS="$CFLAGS -DHAVE_PTHREAD"
+ define HAVE_PTHREAD
LDFLAGS="$LDFLAGS $libpthread"
fi
+if cc_check "math.h" "-Werror" "return log2f(2);" ; then
+ define HAVE_LOG2F
+fi
+
+if [ "$lavf_input" = "auto" ] ; then
+ lavf_input="no"
+ if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>$DEVNULL; then
+ LAVF_LIBS="$LAVF_LIBS $(${cross_prefix}pkg-config --libs libavformat libavcodec libswscale)"
+ LAVF_CFLAGS="$LAVF_CFLAGS $(${cross_prefix}pkg-config --cflags libavformat libavcodec libswscale)"
+ fi
+ if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then
+ LAVF_LIBS="-lavformat -lswscale"
+ for lib in -lpostproc -lavcodec -lavutil -lm -lz -lbz2 $libpthread -lavifil32; do
+ cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib"
+ done
+ fi
+ LAVF_LIBS="-L. $LAVF_LIBS"
+ if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" && \
+ cc_check libswscale/swscale.h "$LAVF_CFLAGS $LAVF_LIBS" ; then
+ # avcodec_decode_video2 is currently the most recently added function that we use; it was added in r18351
+ if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avcodec_decode_video2( NULL, NULL, NULL, NULL );" ; then
+ lavf_input="yes"
+ define LAVF_INPUT
+ else
+ echo "Warning: libavformat is too old, update to ffmpeg r18351+"
+ fi
+ fi
+fi
+
+if [ "$ffms_input" = "auto" ] ; then
+ ffms_major="2"; ffms_minor="13"; ffms_micro="1"; ffms_bump="0"
+
+ ffms_input="no"
+ [ $ffms_micro -gt 0 -o $ffms_bump -gt 0 ] && vmicro=".$ffms_micro"
+ [ $ffms_bump -gt 0 ] && vbump=".$ffms_bump"
+ if ${cross_prefix}pkg-config --atleast-version="$ffms_major.$ffms_minor$vmicro$vbump" ffms2 2>$DEVNULL; then
+ FFMS2_LIBS="$FFMS2_LIBS $(${cross_prefix}pkg-config --libs ffms2)"
+ FFMS2_CFLAGS="$FFMS2_LIBS $(${cross_prefix}pkg-config --cflags ffms2)"
+ api_check="no"
+ else
+ api_check="yes"
+ fi
+ [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2"
+
+ if cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS" "FFMS_DestroyVideoSource(0);" ; then
+ ffms_input="yes"
+ elif cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS -lstdc++ $LAVF_LIBS" "FFMS_DestroyVideoSource(0);" ; then
+ ffms_input="yes"
+ FFMS2_LIBS="$FFMS2_LIBS -lstdc++ $LAVF_LIBS"
+ fi
+
+ if [ $api_check = "yes" -a $ffms_input = "yes" ]; then
+ log_check "whether ffms2 version is at least $ffms_major.$ffms_minor$vmicro$vbump"
+ $CC $CFLAGS $FFMS2_CFLAGS -c -o conftest -x c - >$DEVNULL 2>&1 <<EOF
+#include <ffms.h>
+#if FFMS_VERSION < (($ffms_major << 24) | ($ffms_minor << 16) | ($ffms_micro << 8) | $ffms_bump)
+#error Requires ffms2 version 2.13.1
+#endif
+EOF
+ [ $? = 0 ] && log_ok || { ffms_input="no"; log_fail; }
+ fi
+fi
+
+if [ "$ffms_input" = "yes" ]; then
+ LDFLAGSCLI="$FFMS2_LIBS $LDFLAGSCLI"
+ [ -n "$FFMS2_CFLAGS" ] && CFLAGS="$CFLAGS $FFMS2_CFLAGS"
+ define FFMS_INPUT
+elif [ "$lavf_input" = "yes" ]; then
+ LDFLAGSCLI="$LAVF_LIBS $LDFLAGSCLI"
+ [ -n "$LAVF_CFLAGS" ] && CFLAGS="$CFLAGS $LAVF_CFLAGS"
+fi
+
MP4_LDFLAGS="-lgpac_static"
if [ $SYS = MINGW ]; then
MP4_LDFLAGS="$MP4_LDFLAGS -lwinmm"
@@ -381,26 +572,19 @@ if [ "$mp4_output" = "auto" ] ; then
cc_check gpac/isomedia.h "$MP4_LDFLAGS" && mp4_output="yes"
fi
if [ "$mp4_output" = "yes" ] ; then
- echo "#define MP4_OUTPUT" >> config.h
- LDFLAGS="$LDFLAGS $MP4_LDFLAGS"
+ define MP4_OUTPUT
+ LDFLAGSCLI="$LDFLAGSCLI $MP4_LDFLAGS"
fi
-if [ "$avis_input" = "auto" ] ; then
- if [ $SYS = MINGW ]; then
- avis_input="yes"
- else
- avis_input="no";
- fi
-fi
-if [ "$avis_input" = "yes" ] ; then
- if cc_check "stdlib.h" -lvfw32 ; then
- echo "#define AVIS_INPUT" >> config.h
- LDFLAGS="$LDFLAGS -lvfw32"
- elif cc_check "stdlib.h" -lavifil32 ; then
- echo "#define AVIS_INPUT" >> config.h
- LDFLAGS="$LDFLAGS -lavifil32"
- else
- avis_input="no";
+if [ "$avs_input" = "auto" ] ; then
+ avs_input=no
+ if [ $SYS = MINGW ] && cc_check avisynth_c.h ; then
+ avs_input="yes"
+ define AVS_INPUT
+ define HAVE_AVISYNTH_C_H
+ elif [ $SYS = MINGW ] && cc_check extras/avisynth_c.h ; then
+ avs_input="yes"
+ define AVS_INPUT
fi
fi
@@ -418,16 +602,20 @@ fi
if [ "$debug" = "yes" ]; then
CFLAGS="-O1 -g $CFLAGS"
+elif [ $ARCH = ARM ]; then
+ # arm-gcc-4.2 produces incorrect output with -ffast-math
+ # and it doesn't save any speed anyway on 4.4, so disable it
+ CFLAGS="-O3 -fno-fast-math $CFLAGS"
else
- CFLAGS="-O4 -ffast-math $CFLAGS"
+ CFLAGS="-O3 -ffast-math $CFLAGS"
fi
if cc_check "stdio.h" "" "fseeko(stdin,0,0);" ; then
- echo "#define fseek fseeko" >> config.h
- echo "#define ftell ftello" >> config.h
+ define fseek fseeko
+ define ftell ftello
elif cc_check "stdio.h" "" "fseeko64(stdin,0,0);" ; then
- echo "#define fseek fseeko64" >> config.h
- echo "#define ftell ftello64" >> config.h
+ define fseek fseeko64
+ define ftell ftello64
fi
rm -f conftest*
@@ -444,8 +632,8 @@ ARCH=$ARCH
SYS=$SYS
CC=$CC
CFLAGS=$CFLAGS
-ALTIVECFLAGS=$ALTIVECFLAGS
LDFLAGS=$LDFLAGS
+LDFLAGSCLI=$LDFLAGSCLI
AR=$AR
RANLIB=$RANLIB
STRIP=$STRIP
@@ -496,18 +684,27 @@ Libs: $pclibs
Cflags: -I$includedir
EOF
+cat > conftest.log <<EOF
+Platform: $ARCH
+System: $SYS
+asm: $asm
+avs input: $avs_input
+lavf input: $lavf_input
+ffms input: $ffms_input
+mp4 output: $mp4_output
+pthread: $pthread
+debug: $debug
+gprof: $gprof
+PIC: $pic
+shared: $shared
+visualize: $vis
+EOF
+
+echo >> config.log
+cat conftest.log >> config.log
+cat conftest.log
+rm conftest.log
-echo "Platform: $ARCH"
-echo "System: $SYS"
-echo "asm: $asm"
-echo "avis input: $avis_input"
-echo "mp4 output: $mp4_output"
-echo "pthread: $pthread"
-echo "debug: $debug"
-echo "gprof: $gprof"
-echo "PIC: $pic"
-echo "shared: $shared"
-echo "visualize: $vis"
echo
echo "You can run 'make' or 'make fprofiled' now."
diff --git a/doc/standards.txt b/doc/standards.txt
index 4ebb165..db9a691 100644
--- a/doc/standards.txt
+++ b/doc/standards.txt
@@ -1,5 +1,5 @@
-x264 is written in C. The particular variant of C is: intersection of gcc-2.95 and msvc. This means C89 + a few C99 features.
-The extra utilities (mostly checkasm) are written in C99, with no attempt at compatibility with old compilers.
+x264 is written in C. The particular variant of C is: intersection of C99 and gcc>=3.4.
+checkasm is written in gcc, with no attempt at compatibility with anything else.
We make the following additional assumptions which are true of real systems but not guaranteed by C99:
* Two's complement.
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 7ec43c8..666596b 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -24,10 +24,7 @@
#define _ISOC99_SOURCE
#include <math.h>
-#include <limits.h>
-#ifndef _MSC_VER
#include <unistd.h>
-#endif
#include "common/common.h"
#include "common/cpu.h"
@@ -47,7 +44,7 @@ typedef struct
/* 8x8 */
int i_cost8x8;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
- DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
+ ALIGNED_4( int16_t mvc[32][5][2] );
x264_me_t me8x8[4];
/* Sub 4x4 */
@@ -78,15 +75,15 @@ typedef struct
int i_lambda;
int i_lambda2;
int i_qp;
- int16_t *p_cost_mv;
- uint16_t *p_cost_ref0;
- uint16_t *p_cost_ref1;
+ uint16_t *p_cost_mv;
+ uint16_t *p_cost_ref[2];
int i_mbrd;
/* I: Intra part */
/* Take some shortcuts in intra search if intra is deemed unlikely */
int b_fast_intra;
+ int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
int b_try_pskip;
/* Luma part */
@@ -106,7 +103,7 @@ typedef struct
/* Chroma part */
int i_satd_i8x8chroma;
- int i_satd_i8x8chroma_dir[4];
+ int i_satd_i8x8chroma_dir[7];
int i_predict8x8chroma;
/* II: Inter part P/B frame */
@@ -135,7 +132,7 @@ typedef struct
} x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
-const int x264_lambda_tab[52] = {
+const uint8_t x264_lambda_tab[52] = {
1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
1, 1, 1, 1, /* 8-11 */
1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
@@ -156,82 +153,217 @@ const int x264_lambda2_tab[52] = {
943718, 1189010, 1498059, 1887436 /* 48 - 51 */
};
+const uint8_t x264_exp2_lut[64] = {
+ 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
+ 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
+ 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
+ 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
+};
+
+const float x264_log2_lut[128] = {
+ 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
+ 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
+ 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
+ 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
+ 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
+ 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
+ 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
+ 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
+ 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
+ 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
+ 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
+ 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
+ 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
+ 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
+ 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
+ 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
+};
+
+/* Avoid an int/float conversion. */
+const float x264_log2_lz_lut[32] = {
+ 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+};
+
+// should the intra and inter lambdas be different?
+// I'm just matching the behaviour of deadzone quant.
+static const int x264_trellis_lambda2_tab[2][52] = {
+ // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
+ { 46, 58, 73, 92, 117, 147,
+ 185, 233, 294, 370, 466, 587,
+ 740, 932, 1174, 1480, 1864, 2349,
+ 2959, 3728, 4697, 5918, 7457, 9395,
+ 11837, 14914, 18790, 23674, 29828, 37581,
+ 47349, 59656, 75163, 94699, 119313, 150326,
+ 189399, 238627, 300652, 378798, 477255, 601304,
+ 757596, 954511, 1202608, 1515192, 1909022, 2405217,
+ 3030384, 3818045, 4810435, 6060769 },
+ // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
+ { 27, 34, 43, 54, 68, 86,
+ 108, 136, 172, 216, 273, 343,
+ 433, 545, 687, 865, 1090, 1374,
+ 1731, 2180, 2747, 3461, 4361, 5494,
+ 6922, 8721, 10988, 13844, 17442, 21976,
+ 27688, 34885, 43953, 55377, 69771, 87906,
+ 110755, 139543, 175813, 221511, 279087, 351627,
+ 443023, 558174, 703255, 886046, 1116348, 1406511,
+ 1772093, 2232697, 2813022, 3544186 }
+};
+
+static const uint16_t x264_chroma_lambda2_offset_tab[] = {
+ 16, 20, 25, 32, 40, 50,
+ 64, 80, 101, 128, 161, 203,
+ 256, 322, 406, 512, 645, 812,
+ 1024, 1290, 1625, 2048, 2580, 3250,
+ 4096, 5160, 6501, 8192, 10321, 13003,
+ 16384, 20642, 26007, 32768, 41285, 52015,
+ 65535
+};
+
/* TODO: calculate CABAC costs */
-static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
+static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
-static const int i_mb_b16x8_cost_table[17] = {
+static const uint8_t i_mb_b16x8_cost_table[17] = {
0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
-static const int i_sub_mb_b_cost_table[13] = {
+static const uint8_t i_sub_mb_b_cost_table[13] = {
7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
-static const int i_sub_mb_p_cost_table[4] = {
+static const uint8_t i_sub_mb_p_cost_table[4] = {
5, 3, 3, 1
};
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
-/* Indexed by lambda instead of qp because, due to rounding,
- * some quantizers share lambdas. This saves memory. */
-uint16_t *x264_cost_mv_fpel[92][4];
-uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[92][3][33];
+static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
-/* initialize an array of lambda*nbits for all possible mvs */
-static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+int x264_analyse_init_costs( x264_t *h, int qp )
{
- static int16_t *p_cost_mv[92];
int i, j;
-
- if( !p_cost_mv[a->i_lambda] )
+ int lambda = x264_lambda_tab[qp];
+ if( h->cost_mv[lambda] )
+ return 0;
+ /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
+ CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
+ h->cost_mv[lambda] += 2*4*2048;
+ for( i = 0; i <= 2*4*2048; i++ )
+ {
+ h->cost_mv[lambda][-i] =
+ h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+ }
+ x264_pthread_mutex_lock( &cost_ref_mutex );
+ for( i = 0; i < 3; i++ )
+ for( j = 0; j < 33; j++ )
+ x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
+ x264_pthread_mutex_unlock( &cost_ref_mutex );
+ if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
{
- x264_emms();
- /* could be faster, but isn't called many times */
- /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
- p_cost_mv[a->i_lambda] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
- p_cost_mv[a->i_lambda] += 2*4*2048;
- for( i = 0; i <= 2*4*2048; i++ )
+ for( j=0; j<4; j++ )
{
- p_cost_mv[a->i_lambda][-i] =
- p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+ CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
+ h->cost_mv_fpel[lambda][j] += 2*2048;
+ for( i = -2*2048; i < 2*2048; i++ )
+ h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
}
- for( i = 0; i < 3; i++ )
- for( j = 0; j < 33; j++ )
- x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
}
- a->p_cost_mv = p_cost_mv[a->i_lambda];
- a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
- a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+ return 0;
+fail:
+ return -1;
+}
- /* FIXME is this useful for all me methods? */
- if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
+void x264_analyse_free_costs( x264_t *h )
+{
+ int i, j;
+ for( i = 0; i < 92; i++ )
{
- for( j=0; j<4; j++ )
+ if( h->cost_mv[i] )
+ x264_free( h->cost_mv[i] - 2*4*2048 );
+ if( h->cost_mv_fpel[i][0] )
+ for( j = 0; j < 4; j++ )
+ x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
+ }
+}
+
+void x264_analyse_weight_frame( x264_t *h, int end )
+{
+ int j;
+ for( j=0; j<h->i_ref0; j++ )
+ {
+ if( h->sh.weight[j][0].weightfn )
{
- x264_cost_mv_fpel[a->i_lambda][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) );
- x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
- for( i = -2*2048; i < 2*2048; i++ )
- x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
+ x264_frame_t *frame = h->fref0[j];
+ int width = frame->i_width[0] + 2*PADH;
+ int i_padv = PADV << h->param.b_interlaced;
+ int offset, height;
+ uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
+ int k;
+ height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
+ offset = h->fenc->i_lines_weighted*frame->i_stride[0];
+ h->fenc->i_lines_weighted += height;
+ if( height )
+ {
+ for( k = j; k < h->i_ref0; k++ )
+ if( h->sh.weight[k][0].weightfn )
+ {
+ uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+ x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
+ src + offset, frame->i_stride[0],
+ width, height, &h->sh.weight[k][0] );
+ }
+ }
+ break;
}
}
}
-static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
+/* initialize an array of lambda*nbits for all possible mvs */
+static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+{
+ a->p_cost_mv = h->cost_mv[a->i_lambda];
+ a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+ a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+}
+
+static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
- int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
- /* mbrd == 1 -> RD mode decision */
- /* mbrd == 2 -> RD refinement */
- a->i_mbrd = (i>=6) + (i>=8);
/* conduct the analysis using this lamda and QP */
a->i_qp = h->mb.i_qp = i_qp;
h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
+
a->i_lambda = x264_lambda_tab[i_qp];
a->i_lambda2 = x264_lambda2_tab[i_qp];
+
+ h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
+ if( h->param.analyse.i_trellis )
+ {
+ h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
+ h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
+ h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
+ h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
+ }
+ h->mb.i_psy_rd_lambda = a->i_lambda;
+ /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
+ h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
+
+}
+
+static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
+{
+ int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
+
+ /* mbrd == 1 -> RD mode decision */
+ /* mbrd == 2 -> RD refinement */
+ /* mbrd == 3 -> QPRD */
+ a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
+
+ x264_mb_analyse_init_qp( h, a, i_qp );
+
h->mb.i_me_method = h->param.analyse.i_me_method;
h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
&& h->mb.i_subpel_refine >= 5;
- h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
+
h->mb.b_transform_8x8 = 0;
h->mb.b_noise_reduction = 0;
@@ -257,8 +389,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
int i_fmv_range = 4 * h->param.analyse.i_mv_range;
// limit motion search to a slightly smaller range than the theoretical limit,
// since the search may go a few iterations past its given range
- int i_fpel_border = 5; // umh unconditional radius
- int i_spel_border = 8; // 1.5 for subpel_satd, 1.5 for subpel_rd, 2 for bime, round up
+ int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
/* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
@@ -266,15 +397,23 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
+ if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
+ {
+ int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
+ int max_mv = max_x - 4*16*h->mb.i_mb_x;
+ /* If we're left of the refresh bar, don't reference right of it. */
+ if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
+ h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
+ }
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
- if( h->mb.i_mb_x == 0)
+ if( h->mb.i_mb_x == 0 )
{
int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
int thread_mvy_range = i_fmv_range;
- if( h->param.i_threads > 1 )
+ if( h->i_thread_frames > 1 )
{
int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
int thresh = pix_y + h->param.analyse.i_mv_range_thread;
@@ -284,19 +423,22 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
int i_ref = i ? h->i_ref1 : h->i_ref0;
for( j=0; j<i_ref; j++ )
{
- x264_frame_cond_wait( fref[j], thresh );
- thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
+ x264_frame_cond_wait( fref[j]->orig, thresh );
+ thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
}
}
+
if( h->param.b_deterministic )
thread_mvy_range = h->param.analyse.i_mv_range_thread;
if( h->mb.b_interlaced )
thread_mvy_range >>= 1;
+
+ x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
}
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
- h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], X264_MAX(4*(-512+i_spel_border), -i_fmv_range), i_fmv_range );
+ h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
@@ -361,154 +503,80 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
}
}
h->mb.b_skip_mc = 0;
+ if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
+ h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
+ {
+ a->b_force_intra = 1;
+ a->b_fast_intra = 0;
+ }
+ else
+ a->b_force_intra = 0;
}
}
+/* Prediction modes allowed for various combinations of neighbors. */
+/* Terminated by a -1. */
+/* In order, no neighbors, left, top, top/left, top/left/topleft */
+static const int8_t i16x16_mode_available[5][5] =
+{
+ {I_PRED_16x16_DC_128, -1, -1, -1, -1},
+ {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
+ {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
+ {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
+ {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
+};
+static const int8_t i8x8chroma_mode_available[5][5] =
+{
+ {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
+ {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
+ {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
+ {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
+ {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
+};
-/*
- * Handle intra mb
- */
-/* Max = 4 */
-static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+static const int8_t i4x4_mode_available[5][10] =
{
- if( i_neighbour & MB_TOPLEFT )
- {
- /* top and left available */
- *mode++ = I_PRED_16x16_V;
- *mode++ = I_PRED_16x16_H;
- *mode++ = I_PRED_16x16_DC;
- *mode++ = I_PRED_16x16_P;
- *pi_count = 4;
- }
- else if( i_neighbour & MB_LEFT )
- {
- /* left available*/
- *mode++ = I_PRED_16x16_DC_LEFT;
- *mode++ = I_PRED_16x16_H;
- *pi_count = 2;
- }
- else if( i_neighbour & MB_TOP )
- {
- /* top available*/
- *mode++ = I_PRED_16x16_DC_TOP;
- *mode++ = I_PRED_16x16_V;
- *pi_count = 2;
- }
- else
- {
- /* none available */
- *mode = I_PRED_16x16_DC_128;
- *pi_count = 1;
- }
-}
+ {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
+ {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
+ {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
+ {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
+};
-/* Max = 4 */
-static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
{
- if( i_neighbour & MB_TOPLEFT )
- {
- /* top and left available */
- *mode++ = I_PRED_CHROMA_V;
- *mode++ = I_PRED_CHROMA_H;
- *mode++ = I_PRED_CHROMA_DC;
- *mode++ = I_PRED_CHROMA_P;
- *pi_count = 4;
- }
- else if( i_neighbour & MB_LEFT )
- {
- /* left available*/
- *mode++ = I_PRED_CHROMA_DC_LEFT;
- *mode++ = I_PRED_CHROMA_H;
- *pi_count = 2;
- }
- else if( i_neighbour & MB_TOP )
- {
- /* top available*/
- *mode++ = I_PRED_CHROMA_DC_TOP;
- *mode++ = I_PRED_CHROMA_V;
- *pi_count = 2;
- }
- else
- {
- /* none available */
- *mode = I_PRED_CHROMA_DC_128;
- *pi_count = 1;
- }
+ int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
+ return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
}
-/* MAX = 9 */
-static void predict_4x4_mode_available( unsigned int i_neighbour,
- int *mode, int *pi_count )
+static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
{
- int b_l = i_neighbour & MB_LEFT;
- int b_t = i_neighbour & MB_TOP;
+ int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
+ return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
+}
- if( b_l && b_t )
- {
- *pi_count = 6;
- *mode++ = I_PRED_4x4_DC;
- *mode++ = I_PRED_4x4_H;
- *mode++ = I_PRED_4x4_V;
- *mode++ = I_PRED_4x4_DDL;
- if( i_neighbour & MB_TOPLEFT )
- {
- *mode++ = I_PRED_4x4_DDR;
- *mode++ = I_PRED_4x4_VR;
- *mode++ = I_PRED_4x4_HD;
- *pi_count += 3;
- }
- *mode++ = I_PRED_4x4_VL;
- *mode++ = I_PRED_4x4_HU;
- }
- else if( b_l )
- {
- *mode++ = I_PRED_4x4_DC_LEFT;
- *mode++ = I_PRED_4x4_H;
- *mode++ = I_PRED_4x4_HU;
- *pi_count = 3;
- }
- else if( b_t )
- {
- *mode++ = I_PRED_4x4_DC_TOP;
- *mode++ = I_PRED_4x4_V;
- *mode++ = I_PRED_4x4_DDL;
- *mode++ = I_PRED_4x4_VL;
- *pi_count = 4;
- }
- else
- {
- *mode++ = I_PRED_4x4_DC_128;
- *pi_count = 1;
- }
+static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
+{
+ int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
+ return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
}
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
- DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
- DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
- DECLARE_ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
- int i;
+ ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
if( do_both_dct || h->mb.b_transform_8x8 )
- {
- h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
- for( i = 0; i < 4; i++ )
- h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
- }
+ h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
if( do_both_dct || !h->mb.b_transform_8x8 )
- {
- h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
- for( i = 0; i < 16; i++ )
- h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
- }
+ h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}
/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
static inline void x264_mb_cache_fenc_satd( x264_t *h )
{
- DECLARE_ALIGNED_16( static uint8_t zero[16] ) = {0};
+ ALIGNED_16( static uint8_t zero[16] ) = {0};
uint8_t *fenc;
int x, y, satd_sum = 0, sa8d_sum = 0;
if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
@@ -537,71 +605,55 @@ static inline void x264_mb_cache_fenc_satd( x264_t *h )
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
- int i;
-
- int i_max;
- int predict_mode[4];
int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
- uint8_t *p_dstc[2], *p_srcc[2];
-
if( a->i_satd_i8x8chroma < COST_MAX )
return;
- /* 8x8 prediction selection for chroma */
- p_dstc[0] = h->mb.pic.p_fdec[1];
- p_dstc[1] = h->mb.pic.p_fdec[2];
- p_srcc[0] = h->mb.pic.p_fenc[1];
- p_srcc[1] = h->mb.pic.p_fenc[2];
+ const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
- predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
- a->i_satd_i8x8chroma = COST_MAX;
- if( i_max == 4 && b_merged_satd )
+ /* 8x8 prediction selection for chroma */
+ if( predict_mode[3] >= 0 && b_merged_satd )
{
int satdu[4], satdv[4];
- h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
- h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
- h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
- h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
- satdu[I_PRED_CHROMA_P] =
- h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
- satdv[I_PRED_CHROMA_P] =
- h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
-
- for( i=0; i<i_max; i++ )
+ h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
+ h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
+ h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
+ satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
+ satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+
+ for( ; *predict_mode >= 0; predict_mode++ )
{
- int i_mode = predict_mode[i];
- int i_satd = satdu[i_mode] + satdv[i_mode]
- + a->i_lambda * bs_size_ue(i_mode);
+ int i_mode = *predict_mode;
+ int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
- a->i_satd_i8x8chroma_dir[i] = i_satd;
+ a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
}
}
else
{
- for( i=0; i<i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
int i_satd;
- int i_mode = predict_mode[i];
+ int i_mode = *predict_mode;
/* we do the prediction */
if( h->mb.b_lossless )
x264_predict_lossless_8x8_chroma( h, i_mode );
else
{
- h->predict_8x8c[i_mode]( p_dstc[0] );
- h->predict_8x8c[i_mode]( p_dstc[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
}
/* we calculate the cost */
- i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
- p_srcc[0], FENC_STRIDE ) +
- h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
- p_srcc[1], FENC_STRIDE ) +
+ i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
+ h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
- a->i_satd_i8x8chroma_dir[i] = i_satd;
+ a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
}
}
@@ -616,16 +668,14 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
uint8_t *p_dst = h->mb.pic.p_fdec[0];
int i, idx;
- int i_max;
- int predict_mode[9];
int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
/*---------------- Try all mode and calculate their score ---------------*/
/* 16x16 prediction selection */
- predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+ const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
- if( b_merged_satd && i_max == 4 )
+ if( b_merged_satd && predict_mode[3] >= 0 )
{
h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
h->predict_16x16[I_PRED_16x16_P]( p_dst );
@@ -639,10 +689,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
}
else
{
- for( i = 0; i < i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
int i_satd;
- int i_mode = predict_mode[i];
+ int i_mode = *predict_mode;
if( h->mb.b_lossless )
x264_predict_lossless_16x16( h, i_mode );
@@ -665,7 +715,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
@@ -685,10 +735,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
int i_best = COST_MAX;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
- predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+ predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
- if( b_merged_satd && i_max == 9 )
+ if( b_merged_satd && predict_mode[8] >= 0 )
{
int satd[9];
h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
@@ -698,23 +748,22 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
}
- i = 3;
+ predict_mode += 3;
}
- else
- i = 0;
- for( ; i<i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
int i_satd;
- int i_mode = predict_mode[i];
+ int i_mode = *predict_mode;
if( h->mb.b_lossless )
x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
else
h->predict_8x8[i_mode]( p_dst_by, edge );
- i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
- + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+ i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
+ if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+ i_satd -= a->i_lambda * 3;
COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
@@ -737,10 +786,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
- h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
- h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
- h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
- h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+ h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+ h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+ h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+ h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
@@ -777,41 +826,39 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
int i_best = COST_MAX;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
- predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+ const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+ M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
- if( b_merged_satd && i_max >= 6 )
+ if( b_merged_satd && predict_mode[5] >= 0 )
{
int satd[9];
h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
satd[i_pred_mode] -= 3 * a->i_lambda;
for( i=2; i>=0; i-- )
- COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
- a->i_predict4x4[idx], i );
- i = 3;
+ COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
+ predict_mode += 3;
}
- else
- i = 0;
- for( ; i<i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
int i_satd;
- int i_mode = predict_mode[i];
+ int i_mode = *predict_mode;
+
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
else
h->predict_4x4[i_mode]( p_dst_by );
- i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
- p_src_by, FENC_STRIDE )
- + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+ i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
+ if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+ i_satd -= a->i_lambda * 3;
COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
}
- i_cost += i_best;
+ i_cost += i_best + 4 * a->i_lambda;
if( i_cost > i_satd_thresh || idx == 15 )
break;
@@ -828,10 +875,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
- h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
- h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
- h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
- h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+ h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+ h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+ h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+ h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
@@ -877,21 +924,20 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
uint8_t *p_dst = h->mb.pic.p_fdec[0];
- int i, j, idx, x, y;
- int i_max, i_mode, i_thresh;
+ int i, idx, x, y;
+ int i_mode, i_thresh;
uint64_t i_satd, i_best;
- int predict_mode[9];
h->mb.i_skip_intra = 0;
if( h->mb.i_type == I_16x16 )
{
int old_pred_mode = a->i_predict16x16;
+ const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
i_best = a->i_satd_i16x16;
- predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
- for( i = 0; i < i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
- int i_mode = predict_mode[i];
+ int i_mode = *predict_mode;
if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
continue;
h->mb.i_intra16x16_pred_mode = i_mode;
@@ -901,18 +947,19 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
}
/* RD selection for chroma prediction */
- predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
- if( i_max > 1 )
+ const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+ if( predict_mode[1] >= 0 )
{
+ int8_t predict_mode_sorted[4];
+ int i_max;
i_thresh = a->i_satd_i8x8chroma * 5/4;
- for( i = j = 0; i < i_max; i++ )
- if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
- predict_mode[i] != a->i_predict8x8chroma )
- {
- predict_mode[j++] = predict_mode[i];
- }
- i_max = j;
+ for( i_max = 0; *predict_mode >= 0; predict_mode++ )
+ {
+ i_mode = *predict_mode;
+ if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
+ predict_mode_sorted[i_max++] = i_mode;
+ }
if( i_max > 0 )
{
@@ -924,7 +971,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
for( i = 0; i < i_max; i++ )
{
- i_mode = predict_mode[i];
+ i_mode = predict_mode_sorted[i];
if( h->mb.b_lossless )
x264_predict_lossless_8x8_chroma( h, i_mode );
else
@@ -952,15 +999,15 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
i_best = COST_MAX64;
- predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+ const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+ M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
- for( i = 0; i < i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
- i_mode = predict_mode[i];
+ i_mode = *predict_mode;
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
else
@@ -971,18 +1018,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
a->i_predict4x4[idx] = i_mode;
i_best = i_satd;
- pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
- pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
- pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
- pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
+ pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
+ pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
+ pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
+ pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
}
}
- *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
- *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
- *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
- *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
+ M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+ M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+ M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+ M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
@@ -990,12 +1037,12 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
}
else if( h->mb.i_type == I_8x8 )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
for( idx = 0; idx < 4; idx++ )
{
uint64_t pels_h = 0;
uint8_t pels_v[7];
- uint16_t i_nnz[2];
+ uint16_t i_nnz[2] = {0}; //shut up gcc
uint8_t *p_dst_by;
int j;
int cbp_luma_new = 0;
@@ -1006,14 +1053,15 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
y = idx>>1;
p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
- predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+ const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
- for( i = 0; i < i_max; i++ )
+ for( ; *predict_mode >= 0; predict_mode++ )
{
- i_mode = predict_mode[i];
+ i_mode = *predict_mode;
if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
continue;
+
if( h->mb.b_lossless )
x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
else
@@ -1027,21 +1075,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
cbp_luma_new = h->mb.i_cbp_luma;
i_best = i_satd;
- pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
+ pels_h = M64( p_dst_by+7*FDEC_STRIDE );
if( !(idx&1) )
for( j=0; j<7; j++ )
pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
- i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
- i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
}
}
a->i_cbp_i8x8_luma = cbp_luma_new;
- *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
+ M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
if( !(idx&1) )
for( j=0; j<7; j++ )
p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
+ M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
+ M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
}
@@ -1049,6 +1097,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
}
#define LOAD_FENC( m, src, xoff, yoff) \
+ (m)->p_cost_mv = a->p_cost_mv; \
(m)->i_stride[0] = h->mb.pic.i_stride[0]; \
(m)->i_stride[1] = h->mb.pic.i_stride[1]; \
(m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
@@ -1056,28 +1105,33 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
(m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
- (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
+ (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
(m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
- (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
+ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
+ (m)->weight = weight_none; \
+ (m)->i_ref = ref;
+
+#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
+ (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
+ (m)->weight = h->sh.weight[i_ref];
#define REF_COST(list, ref) \
- (a->p_cost_ref##list[ref])
+ (a->p_cost_ref[list][ref])
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[8][2] );
+ ALIGNED_4( int16_t mvc[8][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
/* 16x16 Search on all ref frame */
m.i_pixel = PIXEL_16x16;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
a->l0.me16x16.cost = INT_MAX;
@@ -1086,13 +1140,21 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
const int i_ref_cost = REF_COST( 0, i_ref );
i_halfpel_thresh -= i_ref_cost;
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
/* search with ref */
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
+
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
- x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
+
+ if( h->mb.ref_blind_dupe == i_ref )
+ {
+ CP32( m.mv, a->l0.mvc[0][0] );
+ x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
+ }
+ else
+ x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
/* early termination
* SSD threshold would probably be better than SATD */
@@ -1105,7 +1167,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
h->mb.i_type = P_SKIP;
x264_analyse_update_cache( h, a );
- assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+ assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
return;
}
@@ -1116,22 +1178,24 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
/* save mv for predicting neighbors */
- *(uint32_t*)a->l0.mvc[i_ref][0] =
- *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ CP32( a->l0.mvc[i_ref][0], m.mv );
+ CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
}
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
- assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+ assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
h->mb.i_type = P_L0;
if( a->i_mbrd )
{
x264_mb_cache_fenc_satd( h );
- if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+ if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
{
h->mb.i_partition = D_16x16;
x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
+ h->mb.i_type = P_SKIP;
}
}
}
@@ -1148,22 +1212,29 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
h->mb.i_partition = D_8x8;
+ #define CHECK_NEIGHBOUR(i)\
+ {\
+ int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
+ if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
+ i_maxref = ref;\
+ }
+
/* early termination: if 16x16 chose ref 0, then evalute no refs older
* than those used by the neighbors */
- if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
+ if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
h->mb.i_mb_type_top && h->mb.i_mb_type_left )
{
i_maxref = 0;
- i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
- i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
- i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
- i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
- i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
- i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
+ CHECK_NEIGHBOUR( -8 - 1 );
+ CHECK_NEIGHBOUR( -8 + 0 );
+ CHECK_NEIGHBOUR( -8 + 2 );
+ CHECK_NEIGHBOUR( -8 + 4 );
+ CHECK_NEIGHBOUR( 0 - 1 );
+ CHECK_NEIGHBOUR( 2*8 - 1 );
}
for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
- *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
+ CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
for( i = 0; i < 4; i++ )
{
@@ -1172,34 +1243,45 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
const int y8 = i/2;
m.i_pixel = PIXEL_8x8;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
l0m->cost = INT_MAX;
- for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
+ for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
{
const int i_ref_cost = REF_COST( 0, i_ref );
- i_halfpel_thresh -= i_ref_cost;
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
- x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
+ if( h->mb.ref_blind_dupe == i_ref )
+ {
+ CP32( m.mv, a->l0.mvc[0][i+1] );
+ x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
+ }
+ else
+ x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
m.cost += i_ref_cost;
i_halfpel_thresh += i_ref_cost;
- *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+ CP32( a->l0.mvc[i_ref][i+1], m.mv );
if( m.cost < l0m->cost )
h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
+ if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
+ i_ref = h->mb.ref_blind_dupe;
+ else
+ i_ref++;
}
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
- /* mb type cost */
- l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+ /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
+ are effectively zero. */
+ if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+ l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
}
a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
@@ -1214,9 +1296,11 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
- const int i_ref = a->l0.me16x16.i_ref;
+ /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
+ * reference frame flags. Thus, if we're not doing mixedrefs, just
+ * don't bother analysing the dupes. */
+ const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
- uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
uint8_t **p_fenc = h->mb.pic.p_fenc;
int i_mvc;
int16_t (*mvc)[2] = a->l0.mvc[i_ref];
@@ -1226,7 +1310,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
h->mb.i_partition = D_8x8;
i_mvc = 1;
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
+ CP32( mvc[0], a->l0.me16x16.mv );
for( i = 0; i < 4; i++ )
{
@@ -1235,23 +1319,24 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
const int y8 = i/2;
m->i_pixel = PIXEL_8x8;
- m->p_cost_mv = a->p_cost_mv;
m->i_ref_cost = i_ref_cost;
- m->i_ref = i_ref;
LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
- LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
+ LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
x264_me_search( h, m, mvc, i_mvc );
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
- *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
+ CP32( mvc[i_mvc], m->mv );
i_mvc++;
/* mb type cost */
m->cost += i_ref_cost;
- m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+ if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+ m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
}
a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
@@ -1268,7 +1353,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
uint8_t **p_fenc = h->mb.pic.p_fenc;
- DECLARE_ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_4( int16_t mvc[3][2] );
int i, j;
/* XXX Needed for x264_mb_predict_mv */
@@ -1277,11 +1362,12 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
for( i = 0; i < 2; i++ )
{
x264_me_t *l0m = &a->l0.me16x8[i];
- const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
+ const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
+ const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
+ const int ref8[2] = { minref, maxref };
const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
m.i_pixel = PIXEL_16x8;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, p_fenc, 0, 8*i );
l0m->cost = INT_MAX;
@@ -1290,17 +1376,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
/* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
- *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
- *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+ CP32( mvc[0], a->l0.mvc[i_ref][0] );
+ CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
+ CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
+
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
- x264_me_search( h, &m, mvc, 3 );
+ /* We can only take this shortcut if the first search was performed on ref0. */
+ if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
+ {
+ /* We can just leave the MV from the previous ref search. */
+ x264_me_refine_qpel_refdupe( h, &m, NULL );
+ }
+ else
+ x264_me_search( h, &m, mvc, 3 );
m.cost += i_ref_cost;
@@ -1318,7 +1412,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
uint8_t **p_fenc = h->mb.pic.p_fenc;
- DECLARE_ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_4( int16_t mvc[3][2] );
int i, j;
/* XXX Needed for x264_mb_predict_mv */
@@ -1327,11 +1421,12 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
for( i = 0; i < 2; i++ )
{
x264_me_t *l0m = &a->l0.me8x16[i];
- const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
+ const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
+ const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
+ const int ref8[2] = { minref, maxref };
const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
m.i_pixel = PIXEL_8x16;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, p_fenc, 8*i, 0 );
l0m->cost = INT_MAX;
@@ -1340,16 +1435,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
- *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
- *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+ CP32( mvc[0], a->l0.mvc[i_ref][0] );
+ CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
+ CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
+
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
- x264_me_search( h, &m, mvc, 3 );
+ /* We can only take this shortcut if the first search was performed on ref0. */
+ if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
+ {
+ /* We can just leave the MV from the previous ref search. */
+ x264_me_refine_qpel_refdupe( h, &m, NULL );
+ }
+ else
+ x264_me_search( h, &m, mvc, 3 );
m.cost += i_ref_cost;
@@ -1365,32 +1468,43 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
- DECLARE_ALIGNED_8( uint8_t pix1[16*8] );
+ ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
uint8_t *pix2 = pix1+8;
const int i_stride = h->mb.pic.i_stride[1];
const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
+ const int i_ref = a->l0.me8x8[i8x8].i_ref;
+ const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ x264_weight_t *weight = h->sh.weight[i_ref];
#define CHROMA4x4MC( width, height, me, x, y ) \
- h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
- h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
+ h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ if( weight[1].weightfn ) \
+ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+ h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ if( weight[2].weightfn ) \
+ weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
+
if( pixel == PIXEL_4x4 )
{
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
+ x264_me_t *m = a->l0.me4x4[i8x8];
+ CHROMA4x4MC( 2,2, m[0], 0,0 );
+ CHROMA4x4MC( 2,2, m[1], 2,0 );
+ CHROMA4x4MC( 2,2, m[2], 0,2 );
+ CHROMA4x4MC( 2,2, m[3], 2,2 );
}
else if( pixel == PIXEL_8x4 )
{
- CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
- CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
+ x264_me_t *m = a->l0.me8x4[i8x8];
+ CHROMA4x4MC( 4,2, m[0], 0,0 );
+ CHROMA4x4MC( 4,2, m[1], 0,2 );
}
else
{
- CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
- CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
+ x264_me_t *m = a->l0.me4x8[i8x8];
+ CHROMA4x4MC( 2,4, m[0], 0,0 );
+ CHROMA4x4MC( 2,4, m[1], 2,0 );
}
return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
@@ -1417,10 +1531,10 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
m->i_pixel = PIXEL_4x4;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
@@ -1457,10 +1571,10 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
m->i_pixel = PIXEL_8x4;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
@@ -1494,10 +1608,10 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
m->i_pixel = PIXEL_4x8;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
@@ -1534,33 +1648,31 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
}
}
-#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
-{ \
- h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-}
-
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
- DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
- DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
uint8_t *src0, *src1;
int stride0 = 16, stride1 = 16;
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[9][2] );
+ ALIGNED_4( int16_t mvc[9][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
/* 16x16 Search on all ref frame */
m.i_pixel = PIXEL_16x16;
- m.p_cost_mv = a->p_cost_mv;
+ m.weight = weight_none;
+
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
/* ME for List 0 */
a->l0.me16x16.cost = INT_MAX;
for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
{
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ m.i_ref_cost = i_ref_cost;
/* search with ref */
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
@@ -1568,7 +1680,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
/* add ref cost */
- m.cost += REF_COST( 0, i_ref );
+ m.cost += i_ref_cost;
if( m.cost < a->l0.me16x16.cost )
{
@@ -1577,10 +1689,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
}
/* save mv for predicting neighbors */
- *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
}
- /* subtract ref cost, so we don't have to add it for the other MB types */
- a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
+ a->l0.me16x16.i_ref = a->l0.i_ref;
/* ME for list 1 */
i_halfpel_thresh = INT_MAX;
@@ -1588,6 +1699,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
a->l1.me16x16.cost = INT_MAX;
for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
{
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ m.i_ref_cost = i_ref_cost;
/* search with ref */
LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
@@ -1595,7 +1708,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
/* add ref cost */
- m.cost += REF_COST( 1, i_ref );
+ m.cost += i_ref_cost;
if( m.cost < a->l1.me16x16.cost )
{
@@ -1604,22 +1717,17 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
}
/* save mv for predicting neighbors */
- *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
}
- /* subtract ref cost, so we don't have to add it for the other MB types */
- a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
-
- /* Set global ref, needed for other modes? */
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+ a->l1.me16x16.i_ref = a->l1.i_ref;
/* get cost of BI mode */
src0 = h->mc.get_ref( pix0, &stride0,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+ h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+ a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
src1 = h->mc.get_ref( pix1, &stride1,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
+ h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+ a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
@@ -1665,6 +1773,16 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
}
}
+static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
+{
+ const int x = 2*(idx&1);
+ const int y = 2*(idx>>1);
+ x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
+ x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
+ x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
+ x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
+}
+
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
if( x264_mb_partition_listX_table[0][part] ) \
{ \
@@ -1725,7 +1843,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_8( uint8_t pix[2][8*8] );
+ ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
int i, l;
/* XXX Needed for x264_mb_predict_mv */
@@ -1745,24 +1863,26 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
for( l = 0; l < 2; l++ )
{
x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+ const int i_ref_cost = REF_COST( l, lX->i_ref );
x264_me_t *m = &lX->me8x8[i];
m->i_pixel = PIXEL_8x8;
- m->p_cost_mv = a->p_cost_mv;
+ m->i_ref_cost = i_ref_cost;
LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
+ x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
x264_me_search( h, m, &lX->me16x16.mv, 1 );
+ m->cost += i_ref_cost;
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* BI mode */
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 8 );
- i_part_cost_bi += m->cost_mv;
- /* FIXME: ref cost */
+ m->mv[0], m->mv[1], 8, 8, weight_none );
+ i_part_cost_bi += m->cost_mv + i_ref_cost;
}
h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
@@ -1790,8 +1910,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
- DECLARE_ALIGNED_4( int16_t mvc[2][2] );
+ ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
+ ALIGNED_4( int16_t mvc[2][2] );
int i, l;
h->mb.i_partition = D_16x8;
@@ -1808,25 +1928,27 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
for( l = 0; l < 2; l++ )
{
x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+ const int i_ref_cost = REF_COST( l, lX->i_ref );
x264_me_t *m = &lX->me16x8[i];
m->i_pixel = PIXEL_16x8;
- m->p_cost_mv = a->p_cost_mv;
+ m->i_ref_cost = i_ref_cost;
LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
- *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
- *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
+ CP32( mvc[0], lX->me8x8[2*i].mv );
+ CP32( mvc[1], lX->me8x8[2*i+1].mv );
- x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
+ x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
+ x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
x264_me_search( h, m, mvc, 2 );
+ m->cost += i_ref_cost;
/* BI mode */
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 16, 8 );
- /* FIXME: ref cost */
- i_part_cost_bi += m->cost_mv;
+ m->mv[0], m->mv[1], 16, 8, weight_none );
+ i_part_cost_bi += m->cost_mv + i_ref_cost;
}
h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
@@ -1860,8 +1982,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
- DECLARE_ALIGNED_4( int16_t mvc[2][2] );
+ ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
+ ALIGNED_4( int16_t mvc[2][2] );
int i, l;
h->mb.i_partition = D_8x16;
@@ -1877,25 +1999,27 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
for( l = 0; l < 2; l++ )
{
x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+ const int i_ref_cost = REF_COST( l, lX->i_ref );
x264_me_t *m = &lX->me8x16[i];
m->i_pixel = PIXEL_8x16;
- m->p_cost_mv = a->p_cost_mv;
+ m->i_ref_cost = i_ref_cost;
LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
- *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
- *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
+ CP32( mvc[0], lX->me8x8[i].mv );
+ CP32( mvc[1], lX->me8x8[i+2].mv );
+ x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
x264_me_search( h, m, mvc, 2 );
+ m->cost += i_ref_cost;
/* BI mode */
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 16 );
- /* FIXME: ref cost */
- i_part_cost_bi += m->cost_mv;
+ m->mv[0], m->mv[1], 8, 16, weight_none );
+ i_part_cost_bi += m->cost_mv + i_ref_cost;
}
h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
@@ -1936,7 +2060,6 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
x264_analyse_update_cache( h, a );
a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
}
- a->l0.me16x16.cost = a->l0.i_rd16x16;
if( a->l0.i_cost16x8 <= thresh )
{
@@ -1985,8 +2108,11 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
COPY2_IF_LT( bcost, cost, btype, subtype );
}
- h->mb.i_sub_partition[i] = btype;
- x264_mb_cache_mv_p8x8( h, a, i );
+ if( h->mb.i_sub_partition[i] != btype )
+ {
+ h->mb.i_sub_partition[i] = btype;
+ x264_mb_cache_mv_p8x8( h, a, i );
+ }
}
}
else
@@ -2077,25 +2203,25 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
switch( h->mb.i_partition )
{
- case D_16x16:
- if( h->mb.i_type == B_BI_BI )
- x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
- break;
- case D_16x8:
- for( i=0; i<2; i++ )
- if( a->i_mb_partition16x8[i] == D_BI_8x8 )
- x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
- break;
- case D_8x16:
- for( i=0; i<2; i++ )
- if( a->i_mb_partition8x16[i] == D_BI_8x8 )
- x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
- break;
- case D_8x8:
- for( i=0; i<4; i++ )
- if( h->mb.i_sub_partition[i] == D_BI_8x8 )
- x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
- break;
+ case D_16x16:
+ if( h->mb.i_type == B_BI_BI )
+ x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
+ break;
+ case D_16x8:
+ for( i=0; i<2; i++ )
+ if( a->i_mb_partition16x8[i] == D_BI_8x8 )
+ x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
+ break;
+ case D_8x16:
+ for( i=0; i<2; i++ )
+ if( a->i_mb_partition8x16[i] == D_BI_8x8 )
+ x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
+ break;
+ case D_8x8:
+ for( i=0; i<4; i++ )
+ if( h->mb.i_sub_partition[i] == D_BI_8x8 )
+ x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
+ break;
}
}
@@ -2123,7 +2249,7 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *
{
int i_rd8;
x264_analyse_update_cache( h, a );
- h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+ h->mb.b_transform_8x8 ^= 1;
/* FIXME only luma is needed, but the score for comparison already includes chroma */
i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
@@ -2134,10 +2260,87 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *
*i_rd = i_rd8;
}
else
- h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+ h->mb.b_transform_8x8 ^= 1;
}
}
+/* Rate-distortion optimal QP selection.
+ * FIXME: More than half of the benefit of this function seems to be
+ * in the way it improves the coding of chroma DC (by decimating or
+ * finding a better way to code a single DC coefficient.)
+ * There must be a more efficient way to get that portion of the benefit
+ * without doing full QP-RD, but RD-decimation doesn't seem to do the
+ * trick. */
+static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
+{
+ int bcost, cost, direction, failures, prevcost, origcost;
+ int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
+ int last_qp_tried = 0;
+ origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
+
+ /* If CBP is already zero, don't raise the quantizer any higher. */
+ for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
+ {
+ /* Without psy-RD, require monotonicity when moving quant away from previous
+ * macroblock's quant; allow 1 failure when moving quant towards previous quant.
+ * With psy-RD, allow 1 failure when moving quant away from previous quant,
+ * allow 2 failures when moving quant towards previous quant.
+ * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
+ int threshold = (!!h->mb.i_psy_rd);
+ /* Raise the threshold for failures if we're moving towards the last QP. */
+ if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
+ ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
+ threshold++;
+ h->mb.i_qp = orig_qp;
+ failures = 0;
+ prevcost = origcost;
+ h->mb.i_qp += direction;
+ while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
+ {
+ if( h->mb.i_last_qp == h->mb.i_qp )
+ last_qp_tried = 1;
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+ cost = x264_rd_cost_mb( h, a->i_lambda2 );
+ COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+
+ /* We can't assume that the costs are monotonic over QPs.
+ * Tie case-as-failure seems to give better results. */
+ if( cost < prevcost )
+ failures = 0;
+ else
+ failures++;
+ prevcost = cost;
+
+ if( failures > threshold )
+ break;
+ if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
+ break;
+ h->mb.i_qp += direction;
+ }
+ }
+
+ /* Always try the last block's QP. */
+ if( !last_qp_tried )
+ {
+ h->mb.i_qp = h->mb.i_last_qp;
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+ cost = x264_rd_cost_mb( h, a->i_lambda2 );
+ COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+ }
+
+ h->mb.i_qp = bqp;
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+
+ /* Check transform again; decision from before may no longer be optimal. */
+ if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
+ x264_mb_transform_8x8_allowed( h ) )
+ {
+ h->mb.b_transform_8x8 ^= 1;
+ cost = x264_rd_cost_mb( h, a->i_lambda2 );
+ if( cost > bcost )
+ h->mb.b_transform_8x8 ^= 1;
+ }
+}
/*****************************************************************************
* x264_macroblock_analyse:
@@ -2150,13 +2353,20 @@ void x264_macroblock_analyse( x264_t *h )
h->mb.i_qp = x264_ratecontrol_qp( h );
if( h->param.rc.i_aq_mode )
+ {
x264_adaptive_quant( h );
+ /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
+ * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
+ if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
+ h->mb.i_qp = h->mb.i_last_qp;
+ }
x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
/*--------------------------- Do the analysis ---------------------------*/
if( h->sh.i_type == SLICE_TYPE_I )
{
+intra_analysis:
if( analysis.i_mbrd )
x264_mb_cache_fenc_satd( h );
x264_mb_analyse_intra( h, &analysis, COST_MAX );
@@ -2179,20 +2389,31 @@ void x264_macroblock_analyse( x264_t *h )
h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
- /* Fast P_SKIP detection */
analysis.b_try_pskip = 0;
- if( h->param.analyse.b_fast_pskip )
+ if( analysis.b_force_intra )
{
- if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
- // FIXME don't need to check this if the reference frame is done
- {}
- else if( h->param.analyse.i_subpel_refine >= 3 )
- analysis.b_try_pskip = 1;
- else if( h->mb.i_mb_type_left == P_SKIP ||
- h->mb.i_mb_type_top == P_SKIP ||
- h->mb.i_mb_type_topleft == P_SKIP ||
- h->mb.i_mb_type_topright == P_SKIP )
- b_skip = x264_macroblock_probe_pskip( h );
+ if( !h->param.analyse.b_psy )
+ {
+ x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
+ goto intra_analysis;
+ }
+ }
+ else
+ {
+ /* Fast P_SKIP detection */
+ if( h->param.analyse.b_fast_pskip )
+ {
+ if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+ // FIXME don't need to check this if the reference frame is done
+ {}
+ else if( h->param.analyse.i_subpel_refine >= 3 )
+ analysis.b_try_pskip = 1;
+ else if( h->mb.i_mb_type_left == P_SKIP ||
+ h->mb.i_mb_type_top == P_SKIP ||
+ h->mb.i_mb_type_topleft == P_SKIP ||
+ h->mb.i_mb_type_topright == P_SKIP )
+ b_skip = x264_macroblock_probe_pskip( h );
+ }
}
h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
@@ -2201,7 +2422,7 @@ void x264_macroblock_analyse( x264_t *h )
{
h->mb.i_type = P_SKIP;
h->mb.i_partition = D_16x16;
- assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+ assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
}
else
{
@@ -2281,7 +2502,7 @@ void x264_macroblock_analyse( x264_t *h )
/* refine qpel */
//FIXME mb_type costs?
- if( analysis.i_mbrd )
+ if( analysis.i_mbrd || !h->mb.i_subpel_refine )
{
/* refine later */
}
@@ -2365,7 +2586,7 @@ void x264_macroblock_analyse( x264_t *h )
x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
i_type = P_L0;
i_partition = D_16x16;
- i_cost = analysis.l0.me16x16.cost;
+ i_cost = analysis.l0.i_rd16x16;
COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
@@ -2383,6 +2604,19 @@ void x264_macroblock_analyse( x264_t *h )
h->mb.i_type = i_type;
+ if( analysis.b_force_intra && !IS_INTRA(i_type) )
+ {
+ /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
+ * it was an inter block. */
+ x264_analyse_update_cache( h, &analysis );
+ x264_macroblock_encode( h );
+ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+ x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
+ goto intra_analysis;
+ }
+
if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
{
if( IS_INTRA( h->mb.i_type ) )
@@ -2392,6 +2626,7 @@ void x264_macroblock_analyse( x264_t *h )
else if( i_partition == D_16x16 )
{
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
+ analysis.l0.me16x16.cost = i_cost;
x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
}
else if( i_partition == D_16x8 )
@@ -2424,20 +2659,20 @@ void x264_macroblock_analyse( x264_t *h )
}
else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
{
- x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
- x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
}
else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
{
- x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
- x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
}
else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
{
- x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
- x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
- x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
- x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
+ x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
}
}
}
@@ -2500,7 +2735,7 @@ void x264_macroblock_analyse( x264_t *h )
const unsigned int flags = h->param.analyse.inter;
int i_type;
int i_partition;
- int i_satd_inter = 0; // shut up uninitialized warning
+ int i_satd_inter;
h->mb.b_skip_mc = 0;
x264_mb_analyse_load_costs( h, &analysis );
@@ -2561,7 +2796,7 @@ void x264_macroblock_analyse( x264_t *h )
}
}
- if( analysis.i_mbrd )
+ if( analysis.i_mbrd || !h->mb.i_subpel_refine )
{
/* refine later */
}
@@ -2644,9 +2879,10 @@ void x264_macroblock_analyse( x264_t *h )
}
}
+ i_satd_inter = i_cost;
+
if( analysis.i_mbrd )
{
- i_satd_inter = i_cost;
x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
i_type = B_SKIP;
i_cost = i_bskip_cost;
@@ -2692,9 +2928,15 @@ void x264_macroblock_analyse( x264_t *h )
if( i_partition == D_16x16 )
{
if( i_type == B_L0_L0 )
+ {
+ analysis.l0.me16x16.cost = i_cost;
x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
+ }
else if( i_type == B_L1_L1 )
+ {
+ analysis.l1.me16x16.cost = i_cost;
x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
+ }
else if( i_type == B_BI_BI )
x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
}
@@ -2742,9 +2984,25 @@ void x264_macroblock_analyse( x264_t *h )
x264_analyse_update_cache( h, &analysis );
+ /* In rare cases we can end up qpel-RDing our way back to a larger partition size
+ * without realizing it. Check for this and account for it if necessary. */
+ if( analysis.i_mbrd >= 2 )
+ {
+ /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
+ static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
+ int list = check_mv_lists[h->mb.i_type] - 1;
+ if( list >= 0 && h->mb.i_partition != D_16x16 &&
+ M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
+ h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
+ h->mb.i_partition = D_16x16;
+ }
+
if( !analysis.i_mbrd )
x264_mb_analyse_transform( h );
+ if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
+ x264_mb_analyse_qp_rd( h, &analysis );
+
h->mb.b_trellis = h->param.analyse.i_trellis;
h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
@@ -2885,7 +3143,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
}
#ifndef NDEBUG
- if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
+ if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
{
int l;
for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
@@ -2894,7 +3152,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
int ref = h->mb.cache.ref[l][x264_scan8[0]];
if( ref < 0 )
continue;
- completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
+ completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
{
x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
diff --git a/encoder/analyse.h b/encoder/analyse.h
index b8c828f..7c2c22c 100644
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -24,7 +24,21 @@
#ifndef X264_ANALYSE_H
#define X264_ANALYSE_H
+int x264_analyse_init_costs( x264_t *h, int qp );
+void x264_analyse_free_costs( x264_t *h );
+void x264_analyse_weight_frame( x264_t *h, int end );
void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
+void x264_slicetype_analyse( x264_t *h, int keyframe );
+
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
+
+int x264_lookahead_init( x264_t *h, int i_slicetype_length );
+int x264_lookahead_is_empty( x264_t *h );
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
+void x264_lookahead_get_frames( x264_t *h );
+void x264_lookahead_delete( x264_t *h );
+
#endif
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 97defa0..271f527 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -36,11 +36,13 @@ static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_
{
x264_cabac_encode_decision_noup( cb, ctx0, 0 );
}
+#if !RDO_SKIP_BS
else if( i_mb_type == I_PCM )
{
x264_cabac_encode_decision_noup( cb, ctx0, 1 );
x264_cabac_encode_flush( h, cb );
}
+#endif
else
{
int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
@@ -86,24 +88,9 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
/* prefix: 14, suffix: 17 */
if( i_mb_type == P_L0 )
{
- if( h->mb.i_partition == D_16x16 )
- {
- x264_cabac_encode_decision_noup( cb, 14, 0 );
- x264_cabac_encode_decision_noup( cb, 15, 0 );
- x264_cabac_encode_decision_noup( cb, 16, 0 );
- }
- else if( h->mb.i_partition == D_16x8 )
- {
- x264_cabac_encode_decision_noup( cb, 14, 0 );
- x264_cabac_encode_decision_noup( cb, 15, 1 );
- x264_cabac_encode_decision_noup( cb, 17, 1 );
- }
- else if( h->mb.i_partition == D_8x16 )
- {
- x264_cabac_encode_decision_noup( cb, 14, 0 );
- x264_cabac_encode_decision_noup( cb, 15, 1 );
- x264_cabac_encode_decision_noup( cb, 17, 0 );
- }
+ x264_cabac_encode_decision_noup( cb, 14, 0 );
+ x264_cabac_encode_decision_noup( cb, 15, h->mb.i_partition != D_16x16 );
+ x264_cabac_encode_decision_noup( cb, 17-(h->mb.i_partition == D_16x16), h->mb.i_partition == D_16x8 );
}
else if( i_mb_type == P_8x8 )
{
@@ -129,10 +116,14 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
ctx++;
if( i_mb_type == B_DIRECT )
+ {
x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
- else if( i_mb_type == B_8x8 )
+ return;
+ }
+ x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
+
+ if( i_mb_type == B_8x8 )
{
- x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
x264_cabac_encode_decision_noup( cb, 27+3, 1 );
x264_cabac_encode_decision_noup( cb, 27+4, 1 );
x264_cabac_encode_decision( cb, 27+5, 1 );
@@ -142,7 +133,6 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
else if( IS_INTRA( i_mb_type ) )
{
/* prefix */
- x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
x264_cabac_encode_decision_noup( cb, 27+3, 1 );
x264_cabac_encode_decision_noup( cb, 27+4, 1 );
x264_cabac_encode_decision( cb, 27+5, 1 );
@@ -154,39 +144,32 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
}
else
{
- static const int i_mb_len[9*3] =
- {
- 6, 6, 3, /* L0 L0 */
- 6, 6, 0, /* L0 L1 */
- 7, 7, 0, /* L0 BI */
- 6, 6, 0, /* L1 L0 */
- 6, 6, 3, /* L1 L1 */
- 7, 7, 0, /* L1 BI */
- 7, 7, 0, /* BI L0 */
- 7, 7, 0, /* BI L1 */
- 7, 7, 6, /* BI BI */
- };
- static const int i_mb_bits[9*3][7] =
+ static const uint8_t i_mb_bits[9*3] =
{
- { 1,1,0,0,0,1 }, { 1,1,0,0,1,0, }, { 1,0,0 }, /* L0 L0 */
- { 1,1,0,1,0,1 }, { 1,1,0,1,1,0 }, {0}, /* L0 L1 */
- { 1,1,1,0,0,0,0 }, { 1,1,1,0,0,0,1 }, {0}, /* L0 BI */
- { 1,1,0,1,1,1 }, { 1,1,1,1,1,0 }, {0}, /* L1 L0 */
- { 1,1,0,0,1,1 }, { 1,1,0,1,0,0 }, { 1,0,1 }, /* L1 L1 */
- { 1,1,1,0,0,1,0 }, { 1,1,1,0,0,1,1 }, {0}, /* L1 BI */
- { 1,1,1,0,1,0,0 }, { 1,1,1,0,1,0,1 }, {0}, /* BI L0 */
- { 1,1,1,0,1,1,0 }, { 1,1,1,0,1,1,1 }, {0}, /* BI L1 */
- { 1,1,1,1,0,0,0 }, { 1,1,1,1,0,0,1 }, { 1,1,0,0,0,0 }, /* BI BI */
+ 0x31, 0x29, 0x4, /* L0 L0 */
+ 0x35, 0x2d, 0, /* L0 L1 */
+ 0x43, 0x63, 0, /* L0 BI */
+ 0x3d, 0x2f, 0, /* L1 L0 */
+ 0x39, 0x25, 0x6, /* L1 L1 */
+ 0x53, 0x73, 0, /* L1 BI */
+ 0x4b, 0x6b, 0, /* BI L0 */
+ 0x5b, 0x7b, 0, /* BI L1 */
+ 0x47, 0x67, 0x21 /* BI BI */
};
const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
- int i;
+ int bits = i_mb_bits[idx];
- x264_cabac_encode_decision_noup( cb, 27+ctx, i_mb_bits[idx][0] );
- x264_cabac_encode_decision_noup( cb, 27+3, i_mb_bits[idx][1] );
- x264_cabac_encode_decision( cb, 27+5-i_mb_bits[idx][1], i_mb_bits[idx][2] );
- for( i = 3; i < i_mb_len[idx]; i++ )
- x264_cabac_encode_decision( cb, 27+5, i_mb_bits[idx][i] );
+ x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
+ x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
+ if( bits != 1 )
+ {
+ x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
+ x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
+ x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
+ if( bits != 1 )
+ x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
+ }
}
}
}
@@ -231,10 +214,10 @@ static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb )
int cbp = h->mb.i_cbp_luma;
int cbp_l = h->mb.cache.i_cbp_left;
int cbp_t = h->mb.cache.i_cbp_top;
- x264_cabac_encode_decision( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (h->mb.i_cbp_luma >> 0) & 1 );
- x264_cabac_encode_decision( cb, 76 - ((cbp >> 0) & 1) - ((cbp_t >> 2) & 2), (h->mb.i_cbp_luma >> 1) & 1 );
- x264_cabac_encode_decision( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp << 1) & 2), (h->mb.i_cbp_luma >> 2) & 1 );
- x264_cabac_encode_decision_noup( cb, 76 - ((cbp >> 2) & 1) - ((cbp >> 0) & 2), (h->mb.i_cbp_luma >> 3) & 1 );
+ x264_cabac_encode_decision ( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (cbp >> 0) & 1 );
+ x264_cabac_encode_decision ( cb, 76 - ((cbp >> 0) & 1) - ((cbp_t >> 2) & 2), (cbp >> 1) & 1 );
+ x264_cabac_encode_decision ( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp << 1) & 2), (cbp >> 2) & 1 );
+ x264_cabac_encode_decision_noup( cb, 76 - ((cbp >> 2) & 1) - ((cbp >> 0) & 2), (cbp >> 3) & 1 );
}
static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
@@ -260,7 +243,6 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
{
- int i_mbn_xy = h->mb.i_mb_prev_xy;
int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
int ctx;
@@ -273,9 +255,9 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
i_dqp = 0;
}
- /* No need to test for PCM / SKIP */
- ctx = h->mb.i_last_dqp &&
- ( h->mb.type[i_mbn_xy] == I_16x16 || (h->mb.cbp[i_mbn_xy]&0x3f) );
+ /* Since, per the above, empty-CBP I16x16 blocks never have delta quants,
+ * we don't have to check for them. */
+ ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy];
if( i_dqp != 0 )
{
@@ -283,11 +265,11 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
/* dqp is interpreted modulo 52 */
if( val >= 51 && val != 52 )
val = 103 - val;
- while( val-- )
+ do
{
x264_cabac_encode_decision( cb, 60 + ctx, 1 );
ctx = 2+(ctx>>1);
- }
+ } while( --val );
}
x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 );
}
@@ -305,61 +287,38 @@ void x264_cabac_mb_skip( x264_t *h, int b_skip )
static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub )
{
if( i_sub == D_L0_8x8 )
- x264_cabac_encode_decision( cb, 21, 1 );
- else if( i_sub == D_L0_8x4 )
- {
- x264_cabac_encode_decision( cb, 21, 0 );
- x264_cabac_encode_decision( cb, 22, 0 );
- }
- else if( i_sub == D_L0_4x8 )
{
- x264_cabac_encode_decision( cb, 21, 0 );
- x264_cabac_encode_decision( cb, 22, 1 );
- x264_cabac_encode_decision( cb, 23, 1 );
+ x264_cabac_encode_decision( cb, 21, 1 );
+ return;
}
- else if( i_sub == D_L0_4x4 )
+ x264_cabac_encode_decision( cb, 21, 0 );
+ if( i_sub == D_L0_8x4 )
+ x264_cabac_encode_decision( cb, 22, 0 );
+ else
{
- x264_cabac_encode_decision( cb, 21, 0 );
x264_cabac_encode_decision( cb, 22, 1 );
- x264_cabac_encode_decision( cb, 23, 0 );
+ x264_cabac_encode_decision( cb, 23, i_sub == D_L0_4x8 );
}
}
-static NOINLINE void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
+static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
{
- static const uint8_t part_bits[12][7] = {
- {6,1,1,1,0,1,1}, // D_L0_4x4
- {5,1,1,0,0,1}, // D_L0_8x4
- {5,1,1,0,1,0}, // D_L0_4x8
- {3,1,0,0}, // D_L0_8x8
- {5,1,1,1,1,0}, // D_L1_4x4
- {5,1,1,0,1,1}, // D_L1_8x4
- {6,1,1,1,0,0,0}, // D_L1_4x8
- {3,1,0,1}, // D_L1_8x8
- {5,1,1,1,1,1}, // D_BI_4x4
- {6,1,1,1,0,0,1}, // D_BI_8x4
- {6,1,1,1,0,1,0}, // D_BI_4x8
- {5,1,1,0,0,0}, // D_BI_8x8
- };
- int len;
if( i_sub == D_DIRECT_8x8 )
{
x264_cabac_encode_decision( cb, 36, 0 );
return;
}
- len = part_bits[i_sub][0];
- x264_cabac_encode_decision( cb, 36, part_bits[i_sub][1] );
- x264_cabac_encode_decision( cb, 37, part_bits[i_sub][2] );
- if( len == 3 )
- x264_cabac_encode_decision( cb, 39, part_bits[i_sub][3] );
- else
+ x264_cabac_encode_decision( cb, 36, 1 );
+ if( i_sub == D_BI_8x8 )
{
- x264_cabac_encode_decision( cb, 38, part_bits[i_sub][3] );
- x264_cabac_encode_decision( cb, 39, part_bits[i_sub][4] );
- x264_cabac_encode_decision( cb, 39, part_bits[i_sub][5] );
- if( len == 6 )
- x264_cabac_encode_decision( cb, 39, part_bits[i_sub][6] );
+ x264_cabac_encode_decision( cb, 37, 1 );
+ x264_cabac_encode_decision( cb, 38, 0 );
+ x264_cabac_encode_decision( cb, 39, 0 );
+ x264_cabac_encode_decision( cb, 39, 0 );
+ return;
}
+ x264_cabac_encode_decision( cb, 37, 0 );
+ x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
}
static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
@@ -376,9 +335,9 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
int i_ref = h->mb.cache.ref[i_list][i8];
int ctx = 0;
- if( i_refa > 0 && !h->mb.cache.skip[i8 - 1])
+ if( i_refa > 0 && !h->mb.cache.skip[i8 - 1] )
ctx++;
- if( i_refb > 0 && !h->mb.cache.skip[i8 - 8])
+ if( i_refb > 0 && !h->mb.cache.skip[i8 - 8] )
ctx += 2;
while( i_ref > 0 )
@@ -392,54 +351,68 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
{
- static const uint8_t ctxes[9] = { 0,3,4,5,6,6,6,6,6 };
const int i_abs = abs( mvd );
const int ctxbase = l ? 47 : 40;
int i;
-
+#if RDO_SKIP_BS
if( i_abs == 0 )
x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
- else if( i_abs < 9 )
+ else
{
x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
-#if RDO_SKIP_BS
- if( i_abs > 4 )
+ if( i_abs <= 3 )
{
- for( i = 1; i < 4; i++ )
- x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
- cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
- cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
+ for( i = 1; i < i_abs; i++ )
+ x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 );
+ x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 );
+ x264_cabac_encode_bypass( cb, mvd < 0 );
}
else
-#endif
{
- for( i = 1; i < i_abs; i++ )
- x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
- x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs], 0 );
- x264_cabac_encode_bypass( cb, mvd < 0 );
+ x264_cabac_encode_decision( cb, ctxbase + 3, 1 );
+ x264_cabac_encode_decision( cb, ctxbase + 4, 1 );
+ x264_cabac_encode_decision( cb, ctxbase + 5, 1 );
+ if( i_abs < 9 )
+ {
+ cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
+ cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
+ }
+ else
+ {
+ cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]];
+ cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]];
+ x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
+ }
}
}
+#else
+ static const uint8_t ctxes[8] = { 3,4,5,6,6,6,6,6 };
+
+ if( i_abs == 0 )
+ x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
else
{
x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
-#if RDO_SKIP_BS
- for( i = 1; i < 4; i++ )
- x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
- cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]];
- cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]];
- x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
-#else
- for( i = 1; i < 9; i++ )
- x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
- x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
+ if( i_abs < 9 )
+ {
+ for( i = 1; i < i_abs; i++ )
+ x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
+ x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 );
+ }
+ else
+ {
+ for( i = 1; i < 9; i++ )
+ x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
+ x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
+ }
x264_cabac_encode_bypass( cb, mvd < 0 );
-#endif
}
+#endif
}
-static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
+static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
- DECLARE_ALIGNED_4( int16_t mvp[2] );
+ ALIGNED_4( int16_t mvp[2] );
uint32_t amvd;
int mdx, mdy;
@@ -458,43 +431,35 @@ static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_l
}
#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
+do\
{\
- uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width,height);\
+ uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
-}
+} while(0)
-static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i )
+static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
{
- if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
- return;
-
switch( h->mb.i_sub_partition[i] )
{
case D_L0_8x8:
- case D_L1_8x8:
- case D_BI_8x8:
- x264_cabac_mb_mvd( h, cb, i_list, 4*i, 2, 2 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
break;
case D_L0_8x4:
- case D_L1_8x4:
- case D_BI_8x4:
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 2, 1 );
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 2, 1 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 2, 1 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+2, 2, 1 );
break;
case D_L0_4x8:
- case D_L1_4x8:
- case D_BI_4x8:
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 2 );
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 2 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 1, 2 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+1, 1, 2 );
break;
case D_L0_4x4:
- case D_L1_4x4:
- case D_BI_4x4:
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 1 );
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 1 );
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 1, 1 );
- x264_cabac_mb_mvd( h, cb, i_list, 4*i+3, 1, 1 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 1, 1 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+1, 1, 1 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+2, 1, 1 );
+ x264_cabac_mb_mvd( h, cb, 0, 4*i+3, 1, 1 );
break;
+ default:
+ assert(0);
}
}
@@ -519,9 +484,14 @@ static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int
/* no need to test for skip/pcm */
i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
- i_nza &= 0x7f + (b_intra << 7);
- i_nzb &= 0x7f + (b_intra << 7);
- return 85 + 4*i_cat + 2*!!i_nzb + !!i_nza;
+ if( x264_constant_p(b_intra) && !b_intra )
+ return 85 + 4*i_cat + ((2*i_nzb + i_nza)&0x7f);
+ else
+ {
+ i_nza &= 0x7f + (b_intra << 7);
+ i_nzb &= 0x7f + (b_intra << 7);
+ return 85 + 4*i_cat + 2*!!i_nzb + !!i_nza;
+ }
case DCT_LUMA_DC:
i_nza = (h->mb.cache.i_cbp_left >> 8) & 1;
i_nzb = (h->mb.cache.i_cbp_top >> 8) & 1;
@@ -861,7 +831,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
}
- else if( h->mb.i_partition == D_8x16 )
+ else //if( h->mb.i_partition == D_8x16 )
{
if( h->mb.pic.i_fref[0] > 1 )
{
@@ -875,10 +845,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
else if( i_mb_type == P_8x8 )
{
/* sub mb type */
- x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[0] );
- x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[1] );
- x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[2] );
- x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[3] );
+ for( i = 0; i < 4; i++ )
+ x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
/* ref 0 */
if( h->mb.pic.i_fref[0] > 1 )
@@ -890,57 +858,50 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
}
for( i = 0; i < 4; i++ )
- x264_cabac_mb8x8_mvd( h, cb, 0, i );
+ x264_cabac_mb8x8_mvd( h, cb, i );
}
else if( i_mb_type == B_8x8 )
{
/* sub mb type */
- x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[0] );
- x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[1] );
- x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[2] );
- x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[3] );
+ for( i = 0; i < 4; i++ )
+ x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
/* ref */
- for( i_list = 0; i_list < 2; i_list++ )
- {
- if( ( i_list ? h->mb.pic.i_fref[1] : h->mb.pic.i_fref[0] ) == 1 )
- continue;
+ if( h->mb.pic.i_fref[0] > 1 )
for( i = 0; i < 4; i++ )
- if( x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
- x264_cabac_mb_ref( h, cb, i_list, 4*i );
- }
+ if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+ x264_cabac_mb_ref( h, cb, 0, 4*i );
+
+ if( h->mb.pic.i_fref[1] > 1 )
+ for( i = 0; i < 4; i++ )
+ if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+ x264_cabac_mb_ref( h, cb, 1, 4*i );
for( i = 0; i < 4; i++ )
- x264_cabac_mb8x8_mvd( h, cb, 0, i );
+ if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+ x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
+
for( i = 0; i < 4; i++ )
- x264_cabac_mb8x8_mvd( h, cb, 1, i );
+ if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+ x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
}
else if( i_mb_type != B_DIRECT )
{
/* All B mode */
const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
-
- for( i_list = 0; i_list < 2; i_list++ )
+ if( h->mb.pic.i_fref[0] > 1 )
{
- const int i_ref_max = h->mb.pic.i_fref[i_list];
-
- if( i_ref_max > 1 )
- {
- if( h->mb.i_partition == D_16x16 )
- {
- if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
- }
- else if( h->mb.i_partition == D_16x8 )
- {
- if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
- if( b_list[i_list][1] ) x264_cabac_mb_ref( h, cb, i_list, 8 );
- }
- else if( h->mb.i_partition == D_8x16 )
- {
- if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
- if( b_list[i_list][1] ) x264_cabac_mb_ref( h, cb, i_list, 4 );
- }
- }
+ if( b_list[0][0] )
+ x264_cabac_mb_ref( h, cb, 0, 0 );
+ if( b_list[0][1] && h->mb.i_partition != D_16x16 )
+ x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
+ }
+ if( h->mb.pic.i_fref[1] > 1 )
+ {
+ if( b_list[1][0] )
+ x264_cabac_mb_ref( h, cb, 1, 0 );
+ if( b_list[1][1] && h->mb.i_partition != D_16x16 )
+ x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
}
for( i_list = 0; i_list < 2; i_list++ )
{
@@ -953,7 +914,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
}
- else if( h->mb.i_partition == D_8x16 )
+ else //if( h->mb.i_partition == D_8x16 )
{
if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
@@ -1010,11 +971,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra );
block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra );
- }
- if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
- {
- for( i = 16; i < 24; i++ )
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
+ if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+ for( i = 16; i < 24; i++ )
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
}
}
@@ -1027,9 +986,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
/*****************************************************************************
* RD only; doesn't generate a valid bitstream
* doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref or subpartition (never varies between calls, so no point in doing so)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
* works on all partition sizes except 16x16
- * for sub8x8, call once per 8x8 block
*****************************************************************************/
static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
{
@@ -1038,11 +997,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
int j;
if( i_mb_type == P_8x8 )
- x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
- else if( i_mb_type == P_L0 )
{
- x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
+ x264_cabac_mb8x8_mvd( h, cb, i8 );
+ x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
}
+ else if( i_mb_type == P_L0 )
+ x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
@@ -1050,8 +1010,10 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
}
else //if( i_mb_type == B_8x8 )
{
- x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
- x264_cabac_mb8x8_mvd( h, cb, 1, i8 );
+ if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
+ x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
+ if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+ x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
}
for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 1b0b5d1..c65c9bd 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -61,8 +61,9 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
/****************************************************************************
* block_residual_write_cavlc:
****************************************************************************/
-static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_suffix_length, int level )
+static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_length, int level )
{
+ bs_t *s = &h->out.bs;
static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
int i_level_prefix = 15;
int mask = level >> 15;
@@ -112,8 +113,9 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_s
return i_suffix_length;
}
-static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int16_t *l, int nC )
+static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l, int nC )
{
+ bs_t *s = &h->out.bs;
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
static const int count_cat[5] = {16, 15, 16, 4, 15};
x264_run_level_t runlevel;
@@ -157,7 +159,7 @@ static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, in
i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
}
else
- i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+ i_suffix_length = block_residual_write_cavlc_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
for( i = i_trailing+1; i < i_total; i++ )
{
val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
@@ -167,7 +169,7 @@ static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, in
i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
}
else
- i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+ i_suffix_length = block_residual_write_cavlc_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
}
}
@@ -191,18 +193,19 @@ static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, in
static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
-#define block_residual_write_cavlc(h,s,cat,idx,l)\
+#define block_residual_write_cavlc(h,cat,idx,l)\
{\
int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? 0 : idx )];\
uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
if( !*nnz )\
- bs_write_vlc( s, x264_coeff0_token[nC] );\
+ bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
else\
- *nnz = block_residual_write_cavlc(h,s,cat,l,nC);\
+ *nnz = block_residual_write_cavlc(h,cat,l,nC);\
}
-static void cavlc_qp_delta( x264_t *h, bs_t *s )
+static void cavlc_qp_delta( x264_t *h )
{
+ bs_t *s = &h->out.bs;
int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
/* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
@@ -225,50 +228,40 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
bs_write_se( s, i_dqp );
}
-static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
+static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
{
- DECLARE_ALIGNED_4( int16_t mvp[2] );
+ bs_t *s = &h->out.bs;
+ ALIGNED_4( int16_t mvp[2] );
x264_mb_predict_mv( h, i_list, idx, width, mvp );
bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
}
-static void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i_list, int i )
+static inline void cavlc_mb8x8_mvd( x264_t *h, int i )
{
- if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
- return;
-
switch( h->mb.i_sub_partition[i] )
{
case D_L0_8x8:
- case D_L1_8x8:
- case D_BI_8x8:
- cavlc_mb_mvd( h, s, i_list, 4*i, 2 );
+ cavlc_mb_mvd( h, 0, 4*i, 2 );
break;
case D_L0_8x4:
- case D_L1_8x4:
- case D_BI_8x4:
- cavlc_mb_mvd( h, s, i_list, 4*i+0, 2 );
- cavlc_mb_mvd( h, s, i_list, 4*i+2, 2 );
+ cavlc_mb_mvd( h, 0, 4*i+0, 2 );
+ cavlc_mb_mvd( h, 0, 4*i+2, 2 );
break;
case D_L0_4x8:
- case D_L1_4x8:
- case D_BI_4x8:
- cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 );
- cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
+ cavlc_mb_mvd( h, 0, 4*i+0, 1 );
+ cavlc_mb_mvd( h, 0, 4*i+1, 1 );
break;
case D_L0_4x4:
- case D_L1_4x4:
- case D_BI_4x4:
- cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 );
- cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
- cavlc_mb_mvd( h, s, i_list, 4*i+2, 1 );
- cavlc_mb_mvd( h, s, i_list, 4*i+3, 1 );
+ cavlc_mb_mvd( h, 0, 4*i+0, 1 );
+ cavlc_mb_mvd( h, 0, 4*i+1, 1 );
+ cavlc_mb_mvd( h, 0, 4*i+2, 1 );
+ cavlc_mb_mvd( h, 0, 4*i+3, 1 );
break;
}
}
-static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8start, int i8end )
+static inline void x264_macroblock_luma_write_cavlc( x264_t *h, int i8start, int i8end )
{
int i8, i4;
if( h->mb.b_transform_8x8 )
@@ -282,20 +275,23 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
for( i4 = 0; i4 < 4; i4++ )
- block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
+ block_residual_write_cavlc( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
}
/*****************************************************************************
* x264_macroblock_write:
*****************************************************************************/
-void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
+void x264_macroblock_write_cavlc( x264_t *h )
{
+ bs_t *s = &h->out.bs;
const int i_mb_type = h->mb.i_type;
- static const int i_offsets[3] = {5,23,0};
+ static const uint8_t i_offsets[3] = {5,23,0};
int i_mb_i_offset = i_offsets[h->sh.i_type];
int i;
-#if !RDO_SKIP_BS
+#if RDO_SKIP_BS
+ s->i_bits_encoded = 0;
+#else
const int i_mb_pos_start = bs_pos( s );
int i_mb_pos_tex;
#endif
@@ -309,6 +305,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
#if !RDO_SKIP_BS
if( i_mb_type == I_PCM )
{
+ uint8_t *p_start = s->p_start;
bs_write_ue( s, i_mb_i_offset + 25 );
i_mb_pos_tex = bs_pos( s );
h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
@@ -324,6 +321,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
s->p += 64;
+ bs_init( s, s->p, s->p_end - s->p );
+ s->p_start = p_start;
+
/* if PCM is chosen, we need to store reconstructed frame data */
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
@@ -366,17 +366,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
}
else if( i_mb_type == P_L0 )
{
- DECLARE_ALIGNED_4( int16_t mvp[2] );
-
if( h->mb.i_partition == D_16x16 )
{
bs_write1( s, 1 );
if( h->mb.pic.i_fref[0] > 1 )
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- x264_mb_predict_mv( h, 0, 0, 4, mvp );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+ cavlc_mb_mvd( h, 0, 0, 4 );
}
else if( h->mb.i_partition == D_16x8 )
{
@@ -386,14 +382,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
}
-
- x264_mb_predict_mv( h, 0, 0, 4, mvp );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
- x264_mb_predict_mv( h, 0, 8, 4, mvp );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][1] - mvp[1] );
+ cavlc_mb_mvd( h, 0, 0, 4 );
+ cavlc_mb_mvd( h, 0, 8, 4 );
}
else if( h->mb.i_partition == D_8x16 )
{
@@ -403,14 +393,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
}
-
- x264_mb_predict_mv( h, 0, 0, 2, mvp );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
- x264_mb_predict_mv( h, 0, 4, 2, mvp );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][1] - mvp[1] );
+ cavlc_mb_mvd( h, 0, 0, 2 );
+ cavlc_mb_mvd( h, 0, 4, 2 );
}
}
else if( i_mb_type == P_8x8 )
@@ -445,7 +429,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
}
for( i = 0; i < 4; i++ )
- cavlc_mb8x8_mvd( h, s, 0, i );
+ cavlc_mb8x8_mvd( h, i );
}
else if( i_mb_type == B_8x8 )
{
@@ -467,80 +451,47 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
/* mvd */
for( i = 0; i < 4; i++ )
- cavlc_mb8x8_mvd( h, s, 0, i );
+ if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+ cavlc_mb_mvd( h, 0, 4*i, 2 );
for( i = 0; i < 4; i++ )
- cavlc_mb8x8_mvd( h, s, 1, i );
+ if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+ cavlc_mb_mvd( h, 1, 4*i, 2 );
}
else if( i_mb_type != B_DIRECT )
{
/* All B mode */
/* Motion Vector */
- int i_list;
- DECLARE_ALIGNED_4( int16_t mvp[2] );
const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
+ const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
+ const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
-
- for( i_list = 0; i_list < 2; i_list++ )
+ if( h->mb.i_partition == D_16x16 )
{
- const int i_ref_max = (i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1]) - 1;
-
- if( i_ref_max )
- switch( h->mb.i_partition )
- {
- case D_16x16:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
- break;
- case D_16x8:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
- if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[8]] );
- break;
- case D_8x16:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
- if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[4]] );
- break;
- }
+ if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
+ if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
+ if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
+ if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
}
- for( i_list = 0; i_list < 2; i_list++ )
+ else
{
- switch( h->mb.i_partition )
+ if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
+ if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
+ if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
+ if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
+ if( h->mb.i_partition == D_16x8 )
{
- case D_16x16:
- if( b_list[i_list][0] )
- {
- x264_mb_predict_mv( h, i_list, 0, 4, mvp );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
- }
- break;
- case D_16x8:
- if( b_list[i_list][0] )
- {
- x264_mb_predict_mv( h, i_list, 0, 4, mvp );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
- }
- if( b_list[i_list][1] )
- {
- x264_mb_predict_mv( h, i_list, 8, 4, mvp );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][1] - mvp[1] );
- }
- break;
- case D_8x16:
- if( b_list[i_list][0] )
- {
- x264_mb_predict_mv( h, i_list, 0, 2, mvp );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
- }
- if( b_list[i_list][1] )
- {
- x264_mb_predict_mv( h, i_list, 4, 2, mvp );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][0] - mvp[0] );
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][1] - mvp[1] );
- }
- break;
+ if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
+ if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 8, 4 );
+ if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
+ if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 8, 4 );
+ }
+ else //if( h->mb.i_partition == D_8x16 )
+ {
+ if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 2 );
+ if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 4, 2 );
+ if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 2 );
+ if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 4, 2 );
}
}
}
@@ -565,29 +516,29 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
/* write residual */
if( i_mb_type == I_16x16 )
{
- cavlc_qp_delta( h, s );
+ cavlc_qp_delta( h );
/* DC Luma */
- block_residual_write_cavlc( h, s, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
+ block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
/* AC Luma */
if( h->mb.i_cbp_luma )
for( i = 0; i < 16; i++ )
- block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
+ block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
}
else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
{
- cavlc_qp_delta( h, s );
- x264_macroblock_luma_write_cavlc( h, s, 0, 3 );
+ cavlc_qp_delta( h );
+ x264_macroblock_luma_write_cavlc( h, 0, 3 );
}
if( h->mb.i_cbp_chroma )
{
/* Chroma DC residual present */
- block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
- block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
+ block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+ block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
for( i = 16; i < 24; i++ )
- block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
}
#if !RDO_SKIP_BS
@@ -599,37 +550,42 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
/*****************************************************************************
* RD only; doesn't generate a valid bitstream
* doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref or subpartition (never varies between calls, so no point in doing so)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
* works on all partition sizes except 16x16
- * for sub8x8, call once per 8x8 block
*****************************************************************************/
static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
{
+ bs_t *s = &h->out.bs;
const int i_mb_type = h->mb.i_type;
int b_8x16 = h->mb.i_partition == D_8x16;
int j;
- h->out.bs.i_bits_encoded = 0;
if( i_mb_type == P_8x8 )
- cavlc_mb8x8_mvd( h, &h->out.bs, 0, i8 );
+ {
+ cavlc_mb8x8_mvd( h, i8 );
+ bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
+ }
else if( i_mb_type == P_L0 )
- cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 4>>b_8x16 );
+ cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
- if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 4>>b_8x16 );
- if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, &h->out.bs, 1, 4*i8, 4>>b_8x16 );
+ if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
+ if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
}
else //if( i_mb_type == B_8x8 )
{
- cavlc_mb8x8_mvd( h, &h->out.bs, 0, i8 );
- cavlc_mb8x8_mvd( h, &h->out.bs, 1, i8 );
+ if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
+ cavlc_mb_mvd( h, 0, 4*i8, 2 );
+ if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+ cavlc_mb_mvd( h, 1, 4*i8, 2 );
}
for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
{
- x264_macroblock_luma_write_cavlc( h, &h->out.bs, i8, i8 );
- block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
- block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 );
+ x264_macroblock_luma_write_cavlc( h, i8, i8 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
@@ -640,12 +596,12 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
{
int b_8x4 = i_pixel == PIXEL_8x4;
h->out.bs.i_bits_encoded = 0;
- cavlc_mb_mvd( h, &h->out.bs, 0, i4, 1+b_8x4 );
- block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+ cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
+ block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
if( i_pixel != PIXEL_4x4 )
{
i4 += 2-b_8x4;
- block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+ block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
}
return h->out.bs.i_bits_encoded;
@@ -663,14 +619,14 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
{
h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- x264_macroblock_luma_write_cavlc( h, &h->out.bs, i8, i8 );
+ x264_macroblock_luma_write_cavlc( h, i8, i8 );
return h->out.bs.i_bits_encoded;
}
static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
{
h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
- block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+ block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
return h->out.bs.i_bits_encoded;
}
@@ -679,14 +635,14 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
if( h->mb.i_cbp_chroma )
{
- block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
- block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
+ block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+ block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma == 2 )
{
int i;
for( i = 16; i < 24; i++ )
- block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
}
}
return h->out.bs.i_bits_encoded;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 176443b..d873cd0 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -31,8 +31,9 @@
#include "analyse.h"
#include "ratecontrol.h"
#include "macroblock.h"
+#include "me.h"
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
#include "common/visualize.h"
#endif
@@ -42,9 +43,9 @@
#define bs_write_ue bs_write_ue_big
-static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
- x264_nal_t **pp_nal, int *pi_nal,
- x264_picture_t *pic_out );
+static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+ x264_nal_t **pp_nal, int *pi_nal,
+ x264_picture_t *pic_out );
/****************************************************************************
*
@@ -67,7 +68,7 @@ static void x264_frame_dump( x264_t *h )
if( !f )
return;
/* Write the frame in display order */
- fseek( f, h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
+ fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
for( i = 0; i < h->fdec->i_plane; i++ )
for( y = 0; y < h->param.i_height >> !!i; y++ )
fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f );
@@ -88,7 +89,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
sh->pps = pps;
sh->i_first_mb = 0;
- sh->i_last_mb = h->sps->i_mb_width * h->sps->i_mb_height;
+ sh->i_last_mb = h->mb.i_mb_count - 1;
sh->i_pps_id = pps->i_id;
sh->i_frame_num = i_frame;
@@ -175,12 +176,12 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
bs_write_ue( s, sh->i_type + 5 ); /* same type things */
bs_write_ue( s, sh->i_pps_id );
- bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num );
+ bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num & ((1<<sh->sps->i_log2_max_frame_num)-1) );
if( !sh->sps->b_frame_mbs_only )
{
bs_write1( s, sh->b_field_pic );
- if ( sh->b_field_pic )
+ if( sh->b_field_pic )
bs_write1( s, sh->b_bottom_field );
}
@@ -191,7 +192,7 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
if( sh->sps->i_poc_type == 0 )
{
- bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb );
+ bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb & ((1<<sh->sps->i_log2_max_poc_lsb)-1) );
if( sh->pps->b_pic_order && !sh->b_field_pic )
{
bs_write_se( s, sh->i_delta_poc_bottom );
@@ -257,10 +258,36 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
}
}
- if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) ||
- ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) )
+ if( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) )
{
- /* FIXME */
+ /* pred_weight_table() */
+ bs_write_ue( s, sh->weight[0][0].i_denom );
+ bs_write_ue( s, sh->weight[0][1].i_denom );
+ for( i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
+ {
+ int luma_weight_l0_flag = !!sh->weight[i][0].weightfn;
+ int chroma_weight_l0_flag = !!sh->weight[i][1].weightfn || !!sh->weight[i][2].weightfn;
+ bs_write1( s, luma_weight_l0_flag );
+ if( luma_weight_l0_flag )
+ {
+ bs_write_se( s, sh->weight[i][0].i_scale );
+ bs_write_se( s, sh->weight[i][0].i_offset );
+ }
+ bs_write1( s, chroma_weight_l0_flag );
+ if( chroma_weight_l0_flag )
+ {
+ int j;
+ for( j = 1; j < 3; j++ )
+ {
+ bs_write_se( s, sh->weight[i][j].i_scale );
+ bs_write_se( s, sh->weight[i][j].i_offset );
+ }
+ }
+ }
+ }
+ else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B )
+ {
+ /* TODO */
}
if( i_nal_ref_idc != 0 )
@@ -272,7 +299,17 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
}
else
{
- bs_write1( s, 0 ); /* adaptive_ref_pic_marking_mode_flag */
+ bs_write1( s, sh->i_mmco_command_count > 0 ); /* adaptive_ref_pic_marking_mode_flag */
+ if( sh->i_mmco_command_count > 0 )
+ {
+ int i;
+ for( i = 0; i < sh->i_mmco_command_count; i++ )
+ {
+ bs_write_ue( s, 1 ); /* mark short term ref as unused */
+ bs_write_ue( s, sh->mmco[i].i_difference_of_pic_nums - 1 );
+ }
+ bs_write_ue( s, 0 ); /* end command list */
+ }
}
}
@@ -295,17 +332,18 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
/* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */
/* reallocate, adding an arbitrary amount of space (100 kilobytes). */
-static void x264_bitstream_check_buffer( x264_t *h )
+static int x264_bitstream_check_buffer( x264_t *h )
{
+ uint8_t *bs_bak = h->out.p_bitstream;
if( ( h->param.b_cabac && (h->cabac.p_end - h->cabac.p < 2500) )
|| ( h->out.bs.p_end - h->out.bs.p < 2500 ) )
{
- uint8_t *bs_bak = h->out.p_bitstream;
intptr_t delta;
int i;
h->out.i_bitstream += 100000;
- h->out.p_bitstream = x264_realloc( h->out.p_bitstream, h->out.i_bitstream );
+ CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream );
+ h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - 100000) & ~15 );
delta = h->out.p_bitstream - bs_bak;
h->out.bs.p_start += delta;
@@ -318,7 +356,12 @@ static void x264_bitstream_check_buffer( x264_t *h )
for( i = 0; i <= h->out.i_nal; i++ )
h->out.nal[i].p_payload += delta;
+ x264_free( bs_bak );
}
+ return 0;
+fail:
+ x264_free( bs_bak );
+ return -1;
}
/****************************************************************************
@@ -332,9 +375,9 @@ static void x264_bitstream_check_buffer( x264_t *h )
static int x264_validate_parameters( x264_t *h )
{
#ifdef HAVE_MMX
- if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+ if( !(x264_cpu_detect() & X264_CPU_SSE) )
{
- x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+ x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
return -1;
}
@@ -352,14 +395,15 @@ static int x264_validate_parameters( x264_t *h )
h->param.i_width, h->param.i_height );
return -1;
}
- if( h->param.i_csp != X264_CSP_I420 )
+ int i_csp = h->param.i_csp & X264_CSP_MASK;
+ if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
{
- x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420 supported)\n" );
+ x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12 supported)\n" );
return -1;
}
- if( h->param.i_threads == 0 )
- h->param.i_threads = x264_cpu_num_processors() * 3/2;
+ if( h->param.i_threads == X264_THREADS_AUTO )
+ h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
if( h->param.i_threads > 1 )
{
@@ -367,7 +411,17 @@ static int x264_validate_parameters( x264_t *h )
x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
h->param.i_threads = 1;
#endif
+ /* Avoid absurdly small thread slices as they can reduce performance
+ * and VBV compliance. Capped at an arbitrary 4 rows per thread. */
+ if( h->param.b_sliced_threads )
+ {
+ int max_threads = (h->param.i_height+15)/16 / 4;
+ h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
+ }
}
+ else
+ h->param.b_sliced_threads = 0;
+ h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
if( h->param.b_interlaced )
{
@@ -381,6 +435,31 @@ static int x264_validate_parameters( x264_t *h )
x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
}
+ if( h->param.analyse.i_weighted_pred > 0 )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
+ h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ }
+ }
+
+ /* Detect default ffmpeg settings and terminate with an error. */
+ {
+ int score = 0;
+ score += h->param.analyse.i_me_range == 0;
+ score += h->param.rc.i_qp_step == 3;
+ score += h->param.i_keyint_max == 12;
+ score += h->param.rc.i_qp_min == 2;
+ score += h->param.rc.i_qp_max == 31;
+ score += h->param.rc.f_qcompress == 0.5;
+ score += fabs(h->param.rc.f_ip_factor - 1.25) < 0.01;
+ score += fabs(h->param.rc.f_pb_factor - 1.25) < 0.01;
+ score += h->param.analyse.inter == 0 && h->param.analyse.i_subpel_refine == 8;
+ if( score >= 5 )
+ {
+ x264_log( h, X264_LOG_ERROR, "broken ffmpeg default settings detected\n" );
+ x264_log( h, X264_LOG_ERROR, "use an encoding preset (vpre)\n" );
+ return -1;
+ }
}
if( h->param.rc.i_rc_method < 0 || h->param.rc.i_rc_method > 2 )
@@ -391,7 +470,10 @@ static int x264_validate_parameters( x264_t *h )
h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, 51 );
h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
if( h->param.rc.i_rc_method == X264_RC_CRF )
+ {
h->param.rc.i_qp_constant = h->param.rc.f_rf_constant;
+ h->param.rc.i_bitrate = 0;
+ }
if( (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF)
&& h->param.rc.i_qp_constant == 0 )
{
@@ -421,19 +503,31 @@ static int x264_validate_parameters( x264_t *h )
h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
h->param.rc.i_aq_mode = 0;
+ h->param.rc.b_mb_tree = 0;
}
h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
- if( ( h->param.i_width % 16 || h->param.i_height % 16 )
- && h->param.i_height != 1080 && !h->mb.b_lossless )
+ int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
+ if( h->param.b_sliced_threads )
+ h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
+ else
{
- // There's nothing special about 1080 in that the warning still applies to it,
- // but chances are the user can't help it if his content is already 1080p,
- // so there's no point in warning in that case.
- x264_log( h, X264_LOG_WARNING,
- "width or height not divisible by 16 (%dx%d), compression will suffer.\n",
- h->param.i_width, h->param.i_height );
+ h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
+ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
+ h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
+ if( h->param.b_interlaced && h->param.i_slice_max_size )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+ h->param.i_slice_max_size = 0;
+ }
+ if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+ h->param.i_slice_max_mbs = 0;
+ }
+ if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
+ h->param.i_slice_count = 0;
}
h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
@@ -441,18 +535,68 @@ static int x264_validate_parameters( x264_t *h )
h->param.i_keyint_max = 1;
if( h->param.i_scenecut_threshold < 0 )
h->param.i_scenecut_threshold = 0;
- h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
{
x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" );
h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
}
h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_BFRAME_MAX );
+ if( h->param.i_keyint_max == 1 )
+ {
+ h->param.i_bframe = 0;
+ h->param.b_intra_refresh = 0;
+ }
h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
- h->param.b_bframe_pyramid = h->param.b_bframe_pyramid && h->param.i_bframe > 1;
+ if( h->param.i_bframe <= 1 )
+ h->param.i_bframe_pyramid = X264_B_PYRAMID_NONE;
+ h->param.i_bframe_pyramid = x264_clip3( h->param.i_bframe_pyramid, X264_B_PYRAMID_NONE, X264_B_PYRAMID_NORMAL );
if( !h->param.i_bframe )
+ {
h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
- h->param.analyse.b_weighted_bipred = h->param.analyse.b_weighted_bipred && h->param.i_bframe > 0;
+ h->param.analyse.i_direct_mv_pred = 0;
+ h->param.analyse.b_weighted_bipred = 0;
+ }
+ if( h->param.b_intra_refresh && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL )
+ {
+ x264_log( h, X264_LOG_WARNING, "b-pyramid normal + intra-refresh is not supported\n" );
+ h->param.i_bframe_pyramid = X264_B_PYRAMID_STRICT;
+ }
+ if( h->param.b_intra_refresh && h->param.i_frame_reference > 1 )
+ {
+ x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
+ h->param.i_frame_reference = 1;
+ }
+ if( h->param.b_intra_refresh )
+ h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
+ h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
+ h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
+ {
+ int maxrate = X264_MAX( h->param.rc.i_vbv_max_bitrate, h->param.rc.i_bitrate );
+ float bufsize = maxrate ? (float)h->param.rc.i_vbv_buffer_size / maxrate : 0;
+ float fps = h->param.i_fps_num > 0 && h->param.i_fps_den > 0 ? (float) h->param.i_fps_num / h->param.i_fps_den : 25.0;
+ h->param.rc.i_lookahead = X264_MIN( h->param.rc.i_lookahead, X264_MAX( h->param.i_keyint_max, bufsize*fps ) );
+ }
+
+ if( !h->param.i_timebase_num || !h->param.i_timebase_den )
+ {
+ h->param.i_timebase_num = h->param.i_fps_den;
+ h->param.i_timebase_den = h->param.i_fps_num;
+ }
+
+ h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 );
+ if( !h->param.rc.i_lookahead || h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 )
+ h->param.rc.b_mb_tree = 0;
+ if( h->param.rc.b_stat_read )
+ h->param.rc.i_lookahead = 0;
+#ifdef HAVE_PTHREAD
+ if( h->param.i_sync_lookahead )
+ h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->i_thread_frames + h->param.i_bframe, X264_LOOKAHEAD_MAX );
+ if( h->param.rc.b_stat_read || h->i_thread_frames == 1 )
+ h->param.i_sync_lookahead = 0;
+#else
+ h->param.i_sync_lookahead = 0;
+#endif
+
h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
&& h->param.i_bframe
&& ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
@@ -477,7 +621,7 @@ static int x264_validate_parameters( x264_t *h )
if( h->param.analyse.i_me_method == X264_ME_TESA &&
(h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
h->param.analyse.i_me_method = X264_ME_ESA;
- h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 9 );
+ h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 10 );
h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16|
X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
@@ -493,6 +637,11 @@ static int x264_validate_parameters( x264_t *h )
if( !h->param.b_cabac )
h->param.analyse.i_trellis = 0;
h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
+ if( !h->param.analyse.b_psy )
+ {
+ h->param.analyse.f_psy_rd = 0;
+ h->param.analyse.f_psy_trellis = 0;
+ }
if( !h->param.analyse.i_trellis )
h->param.analyse.f_psy_trellis = 0;
h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
@@ -513,11 +662,19 @@ static int x264_validate_parameters( x264_t *h )
else
h->mb.i_psy_trellis = 0;
h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
- h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 );
+ h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
if( h->param.rc.f_aq_strength == 0 )
h->param.rc.i_aq_mode = 0;
+ /* MB-tree requires AQ to be on, even if the strength is zero. */
+ if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree )
+ {
+ h->param.rc.i_aq_mode = 1;
+ h->param.rc.f_aq_strength = 0;
+ }
h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
+ if( h->param.analyse.i_subpel_refine == 10 && (h->param.analyse.i_trellis != 2 || !h->param.rc.i_aq_mode) )
+ h->param.analyse.i_subpel_refine = 9;
{
const x264_level_t *l = x264_levels;
@@ -548,7 +705,11 @@ static int x264_validate_parameters( x264_t *h )
h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> h->param.b_interlaced);
}
- if( h->param.i_threads > 1 )
+ h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, 0, X264_WEIGHTP_SMART );
+ if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
+ h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
+
+ if( h->i_thread_frames > 1 )
{
int r = h->param.analyse.i_mv_range_thread;
int r2;
@@ -558,7 +719,7 @@ static int x264_validate_parameters( x264_t *h )
// the rest is allocated to whichever thread is far enough ahead to use it.
// reserving more space increases quality for some videos, but costs more time
// in thread synchronization.
- int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->param.i_threads - X264_THREAD_HEIGHT;
+ int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->i_thread_frames - X264_THREAD_HEIGHT;
r = max_range / 2;
}
r = X264_MAX( r, h->param.analyse.i_me_range );
@@ -587,13 +748,28 @@ static int x264_validate_parameters( x264_t *h )
/* ensure the booleans are 0 or 1 so they can be used in math */
#define BOOLIFY(x) h->param.x = !!h->param.x
BOOLIFY( b_cabac );
+ BOOLIFY( b_constrained_intra );
BOOLIFY( b_deblocking_filter );
+ BOOLIFY( b_deterministic );
+ BOOLIFY( b_sliced_threads );
BOOLIFY( b_interlaced );
+ BOOLIFY( b_intra_refresh );
+ BOOLIFY( b_visualize );
+ BOOLIFY( b_aud );
+ BOOLIFY( b_repeat_headers );
+ BOOLIFY( b_annexb );
BOOLIFY( analyse.b_transform_8x8 );
+ BOOLIFY( analyse.b_weighted_bipred );
BOOLIFY( analyse.b_chroma_me );
+ BOOLIFY( analyse.b_mixed_references );
BOOLIFY( analyse.b_fast_pskip );
+ BOOLIFY( analyse.b_dct_decimate );
+ BOOLIFY( analyse.b_psy );
+ BOOLIFY( analyse.b_psnr );
+ BOOLIFY( analyse.b_ssim );
BOOLIFY( rc.b_stat_write );
BOOLIFY( rc.b_stat_read );
+ BOOLIFY( rc.b_mb_tree );
#undef BOOLIFY
return 0;
@@ -613,94 +789,121 @@ static void mbcmp_init( x264_t *h )
memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
}
+static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
+{
+ /* VUI */
+ if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
+ {
+ int i_w = param->vui.i_sar_width;
+ int i_h = param->vui.i_sar_height;
+ int old_w = h->param.vui.i_sar_width;
+ int old_h = h->param.vui.i_sar_height;
+
+ x264_reduce_fraction( &i_w, &i_h );
+
+ while( i_w > 65535 || i_h > 65535 )
+ {
+ i_w /= 2;
+ i_h /= 2;
+ }
+
+ x264_reduce_fraction( &i_w, &i_h );
+
+ if( i_w != old_w || i_h != old_h || initial )
+ {
+ h->param.vui.i_sar_width = 0;
+ h->param.vui.i_sar_height = 0;
+ if( i_w == 0 || i_h == 0 )
+ x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" );
+ else
+ {
+ x264_log( h, initial?X264_LOG_INFO:X264_LOG_DEBUG, "using SAR=%d/%d\n", i_w, i_h );
+ h->param.vui.i_sar_width = i_w;
+ h->param.vui.i_sar_height = i_h;
+ }
+ }
+ }
+}
+
/****************************************************************************
* x264_encoder_open:
****************************************************************************/
-x264_t *x264_encoder_open ( x264_param_t *param )
+x264_t *x264_encoder_open( x264_param_t *param )
{
- x264_t *h = x264_malloc( sizeof( x264_t ) );
+ x264_t *h;
char buf[1000], *p;
- int i;
+ int i, qp, i_slicetype_length;
- memset( h, 0, sizeof( x264_t ) );
+ CHECKED_MALLOCZERO( h, sizeof(x264_t) );
/* Create a copy of param */
- memcpy( &h->param, param, sizeof( x264_param_t ) );
+ memcpy( &h->param, param, sizeof(x264_param_t) );
+
+ if( param->param_free )
+ param->param_free( param );
if( x264_validate_parameters( h ) < 0 )
- {
- x264_free( h );
- return NULL;
- }
+ goto fail;
if( h->param.psz_cqm_file )
if( x264_cqm_parse_file( h, h->param.psz_cqm_file ) < 0 )
- {
- x264_free( h );
- return NULL;
- }
+ goto fail;
if( h->param.rc.psz_stat_out )
h->param.rc.psz_stat_out = strdup( h->param.rc.psz_stat_out );
if( h->param.rc.psz_stat_in )
h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in );
- /* VUI */
- if( h->param.vui.i_sar_width > 0 && h->param.vui.i_sar_height > 0 )
- {
- int i_w = param->vui.i_sar_width;
- int i_h = param->vui.i_sar_height;
-
- x264_reduce_fraction( &i_w, &i_h );
-
- while( i_w > 65535 || i_h > 65535 )
- {
- i_w /= 2;
- i_h /= 2;
- }
-
- h->param.vui.i_sar_width = 0;
- h->param.vui.i_sar_height = 0;
- if( i_w == 0 || i_h == 0 )
- {
- x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" );
- }
- else
- {
- x264_log( h, X264_LOG_INFO, "using SAR=%d/%d\n", i_w, i_h );
- h->param.vui.i_sar_width = i_w;
- h->param.vui.i_sar_height = i_h;
- }
- }
+ x264_set_aspect_ratio( h, &h->param, 1 );
x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
+ x264_reduce_fraction( &h->param.i_timebase_num, &h->param.i_timebase_den );
/* Init x264_t */
- h->i_frame = 0;
+ h->i_frame = -1;
h->i_frame_num = 0;
h->i_idr_pic_id = 0;
+ if( h->param.b_dts_compress )
+ {
+ /* h->i_dts_compress_multiplier == h->frames.i_bframe_delay + 1 */
+ h->i_dts_compress_multiplier = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 3 : 2) : 1;
+ if( h->i_dts_compress_multiplier != 1 )
+ x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %d/%d -> %d/%d\n",
+ h->param.i_timebase_num, h->param.i_timebase_den,
+ h->param.i_timebase_num, h->param.i_timebase_den * h->i_dts_compress_multiplier );
+ h->param.i_timebase_den *= h->i_dts_compress_multiplier;
+ }
+ else
+ h->i_dts_compress_multiplier = 1;
h->sps = &h->sps_array[0];
x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
h->pps = &h->pps_array[0];
- x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps);
+ x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps );
x264_validate_levels( h, 1 );
+ h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
+
if( x264_cqm_init( h ) < 0 )
- {
- x264_free( h );
- return NULL;
- }
+ goto fail;
h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
/* Init frames. */
if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
- h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_threads - 1;
+ h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4;
else
- h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
+ h->frames.i_delay = h->param.i_bframe;
+ if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
+ h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
+ i_slicetype_length = h->frames.i_delay;
+ h->frames.i_delay += h->i_thread_frames - 1;
+ h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
+ h->frames.i_delay += h->param.i_sync_lookahead;
+ h->frames.i_bframe_delay = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 2 : 1) : 0;
+
h->frames.i_max_ref0 = h->param.i_frame_reference;
h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering;
@@ -708,20 +911,26 @@ x264_t *x264_encoder_open ( x264_param_t *param )
&& ( h->param.rc.i_rc_method == X264_RC_ABR
|| h->param.rc.i_rc_method == X264_RC_CRF
|| h->param.i_bframe_adaptive
- || h->param.i_scenecut_threshold );
- h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+ || h->param.i_scenecut_threshold
+ || h->param.rc.b_mb_tree
+ || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART );
+ h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
- h->frames.i_last_idr = - h->param.i_keyint_max;
+ h->frames.i_last_keyframe = - h->param.i_keyint_max;
h->frames.i_input = 0;
- h->frames.last_nonb = NULL;
+ CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
+ /* Allocate room for max refs plus a few extra just in case. */
+ CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + 20) * sizeof(x264_frame_t *) );
+ CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+ + h->i_thread_frames + 3) * sizeof(x264_frame_t *) );
+ if( h->param.analyse.i_weighted_pred > 0 )
+ CHECKED_MALLOCZERO( h->frames.blank_unused, h->i_thread_frames * 4 * sizeof(x264_frame_t *) );
h->i_ref0 = 0;
h->i_ref1 = 0;
- h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
-
- x264_rdo_init( );
+ x264_rdo_init();
/* init CPU functions */
x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
@@ -744,57 +953,105 @@ x264_t *x264_encoder_open ( x264_param_t *param )
for( i=0; x264_cpu_names[i].flags; i++ )
{
if( !strcmp(x264_cpu_names[i].name, "SSE2")
- && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+ && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
continue;
if( !strcmp(x264_cpu_names[i].name, "SSE3")
- && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+ && (h->param.cpu & X264_CPU_SSSE3 || !(h->param.cpu & X264_CPU_CACHELINE_64)) )
continue;
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
- && (param->cpu & X264_CPU_SSE42) )
+ && (h->param.cpu & X264_CPU_SSE42) )
continue;
- if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
+ if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
}
- if( !param->cpu )
+ if( !h->param.cpu )
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
+ for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+ if( x264_analyse_init_costs( h, qp ) )
+ goto fail;
+ if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+ goto fail;
+ if( h->cost_mv[1][2013] != 24 )
+ {
+ x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
+ goto fail;
+ }
+
h->out.i_nal = 0;
h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
: pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
+ CHECKED_MALLOC( h->nal_buffer, h->out.i_bitstream * 3/2 + 4 );
+ h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+
h->thread[0] = h;
- h->i_thread_num = 0;
- for( i = 1; i < h->param.i_threads; i++ )
- h->thread[i] = x264_malloc( sizeof(x264_t) );
+ for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+ CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+
+ if( x264_lookahead_init( h, i_slicetype_length ) )
+ goto fail;
for( i = 0; i < h->param.i_threads; i++ )
{
+ int init_nal_count = h->param.i_slice_count + 3;
+ int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
if( i > 0 )
*h->thread[i] = *h;
- h->thread[i]->fdec = x264_frame_pop_unused( h );
- h->thread[i]->out.p_bitstream = x264_malloc( h->out.i_bitstream );
- if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
- return NULL;
+
+ if( allocate_threadlocal_data )
+ {
+ h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
+ if( !h->thread[i]->fdec )
+ goto fail;
+ }
+ else
+ h->thread[i]->fdec = h->thread[0]->fdec;
+
+ CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
+ /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
+ CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
+ h->thread[i]->out.i_nals_allocated = init_nal_count;
+
+ if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
+ goto fail;
+ }
+
+ /* Allocate scratch buffer */
+ for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+ {
+ int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
+ int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+ int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+ int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+ ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
+ int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
+ int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
+ CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
}
if( x264_ratecontrol_new( h ) < 0 )
- return NULL;
+ goto fail;
if( h->param.psz_dump_yuv )
{
/* create or truncate the reconstructed video file */
FILE *f = fopen( h->param.psz_dump_yuv, "w" );
- if( f )
- fclose( f );
- else
+ if( !f )
+ {
+ x264_log( h, X264_LOG_ERROR, "dump_yuv: can't write to %s\n", h->param.psz_dump_yuv );
+ goto fail;
+ }
+ else if( !x264_is_regular_file( f ) )
{
- x264_log( h, X264_LOG_ERROR, "can't write to fdec.yuv\n" );
- x264_free( h );
- return NULL;
+ x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv );
+ goto fail;
}
+ fclose( f );
}
x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
@@ -804,6 +1061,9 @@ x264_t *x264_encoder_open ( x264_param_t *param )
"High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
return h;
+fail:
+ x264_free( h );
+ return NULL;
}
/****************************************************************************
@@ -811,6 +1071,8 @@ x264_t *x264_encoder_open ( x264_param_t *param )
****************************************************************************/
int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
{
+ h = h->thread[h->i_thread_phase];
+ x264_set_aspect_ratio( h, param, 0 );
#define COPY(var) h->param.var = param->var
COPY( i_frame_reference ); // but never uses more refs than initially specified
COPY( i_bframe_bias );
@@ -819,8 +1081,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
COPY( b_deblocking_filter );
COPY( i_deblocking_filter_alphac0 );
COPY( i_deblocking_filter_beta );
- COPY( analyse.intra );
COPY( analyse.inter );
+ COPY( analyse.intra );
COPY( analyse.i_direct_mv_pred );
/* Scratch buffer prevents me_range from being increased for esa/tesa */
if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
@@ -844,7 +1106,10 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
if( h->pps->b_transform_8x8_mode )
COPY( analyse.b_transform_8x8 );
if( h->frames.i_max_ref1 > 1 )
- COPY( b_bframe_pyramid );
+ COPY( i_bframe_pyramid );
+ COPY( i_slice_max_size );
+ COPY( i_slice_max_mbs );
+ COPY( i_slice_count );
#undef COPY
mbcmp_init( h );
@@ -852,6 +1117,14 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
return x264_validate_parameters( h );
}
+/****************************************************************************
+ * x264_encoder_parameters:
+ ****************************************************************************/
+void x264_encoder_parameters( x264_t *h, x264_param_t *param )
+{
+ memcpy( param, &h->thread[h->i_thread_phase]->param, sizeof(x264_param_t) );
+}
+
/* internal usage */
static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
{
@@ -863,11 +1136,57 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
nal->i_payload= 0;
nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
}
-static void x264_nal_end( x264_t *h )
+/* if number of allocated nals is not enough, re-allocate a larger one. */
+static int x264_nal_check_buffer( x264_t *h )
+{
+ if( h->out.i_nal >= h->out.i_nals_allocated )
+ {
+ x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
+ if( !new_out )
+ return -1;
+ memcpy( new_out, h->out.nal, sizeof(x264_nal_t) * (h->out.i_nals_allocated) );
+ x264_free( h->out.nal );
+ h->out.nal = new_out;
+ h->out.i_nals_allocated *= 2;
+ }
+ return 0;
+}
+static int x264_nal_end( x264_t *h )
{
x264_nal_t *nal = &h->out.nal[h->out.i_nal];
nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
h->out.i_nal++;
+
+ return x264_nal_check_buffer( h );
+}
+
+static int x264_encoder_encapsulate_nals( x264_t *h )
+{
+ int nal_size = 0, i;
+ for( i = 0; i < h->out.i_nal; i++ )
+ nal_size += h->out.nal[i].i_payload;
+
+ /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
+ if( h->nal_buffer_size < nal_size * 3/2 + h->out.i_nal * 4 )
+ {
+ uint8_t *buf = x264_malloc( nal_size * 2 + h->out.i_nal * 4 );
+ if( !buf )
+ return -1;
+ x264_free( h->nal_buffer );
+ h->nal_buffer = buf;
+ }
+
+ uint8_t *nal_buffer = h->nal_buffer;
+
+ for( i = 0; i < h->out.i_nal; i++ )
+ {
+ int size = x264_nal_encode( nal_buffer, h->param.b_annexb, &h->out.nal[i] );
+ h->out.nal[i].i_payload = size;
+ h->out.nal[i].p_payload = nal_buffer;
+ nal_buffer += size;
+ }
+
+ return nal_buffer - h->nal_buffer;
}
/****************************************************************************
@@ -875,44 +1194,175 @@ static void x264_nal_end( x264_t *h )
****************************************************************************/
int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
{
+ int frame_size = 0;
/* init bitstream context */
h->out.i_nal = 0;
bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
- /* Put SPS and PPS */
- if( h->i_frame == 0 )
- {
- /* identify ourself */
- x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
- x264_sei_version_write( h, &h->out.bs );
- x264_nal_end( h );
+ /* Write SEI, SPS and PPS. */
+ x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+ if( x264_sei_version_write( h, &h->out.bs ) )
+ return -1;
+ if( x264_nal_end( h ) )
+ return -1;
- /* generate sequence parameters */
- x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
- x264_sps_write( &h->out.bs, h->sps );
- x264_nal_end( h );
+ /* generate sequence parameters */
+ x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+ x264_sps_write( &h->out.bs, h->sps );
+ if( x264_nal_end( h ) )
+ return -1;
+
+ /* generate picture parameters */
+ x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+ x264_pps_write( &h->out.bs, h->pps );
+ if( x264_nal_end( h ) )
+ return -1;
+
+ frame_size = x264_encoder_encapsulate_nals( h );
- /* generate picture parameters */
- x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
- x264_pps_write( &h->out.bs, h->pps );
- x264_nal_end( h );
- }
/* now set output*/
*pi_nal = h->out.i_nal;
*pp_nal = &h->out.nal[0];
h->out.i_nal = 0;
- return 0;
+ return frame_size;
}
-static inline void x264_reference_build_list( x264_t *h, int i_poc )
+/* Check to see whether we have chosen a reference list ordering different
+ * from the standard's default. */
+static inline void x264_reference_check_reorder( x264_t *h )
+{
+ int i;
+ for( i = 0; i < h->i_ref0 - 1; i++ )
+ /* P and B-frames use different default orders. */
+ if( h->sh.i_type == SLICE_TYPE_P ? h->fref0[i]->i_frame_num < h->fref0[i+1]->i_frame_num
+ : h->fref0[i]->i_poc < h->fref0[i+1]->i_poc )
+ {
+ h->b_ref_reorder[0] = 1;
+ break;
+ }
+}
+
+/* return -1 on failure, else return the index of the new reference frame */
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w )
+{
+ int i = h->i_ref0;
+ int j;
+ x264_frame_t *newframe;
+ if( i <= 1 ) /* empty list, definitely can't duplicate frame */
+ return -1;
+
+ /* Find a place to insert the duplicate in the reference list. */
+ for( j = 0; j < i; j++ )
+ if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
+ {
+ /* found a place, after j, make sure there is not already a duplicate there */
+ if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
+ break;
+ }
+
+ if( j == i ) /* No room in the reference list for the duplicate. */
+ return -1;
+ j++;
+
+ newframe = x264_frame_pop_blank_unused( h );
+
+ //FIXME: probably don't need to copy everything
+ *newframe = *h->fref0[i_ref];
+ newframe->i_reference_count = 1;
+ newframe->orig = h->fref0[i_ref];
+ newframe->b_duplicate = 1;
+ memcpy( h->fenc->weight[j], w, sizeof(h->fenc->weight[i]) );
+
+ /* shift the frames to make space for the dupe. */
+ h->b_ref_reorder[0] = 1;
+ if( h->i_ref0 < 16 )
+ ++h->i_ref0;
+ h->fref0[15] = NULL;
+ x264_frame_unshift( &h->fref0[j], newframe );
+
+ return j;
+}
+
+static void x264_weighted_pred_init( x264_t *h )
{
+ int i_ref;
int i;
- int b_ok;
+
+ /* for now no analysis and set all weights to nothing */
+ for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+ h->fenc->weighted[i_ref] = h->fref0[i_ref]->filtered[0];
+
+ // FIXME: This only supports weighting of one reference frame
+ // and duplicates of that frame.
+ h->fenc->i_lines_weighted = 0;
+
+ for( i_ref = 0; i_ref < (h->i_ref0 << h->sh.b_mbaff); i_ref++ )
+ for( i = 0; i < 3; i++ )
+ h->sh.weight[i_ref][i].weightfn = NULL;
+
+
+ if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 )
+ return;
+
+ int i_padv = PADV << h->param.b_interlaced;
+ int denom = -1;
+ int weightluma = 0;
+ int buffer_next = 0;
+ int j;
+ //FIXME: when chroma support is added, move this into loop
+ h->sh.weight[0][1].weightfn = h->sh.weight[0][2].weightfn = NULL;
+ h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
+ for( j = 0; j < h->i_ref0; j++ )
+ {
+ if( h->fenc->weight[j][0].weightfn )
+ {
+ h->sh.weight[j][0] = h->fenc->weight[j][0];
+ // if weight is useless, don't write it to stream
+ if( h->sh.weight[j][0].i_scale == 1<<h->sh.weight[j][0].i_denom && h->sh.weight[j][0].i_offset == 0 )
+ h->sh.weight[j][0].weightfn = NULL;
+ else
+ {
+ if( !weightluma )
+ {
+ weightluma = 1;
+ h->sh.weight[0][0].i_denom = denom = h->sh.weight[j][0].i_denom;
+ assert( x264_clip3( denom, 0, 7 ) == denom );
+ }
+ assert( h->sh.weight[j][0].i_denom == denom );
+ assert( x264_clip3( h->sh.weight[j][0].i_scale, 0, 127 ) == h->sh.weight[j][0].i_scale );
+ assert( x264_clip3( h->sh.weight[j][0].i_offset, -128, 127 ) == h->sh.weight[j][0].i_offset );
+ h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] +
+ h->fenc->i_stride[0] * i_padv + PADH;
+ }
+ }
+
+ //scale full resolution frame
+ if( h->sh.weight[j][0].weightfn && h->param.i_threads == 1 )
+ {
+ uint8_t *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
+ uint8_t *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+ int stride = h->fenc->i_stride[0];
+ int width = h->fenc->i_width[0] + PADH*2;
+ int height = h->fenc->i_lines[0] + i_padv*2;
+ x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
+ h->fenc->i_lines_weighted = height;
+ }
+ }
+ if( !weightluma )
+ h->sh.weight[0][0].i_denom = 0;
+}
+
+static inline void x264_reference_build_list( x264_t *h, int i_poc )
+{
+ int i, b_ok;
/* build ref list 0/1 */
- h->i_ref0 = 0;
- h->i_ref1 = 0;
+ h->mb.pic.i_fref[0] = h->i_ref0 = 0;
+ h->mb.pic.i_fref[1] = h->i_ref1 = 0;
+ if( h->sh.i_type == SLICE_TYPE_I )
+ return;
+
for( i = 0; h->frames.reference[i]; i++ )
{
if( h->frames.reference[i]->i_poc < i_poc )
@@ -939,6 +1389,15 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
}
}
} while( !b_ok );
+
+ if( h->sh.i_mmco_remove_from_end )
+ for( i = h->i_ref0-1; i >= h->i_ref0 - h->sh.i_mmco_remove_from_end; i-- )
+ {
+ int diff = h->i_frame_num - h->fref0[i]->i_frame_num;
+ h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref0[i]->i_poc;
+ h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
+ }
+
/* Order ref1 from lower to higher poc (bubble sort) for B-frame */
do
{
@@ -954,23 +1413,57 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
}
} while( !b_ok );
- /* In the standard, a P-frame's ref list is sorted by frame_num.
- * We use POC, but check whether explicit reordering is needed */
- h->b_ref_reorder[0] =
- h->b_ref_reorder[1] = 0;
- if( h->sh.i_type == SLICE_TYPE_P )
+ x264_reference_check_reorder( h );
+
+ h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
+ h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
+ h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
+
+ /* add duplicates */
+ if( h->fenc->i_type == X264_TYPE_P )
{
- for( i = 0; i < h->i_ref0 - 1; i++ )
- if( h->fref0[i]->i_frame_num < h->fref0[i+1]->i_frame_num )
+ int idx = -1;
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ {
+ x264_weight_t w[3];
+ w[1].weightfn = w[2].weightfn = NULL;
+ if( h->param.rc.b_stat_read )
+ x264_ratecontrol_set_weights( h, h->fenc );
+
+ if( !h->fenc->weight[0][0].weightfn )
{
- h->b_ref_reorder[0] = 1;
- break;
+ h->fenc->weight[0][0].i_denom = 0;
+ SET_WEIGHT( w[0], 1, 1, 0, -1 );
+ idx = x264_weighted_reference_duplicate( h, 0, w );
}
+ else
+ {
+ if( h->fenc->weight[0][0].i_scale == 1<<h->fenc->weight[0][0].i_denom )
+ {
+ SET_WEIGHT( h->fenc->weight[0][0], 1, 1, 0, h->fenc->weight[0][0].i_offset );
+ }
+ x264_weighted_reference_duplicate( h, 0, weight_none );
+ if( h->fenc->weight[0][0].i_offset > -128 )
+ {
+ w[0] = h->fenc->weight[0][0];
+ w[0].i_offset--;
+ h->mc.weight_cache( h, &w[0] );
+ idx = x264_weighted_reference_duplicate( h, 0, w );
+ }
+ }
+ }
+ else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
+ {
+ //weighted offset=-1
+ x264_weight_t w[3];
+ SET_WEIGHT( w[0], 1, 1, 0, -1 );
+ h->fenc->weight[0][0].i_denom = 0;
+ w[1].weightfn = w[2].weightfn = NULL;
+ idx = x264_weighted_reference_duplicate( h, 0, w );
+ }
+ h->mb.ref_blind_dupe = idx;
}
- h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
- h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
- h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
assert( h->i_ref0 + h->i_ref1 <= 16 );
h->mb.pic.i_fref[0] = h->i_ref0;
h->mb.pic.i_fref[1] = h->i_ref1;
@@ -990,7 +1483,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
if( min_y < 0 )
return;
- if( !b_end )
+ if( !b_end && !h->param.b_sliced_threads )
{
int i, j;
for( j=0; j<=h->sh.b_mbaff; j++ )
@@ -1019,10 +1512,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
}
}
- if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref )
- {
+ if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
- }
min_y = X264_MAX( min_y*16-8, 0 );
max_y = b_end ? h->param.i_height : mb_y*16-8;
@@ -1052,39 +1543,35 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
}
}
-static inline void x264_reference_update( x264_t *h )
+static inline int x264_reference_update( x264_t *h )
{
- int i;
-
- if( h->fdec->i_frame >= 0 )
- h->i_frame++;
-
+ int i, j;
if( !h->fdec->b_kept_as_ref )
{
- if( h->param.i_threads > 1 )
+ if( h->i_thread_frames > 1 )
{
x264_frame_push_unused( h, h->fdec );
- h->fdec = x264_frame_pop_unused( h );
+ h->fdec = x264_frame_pop_unused( h, 1 );
+ if( !h->fdec )
+ return -1;
}
- return;
- }
-
- /* move lowres copy of the image to the ref frame */
- for( i = 0; i < 4; i++)
- {
- XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] );
- XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] );
+ return 0;
}
- /* adaptive B decision needs a pointer, since it can't use the ref lists */
- if( h->sh.i_type != SLICE_TYPE_B )
- h->frames.last_nonb = h->fdec;
+ /* apply mmco from previous frame. */
+ for( i = 0; i < h->sh.i_mmco_command_count; i++ )
+ for( j = 0; h->frames.reference[j]; j++ )
+ if( h->frames.reference[j]->i_poc == h->sh.mmco[i].i_poc )
+ x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[j] ) );
/* move frame in the buffer */
x264_frame_push( h->frames.reference, h->fdec );
- if( h->frames.reference[h->frames.i_max_dpb] )
+ if( h->frames.reference[h->sps->i_num_ref_frames] )
x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
- h->fdec = x264_frame_pop_unused( h );
+ h->fdec = x264_frame_pop_unused( h, 1 );
+ if( !h->fdec )
+ return -1;
+ return 0;
}
static inline void x264_reference_reset( x264_t *h )
@@ -1095,6 +1582,41 @@ static inline void x264_reference_reset( x264_t *h )
h->fenc->i_poc = 0;
}
+static inline void x264_reference_hierarchy_reset( x264_t *h )
+{
+ int i, ref;
+ int b_hasdelayframe = 0;
+ if( !h->param.i_bframe_pyramid )
+ return;
+
+ /* look for delay frames -- chain must only contain frames that are disposable */
+ for( i = 0; h->frames.current[i] && IS_DISPOSABLE( h->frames.current[i]->i_type ); i++ )
+ b_hasdelayframe |= h->frames.current[i]->i_coded
+ != h->frames.current[i]->i_frame + h->sps->vui.i_num_reorder_frames;
+
+ if( h->param.i_bframe_pyramid != X264_B_PYRAMID_STRICT && !b_hasdelayframe )
+ return;
+
+ /* Remove last BREF. There will never be old BREFs in the
+ * dpb during a BREF decode when pyramid == STRICT */
+ for( ref = 0; h->frames.reference[ref]; ref++ )
+ {
+ if( h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT
+ && h->frames.reference[ref]->i_type == X264_TYPE_BREF )
+ {
+ int diff = h->i_frame_num - h->frames.reference[ref]->i_frame_num;
+ h->sh.mmco[h->sh.i_mmco_command_count].i_difference_of_pic_nums = diff;
+ h->sh.mmco[h->sh.i_mmco_command_count++].i_poc = h->frames.reference[ref]->i_poc;
+ x264_frame_push_unused( h, x264_frame_pop( h->frames.reference ) );
+ h->b_ref_reorder[0] = 1;
+ break;
+ }
+ }
+
+ /* Prepare to room in the dpb for the delayed display time of the later b-frame's */
+ h->sh.i_mmco_remove_from_end = X264_MAX( ref + 2 - h->frames.i_max_dpb, 0 );
+}
+
static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
{
/* ------------------------ Create slice header ----------------------- */
@@ -1120,7 +1642,7 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
if( h->sps->i_poc_type == 0 )
{
h->sh.i_poc_lsb = h->fdec->i_poc & ( (1 << h->sps->i_log2_max_poc_lsb) - 1 );
- h->sh.i_delta_poc_bottom = 0; /* XXX won't work for field */
+ h->sh.i_delta_poc_bottom = 0;
}
else if( h->sps->i_poc_type == 1 )
{
@@ -1134,19 +1656,24 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
x264_macroblock_slice_init( h );
}
-static void x264_slice_write( x264_t *h )
+static int x264_slice_write( x264_t *h )
{
int i_skip;
int mb_xy, i_mb_x, i_mb_y;
- int i, i_list, i_ref;
-
- /* init stats */
- memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
+ int i, i_list, i_ref, i_skip_bak = 0; /* Shut up GCC. */
+ bs_t bs_bak;
+ x264_cabac_t cabac_bak;
+ uint8_t cabac_prevbyte_bak = 0; /* Shut up GCC. */
+ /* Assume no more than 3 bytes of NALU escaping. */
+ int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-3-NALU_OVERHEAD)*8 : INT_MAX;
+ int starting_bits = bs_pos(&h->out.bs);
+ bs_realign( &h->out.bs );
/* Slice */
x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
/* Slice header */
+ x264_macroblock_thread_init( h );
x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
if( h->param.b_cabac )
{
@@ -1164,26 +1691,41 @@ static void x264_slice_write( x264_t *h )
i_mb_x = h->sh.i_first_mb % h->sps->i_mb_width;
i_skip = 0;
- while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) < h->sh.i_last_mb )
+ while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) <= h->sh.i_last_mb )
{
int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
+ if( h->param.i_slice_max_size > 0 )
+ {
+ /* We don't need the contexts because flushing the CABAC encoder has no context
+ * dependency and macroblocks are only re-encoded in the case where a slice is
+ * ended (and thus the content of all contexts are thrown away). */
+ if( h->param.b_cabac )
+ {
+ memcpy( &cabac_bak, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
+ /* x264's CABAC writer modifies the previous byte during carry, so it has to be
+ * backed up. */
+ cabac_prevbyte_bak = h->cabac.p[-1];
+ }
+ else
+ {
+ bs_bak = h->out.bs;
+ i_skip_bak = i_skip;
+ }
+ }
- if( i_mb_x == 0 )
+ if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
x264_fdec_filter_row( h, i_mb_y );
/* load cache */
x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
- /* analyse parameters
- * Slice I: choose I_4x4 or I_16x16 mode
- * Slice P: choose between using P mode or intra (4x4 or 16x16)
- * */
x264_macroblock_analyse( h );
/* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
x264_macroblock_encode( h );
- x264_bitstream_check_buffer( h );
+ if( x264_bitstream_check_buffer( h ) )
+ return -1;
if( h->param.b_cabac )
{
@@ -1210,11 +1752,42 @@ static void x264_slice_write( x264_t *h )
bs_write_ue( &h->out.bs, i_skip ); /* skip run */
i_skip = 0;
}
- x264_macroblock_write_cavlc( h, &h->out.bs );
+ x264_macroblock_write_cavlc( h );
+ }
+ }
+
+ int total_bits = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
+ int mb_size = total_bits - mb_spos;
+
+ /* We'll just re-encode this last macroblock if we go over the max slice size. */
+ if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb )
+ {
+ if( mb_xy != h->sh.i_first_mb )
+ {
+ if( h->param.b_cabac )
+ {
+ memcpy( &h->cabac, &cabac_bak, offsetof(x264_cabac_t, f8_bits_encoded) );
+ h->cabac.p[-1] = cabac_prevbyte_bak;
+ }
+ else
+ {
+ h->out.bs = bs_bak;
+ i_skip = i_skip_bak;
+ }
+ h->mb.b_reencode_mb = 1;
+ h->sh.i_last_mb = mb_xy-1;
+ break;
+ }
+ else
+ {
+ h->sh.i_last_mb = mb_xy;
+ h->mb.b_reencode_mb = 0;
}
}
+ else
+ h->mb.b_reencode_mb = 0;
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
if( h->param.b_visualize )
x264_visualize_mb( h );
#endif
@@ -1224,13 +1797,14 @@ static void x264_slice_write( x264_t *h )
/* accumulate mb stats */
h->stat.frame.i_mb_count[h->mb.i_type]++;
- if( !IS_SKIP(h->mb.i_type) && !IS_INTRA(h->mb.i_type) && !IS_DIRECT(h->mb.i_type) )
+
+ if( !IS_INTRA(h->mb.i_type) && !IS_SKIP(h->mb.i_type) && !IS_DIRECT(h->mb.i_type) )
{
if( h->mb.i_partition != D_8x8 )
- h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
- else
- for( i = 0; i < 4; i++ )
- h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
+ h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
+ else
+ for( i = 0; i < 4; i++ )
+ h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
if( h->param.i_frame_reference > 1 )
for( i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
for( i = 0; i < 4; i++ )
@@ -1240,22 +1814,37 @@ static void x264_slice_write( x264_t *h )
h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
}
}
- if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma )
- {
- int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
- + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
- int b_intra = IS_INTRA(h->mb.i_type);
- h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
- h->stat.frame.i_mb_cbp[!b_intra + 2] += h->mb.i_cbp_chroma >= 1;
- h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma == 2;
- }
- if( h->mb.i_cbp_luma && !IS_INTRA(h->mb.i_type) )
+
+ if( h->param.i_log_level >= X264_LOG_INFO )
{
- h->stat.frame.i_mb_count_8x8dct[0] ++;
- h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
+ if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma )
+ {
+ int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
+ + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
+ int b_intra = IS_INTRA(h->mb.i_type);
+ h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
+ h->stat.frame.i_mb_cbp[!b_intra + 2] += h->mb.i_cbp_chroma >= 1;
+ h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma == 2;
+ }
+ if( h->mb.i_cbp_luma && !IS_INTRA(h->mb.i_type) )
+ {
+ h->stat.frame.i_mb_count_8x8dct[0] ++;
+ h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
+ }
+ if( IS_INTRA(h->mb.i_type) && h->mb.i_type != I_PCM )
+ {
+ if( h->mb.i_type == I_16x16 )
+ h->stat.frame.i_mb_pred_mode[0][h->mb.i_intra16x16_pred_mode]++;
+ else if( h->mb.i_type == I_8x8 )
+ for( i = 0; i < 16; i += 4 )
+ h->stat.frame.i_mb_pred_mode[1][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
+ else //if( h->mb.i_type == I_4x4 )
+ for( i = 0; i < 16; i++ )
+ h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
+ }
}
- x264_ratecontrol_mb( h, bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac) - mb_spos );
+ x264_ratecontrol_mb( h, mb_size );
if( h->sh.b_mbaff )
{
@@ -1264,7 +1853,7 @@ static void x264_slice_write( x264_t *h )
}
else
i_mb_x++;
- if(i_mb_x == h->sps->i_mb_width)
+ if( i_mb_x == h->sps->i_mb_width )
{
i_mb_y++;
i_mb_x = 0;
@@ -1282,26 +1871,31 @@ static void x264_slice_write( x264_t *h )
bs_write_ue( &h->out.bs, i_skip ); /* last skip run */
/* rbsp_slice_trailing_bits */
bs_rbsp_trailing( &h->out.bs );
+ bs_flush( &h->out.bs );
}
+ if( x264_nal_end( h ) )
+ return -1;
- x264_nal_end( h );
-
- x264_fdec_filter_row( h, h->sps->i_mb_height );
+ if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
+ {
+ h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
+ + (h->out.i_nal*NALU_OVERHEAD * 8)
+ - h->stat.frame.i_tex_bits
+ - h->stat.frame.i_mv_bits;
+ if( !h->param.b_sliced_threads )
+ x264_fdec_filter_row( h, h->sps->i_mb_height );
+ }
- /* Compute misc bits */
- h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
- + NALU_OVERHEAD * 8
- - h->stat.frame.i_tex_bits
- - h->stat.frame.i_mv_bits;
+ return 0;
}
static void x264_thread_sync_context( x264_t *dst, x264_t *src )
{
- x264_frame_t **f;
if( dst == src )
return;
// reference counting
+ x264_frame_t **f;
for( f = src->frames.reference; *f; f++ )
(*f)->i_reference_count++;
for( f = dst->frames.reference; *f; f++ )
@@ -1311,6 +1905,7 @@ static void x264_thread_sync_context( x264_t *dst, x264_t *src )
// copy everything except the per-thread pointers and the constants.
memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
+ dst->param = src->param;
dst->stat = src->stat;
}
@@ -1318,12 +1913,15 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
{
if( dst == src )
return;
- memcpy( &dst->stat.i_slice_count, &src->stat.i_slice_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
+ memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
}
-static int x264_slices_write( x264_t *h )
+static void *x264_slices_write( x264_t *h )
{
- int i_frame_size;
+ int i_slice_num = 0;
+ int last_thread_mb = h->sh.i_last_mb;
+ if( h->param.i_sync_lookahead )
+ x264_lower_thread_priority( 10 );
#ifdef HAVE_MMX
/* Misalign mask has to be set separately for each thread. */
@@ -1331,15 +1929,34 @@ static int x264_slices_write( x264_t *h )
x264_cpu_mask_misalign_sse();
#endif
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
if( h->param.b_visualize )
- x264_visualize_init( h );
+ if( x264_visualize_init( h ) )
+ return (void *)-1;
#endif
- x264_stack_align( x264_slice_write, h );
- i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
+ /* init stats */
+ memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
+ h->mb.b_reencode_mb = 0;
+ while( h->sh.i_first_mb <= last_thread_mb )
+ {
+ h->sh.i_last_mb = last_thread_mb;
+ if( h->param.i_slice_max_mbs )
+ h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
+ else if( h->param.i_slice_count && !h->param.b_sliced_threads )
+ {
+ int height = h->sps->i_mb_height >> h->param.b_interlaced;
+ int width = h->sps->i_mb_width << h->param.b_interlaced;
+ i_slice_num++;
+ h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
+ }
+ h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
+ if( x264_stack_align( x264_slice_write, h ) )
+ return (void *)-1;
+ h->sh.i_first_mb = h->sh.i_last_mb + 1;
+ }
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
if( h->param.b_visualize )
{
x264_visualize_show( h );
@@ -1347,7 +1964,69 @@ static int x264_slices_write( x264_t *h )
}
#endif
- h->out.i_frame_size = i_frame_size;
+ return (void *)0;
+}
+
+static int x264_threaded_slices_write( x264_t *h )
+{
+ int i, j;
+ void *ret = NULL;
+ /* set first/last mb and sync contexts */
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ x264_t *t = h->thread[i];
+ if( i )
+ {
+ t->param = h->param;
+ memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) );
+ }
+ int height = h->sps->i_mb_height >> h->param.b_interlaced;
+ t->i_threadslice_start = ((height * i + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
+ t->i_threadslice_end = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
+ t->sh.i_first_mb = t->i_threadslice_start * h->sps->i_mb_width;
+ t->sh.i_last_mb = t->i_threadslice_end * h->sps->i_mb_width - 1;
+ }
+
+ x264_analyse_weight_frame( h, h->sps->i_mb_height*16 + 16 );
+
+ x264_threads_distribute_ratecontrol( h );
+
+ /* dispatch */
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
+ return -1;
+ h->thread[i]->b_thread_active = 1;
+ }
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ x264_pthread_join( h->thread[i]->thread_handle, &ret );
+ h->thread[i]->b_thread_active = 0;
+ if( (intptr_t)ret )
+ return (intptr_t)ret;
+ }
+
+ /* deblocking and hpel filtering */
+ for( i = 0; i <= h->sps->i_mb_height; i++ )
+ x264_fdec_filter_row( h, i );
+
+ for( i = 1; i < h->param.i_threads; i++ )
+ {
+ x264_t *t = h->thread[i];
+ for( j = 0; j < t->out.i_nal; j++ )
+ {
+ h->out.nal[h->out.i_nal] = t->out.nal[j];
+ h->out.i_nal++;
+ x264_nal_check_buffer( h );
+ }
+ /* All entries in stat.frame are ints except for ssd/ssim,
+ * which are only calculated in the main thread. */
+ for( j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
+ ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
+ }
+
+ x264_threads_merge_ratecontrol( h );
+
return 0;
}
@@ -1370,18 +2049,14 @@ int x264_encoder_encode( x264_t *h,
x264_picture_t *pic_out )
{
x264_t *thread_current, *thread_prev, *thread_oldest;
- int i_nal_type;
- int i_nal_ref_idc;
-
- int i_global_qp;
+ int i_nal_type, i_nal_ref_idc, i_global_qp, i;
- if( h->param.i_threads > 1)
+ if( h->i_thread_frames > 1 )
{
- int i = ++h->i_thread_phase;
- int t = h->param.i_threads;
- thread_current = h->thread[ i%t ];
- thread_prev = h->thread[ (i-1)%t ];
- thread_oldest = h->thread[ (i+1)%t ];
+ thread_prev = h->thread[ h->i_thread_phase ];
+ h->i_thread_phase = (h->i_thread_phase + 1) % h->i_thread_frames;
+ thread_current = h->thread[ h->i_thread_phase ];
+ thread_oldest = h->thread[ (h->i_thread_phase + 1) % h->i_thread_frames ];
x264_thread_sync_context( thread_current, thread_prev );
x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest );
h = thread_current;
@@ -1394,7 +2069,8 @@ int x264_encoder_encode( x264_t *h,
}
// ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
- x264_reference_update( h );
+ if( x264_reference_update( h ) )
+ return -1;
h->fdec->i_lines_completed = -1;
/* no data out */
@@ -1405,7 +2081,9 @@ int x264_encoder_encode( x264_t *h,
if( pic_in != NULL )
{
/* 1: Copy the picture to a frame and move it to a buffer */
- x264_frame_t *fenc = x264_frame_pop_unused( h );
+ x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
+ if( !fenc )
+ return -1;
if( x264_frame_copy_picture( h, fenc, pic_in ) < 0 )
return -1;
@@ -1416,95 +2094,102 @@ int x264_encoder_encode( x264_t *h,
fenc->i_frame = h->frames.i_input++;
- x264_frame_push( h->frames.next, fenc );
+ if( h->frames.i_bframe_delay && fenc->i_frame == h->frames.i_bframe_delay )
+ h->frames.i_bframe_delay_time = fenc->i_pts;
if( h->frames.b_have_lowres )
+ {
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ x264_weight_plane_analyse( h, fenc );
x264_frame_init_lowres( h, fenc );
+ }
- if( h->param.rc.i_aq_mode )
+ if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
+ {
+ if( x264_macroblock_tree_read( h, fenc ) )
+ return -1;
+ }
+ else if( h->param.rc.i_aq_mode )
x264_adaptive_quant_frame( h, fenc );
- if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
+ /* 2: Place the frame into the queue for its slice type decision */
+ x264_lookahead_put_frame( h, fenc );
+
+ if( h->frames.i_input <= h->frames.i_delay + 1 - h->i_thread_frames )
{
- /* Nothing yet to encode */
- /* waiting for filling bframe buffer */
+ /* Nothing yet to encode, waiting for filling of buffers */
pic_out->i_type = X264_TYPE_AUTO;
return 0;
}
}
-
- if( h->frames.current[0] == NULL )
+ else
{
- int bframes = 0;
- /* 2: Select frame types */
- if( h->frames.next[0] == NULL )
- {
- x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
- return 0;
- }
+ /* signal kills for lookahead thread */
+ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+ h->lookahead->b_exit_thread = 1;
+ x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+ x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+ }
- x264_stack_align( x264_slicetype_decide, h );
+ h->i_frame++;
+ /* 3: The picture is analyzed in the lookahead */
+ if( !h->frames.current[0] )
+ x264_lookahead_get_frames( h );
- /* 3: move some B-frames and 1 non-B to encode queue */
- while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
- bframes++;
- x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) );
- /* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */
- if( h->param.b_bframe_pyramid && bframes > 1 )
- {
- x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] );
- mid->i_type = X264_TYPE_BREF;
- x264_frame_push( h->frames.current, mid );
- bframes--;
- }
- while( bframes-- )
- x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
- }
+ if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
+ return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
/* ------------------- Get frame to be encoded ------------------------- */
/* 4: get picture to encode */
h->fenc = x264_frame_shift( h->frames.current );
- if( h->fenc == NULL )
+ if( h->fenc->param )
{
- /* Nothing yet to encode (ex: waiting for I/P with B frames) */
- /* waiting for filling bframe buffer */
- pic_out->i_type = X264_TYPE_AUTO;
- return 0;
+ x264_encoder_reconfig( h, h->fenc->param );
+ if( h->fenc->param->param_free )
+ h->fenc->param->param_free( h->fenc->param );
}
- if( h->fenc->i_type == X264_TYPE_IDR )
+ if( h->fenc->b_keyframe )
{
- h->frames.i_last_idr = h->fenc->i_frame;
+ h->frames.i_last_keyframe = h->fenc->i_frame;
+ if( h->fenc->i_type == X264_TYPE_IDR )
+ h->i_frame_num = 0;
}
+ h->sh.i_mmco_command_count =
+ h->sh.i_mmco_remove_from_end = 0;
+ h->b_ref_reorder[0] =
+ h->b_ref_reorder[1] = 0;
/* ------------------- Setup frame context ----------------------------- */
/* 5: Init data dependent of frame type */
if( h->fenc->i_type == X264_TYPE_IDR )
{
/* reset ref pictures */
- x264_reference_reset( h );
-
i_nal_type = NAL_SLICE_IDR;
i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
h->sh.i_type = SLICE_TYPE_I;
+ x264_reference_reset( h );
}
else if( h->fenc->i_type == X264_TYPE_I )
{
i_nal_type = NAL_SLICE;
i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
h->sh.i_type = SLICE_TYPE_I;
+ x264_reference_hierarchy_reset( h );
}
else if( h->fenc->i_type == X264_TYPE_P )
{
i_nal_type = NAL_SLICE;
i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
h->sh.i_type = SLICE_TYPE_P;
+ x264_reference_hierarchy_reset( h );
}
else if( h->fenc->i_type == X264_TYPE_BREF )
{
i_nal_type = NAL_SLICE;
- i_nal_ref_idc = NAL_PRIORITY_HIGH; /* maybe add MMCO to forget it? -> low */
+ i_nal_ref_idc = h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT ? NAL_PRIORITY_LOW : NAL_PRIORITY_HIGH;
h->sh.i_type = SLICE_TYPE_B;
+ x264_reference_hierarchy_reset( h );
}
else /* B frame */
{
@@ -1514,7 +2199,7 @@ int x264_encoder_encode( x264_t *h,
}
h->fdec->i_poc =
- h->fenc->i_poc = 2 * (h->fenc->i_frame - h->frames.i_last_idr);
+ h->fenc->i_poc = 2 * (h->fenc->i_frame - h->frames.i_last_keyframe);
h->fdec->i_type = h->fenc->i_type;
h->fdec->i_frame = h->fenc->i_frame;
h->fenc->b_kept_as_ref =
@@ -1526,99 +2211,174 @@ int x264_encoder_encode( x264_t *h,
/* build ref list 0/1 */
x264_reference_build_list( h, h->fdec->i_poc );
- /* Init the rate control */
- x264_ratecontrol_start( h, h->fenc->i_qpplus1 );
- i_global_qp = x264_ratecontrol_qp( h );
-
- pic_out->i_qpplus1 =
- h->fdec->i_qpplus1 = i_global_qp + 1;
-
- if( h->sh.i_type == SLICE_TYPE_B )
- x264_macroblock_bipred_init( h );
-
- /* ------------------------ Create slice header ----------------------- */
- x264_slice_init( h, i_nal_type, i_global_qp );
-
- if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
- h->i_frame_num++;
-
/* ---------------------- Write the bitstream -------------------------- */
/* Init bitstream context */
- h->out.i_nal = 0;
- bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+ if( h->param.b_sliced_threads )
+ {
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
+ h->thread[i]->out.i_nal = 0;
+ }
+ }
+ else
+ {
+ bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+ h->out.i_nal = 0;
+ }
- if(h->param.b_aud){
+ if( h->param.b_aud )
+ {
int pic_type;
- if(h->sh.i_type == SLICE_TYPE_I)
+ if( h->sh.i_type == SLICE_TYPE_I )
pic_type = 0;
- else if(h->sh.i_type == SLICE_TYPE_P)
+ else if( h->sh.i_type == SLICE_TYPE_P )
pic_type = 1;
- else if(h->sh.i_type == SLICE_TYPE_B)
+ else if( h->sh.i_type == SLICE_TYPE_B )
pic_type = 2;
else
pic_type = 7;
- x264_nal_start(h, NAL_AUD, NAL_PRIORITY_DISPOSABLE);
- bs_write(&h->out.bs, 3, pic_type);
- bs_rbsp_trailing(&h->out.bs);
- x264_nal_end(h);
+ x264_nal_start( h, NAL_AUD, NAL_PRIORITY_DISPOSABLE );
+ bs_write( &h->out.bs, 3, pic_type );
+ bs_rbsp_trailing( &h->out.bs );
+ if( x264_nal_end( h ) )
+ return -1;
}
h->i_nal_type = i_nal_type;
h->i_nal_ref_idc = i_nal_ref_idc;
+ int overhead = NALU_OVERHEAD;
+
+ if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
+ {
+ int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
+ float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
+ if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
+ h->fdec->f_pir_position = 0;
+ else
+ {
+ if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
+ {
+ h->fdec->f_pir_position = 0;
+ h->fenc->b_keyframe = 1;
+ }
+ else
+ h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
+ }
+ h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
+ h->fdec->f_pir_position += increment * pocdiff;
+ h->fdec->i_pir_end_col = X264_MIN( h->fdec->f_pir_position+0.5, h->sps->i_mb_width-1 );
+ }
+
/* Write SPS and PPS */
- if( i_nal_type == NAL_SLICE_IDR && h->param.b_repeat_headers )
+ if( h->fenc->b_keyframe )
{
- if( h->fenc->i_frame == 0 )
+ if( h->param.b_repeat_headers )
+ {
+ if( h->fenc->i_frame == 0 )
+ {
+ /* identify ourself */
+ x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+ if( x264_sei_version_write( h, &h->out.bs ) )
+ return -1;
+ if( x264_nal_end( h ) )
+ return -1;
+ overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
+ }
+
+ /* generate sequence parameters */
+ x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+ x264_sps_write( &h->out.bs, h->sps );
+ if( x264_nal_end( h ) )
+ return -1;
+ overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
+
+ /* generate picture parameters */
+ x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+ x264_pps_write( &h->out.bs, h->pps );
+ if( x264_nal_end( h ) )
+ return -1;
+ overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
+ }
+
+ if( h->fenc->i_type != X264_TYPE_IDR )
{
- /* identify ourself */
x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
- x264_sei_version_write( h, &h->out.bs );
+ x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
x264_nal_end( h );
+ overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
}
+ }
- /* generate sequence parameters */
- x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
- x264_sps_write( &h->out.bs, h->sps );
- x264_nal_end( h );
+ /* Init the rate control */
+ /* FIXME: Include slice header bit cost. */
+ x264_ratecontrol_start( h, h->fenc->i_qpplus1, overhead*8 );
+ i_global_qp = x264_ratecontrol_qp( h );
+
+ pic_out->i_qpplus1 =
+ h->fdec->i_qpplus1 = i_global_qp + 1;
- /* generate picture parameters */
- x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
- x264_pps_write( &h->out.bs, h->pps );
- x264_nal_end( h );
+ if( h->param.rc.b_stat_read && h->sh.i_type != SLICE_TYPE_I )
+ {
+ x264_reference_build_list_optimal( h );
+ x264_reference_check_reorder( h );
}
+ if( h->sh.i_type == SLICE_TYPE_B )
+ x264_macroblock_bipred_init( h );
+
+ /*------------------------- Weights -------------------------------------*/
+ x264_weighted_pred_init( h );
+
+ /* ------------------------ Create slice header ----------------------- */
+ x264_slice_init( h, i_nal_type, i_global_qp );
+
+ if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
+ h->i_frame_num++;
+
/* Write frame */
- if( h->param.i_threads > 1 )
+ h->i_threadslice_start = 0;
+ h->i_threadslice_end = h->sps->i_mb_height;
+ if( h->i_thread_frames > 1 )
{
- x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h );
+ if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
+ return -1;
h->b_thread_active = 1;
}
+ else if( h->param.b_sliced_threads )
+ {
+ if( x264_threaded_slices_write( h ) )
+ return -1;
+ }
else
- x264_slices_write( h );
+ if( (intptr_t)x264_slices_write( h ) )
+ return -1;
- x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
- return 0;
+ return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
}
-static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
- x264_nal_t **pp_nal, int *pi_nal,
- x264_picture_t *pic_out )
+static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+ x264_nal_t **pp_nal, int *pi_nal,
+ x264_picture_t *pic_out )
{
- int i, i_list;
+ int i, j, i_list, frame_size;
char psz_message[80];
if( h->b_thread_active )
{
- x264_pthread_join( h->thread_handle, NULL );
+ void *ret = NULL;
+ x264_pthread_join( h->thread_handle, &ret );
h->b_thread_active = 0;
+ if( (intptr_t)ret )
+ return (intptr_t)ret;
}
if( !h->out.i_nal )
{
pic_out->i_type = X264_TYPE_AUTO;
- return;
+ return 0;
}
x264_frame_push_unused( thread_current, h->fenc );
@@ -1626,6 +2386,9 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
/* End bitstream, set output */
*pi_nal = h->out.i_nal;
*pp_nal = h->out.nal;
+
+ frame_size = x264_encoder_encapsulate_nals( h );
+
h->out.i_nal = 0;
/* Set output picture properties */
@@ -1635,7 +2398,32 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
pic_out->i_type = X264_TYPE_P;
else
pic_out->i_type = X264_TYPE_B;
- pic_out->i_pts = h->fenc->i_pts;
+
+ pic_out->b_keyframe = h->fenc->b_keyframe;
+
+ pic_out->i_pts = h->fenc->i_pts *= h->i_dts_compress_multiplier;
+ if( h->frames.i_bframe_delay )
+ {
+ int64_t *i_prev_dts = thread_current->frames.i_prev_dts;
+ if( h->i_frame <= h->frames.i_bframe_delay )
+ {
+ if( h->i_dts_compress_multiplier == 1 )
+ pic_out->i_dts = h->fenc->i_reordered_pts - h->frames.i_bframe_delay_time;
+ else
+ {
+ /* DTS compression */
+ if( h->i_frame == 1 )
+ thread_current->frames.i_init_delta = h->fenc->i_reordered_pts * h->i_dts_compress_multiplier;
+ pic_out->i_dts = h->i_frame * thread_current->frames.i_init_delta / h->i_dts_compress_multiplier;
+ }
+ }
+ else
+ pic_out->i_dts = i_prev_dts[ (h->i_frame - h->frames.i_bframe_delay) % h->frames.i_bframe_delay ];
+ i_prev_dts[ h->i_frame % h->frames.i_bframe_delay ] = h->fenc->i_reordered_pts * h->i_dts_compress_multiplier;
+ }
+ else
+ pic_out->i_dts = h->fenc->i_reordered_pts;
+ assert( pic_out->i_pts >= pic_out->i_dts );
pic_out->img.i_plane = h->fdec->i_plane;
for(i = 0; i < 3; i++)
@@ -1648,10 +2436,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
/* update rc */
x264_emms();
- x264_ratecontrol_end( h, h->out.i_frame_size * 8 );
-
- /* restore CPU state (before using float again) */
- x264_emms();
+ if( x264_ratecontrol_end( h, frame_size * 8 ) < 0 )
+ return -1;
x264_noise_reduction_update( thread_current );
@@ -1659,9 +2445,9 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
x264_thread_sync_stat( h, h->thread[0] );
/* Slice stat */
- h->stat.i_slice_count[h->sh.i_type]++;
- h->stat.i_slice_size[h->sh.i_type] += h->out.i_frame_size + NALU_OVERHEAD;
- h->stat.f_slice_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
+ h->stat.i_frame_count[h->sh.i_type]++;
+ h->stat.i_frame_size[h->sh.i_type] += frame_size;
+ h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
for( i = 0; i < X264_MBTYPE_MAX; i++ )
h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
@@ -1671,12 +2457,27 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i];
for( i = 0; i < 6; i++ )
h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i];
+ for( i = 0; i < 3; i++ )
+ for( j = 0; j < 13; j++ )
+ h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];
if( h->sh.i_type != SLICE_TYPE_I )
for( i_list = 0; i_list < 2; i_list++ )
for( i = 0; i < 32; i++ )
h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
if( h->sh.i_type == SLICE_TYPE_P )
+ {
h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ {
+ for( i = 0; i < 3; i++ )
+ for( j = 0; j < h->i_ref0; j++ )
+ if( h->sh.weight[0][i].i_denom != 0 )
+ {
+ h->stat.i_wpred[i]++;
+ break;
+ }
+ }
+ }
if( h->sh.i_type == SLICE_TYPE_B )
{
h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
@@ -1734,7 +2535,7 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
h->stat.frame.i_mb_count_i,
h->stat.frame.i_mb_count_p,
h->stat.frame.i_mb_count_skip,
- h->out.i_frame_size,
+ frame_size,
psz_message );
// keep stats all in one place
@@ -1760,8 +2561,19 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
}
#endif
+ /* Remove duplicates, must be done near the end as breaks h->fref0 array
+ * by freeing some of its pointers. */
+ for( i = 0; i < h->i_ref0; i++ )
+ if( h->fref0[i] && h->fref0[i]->b_duplicate )
+ {
+ x264_frame_push_blank_unused( h, h->fref0[i] );
+ h->fref0[i] = 0;
+ }
+
if( h->param.psz_dump_yuv )
x264_frame_dump( h );
+
+ return frame_size;
}
static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_pcm, char *intra )
@@ -1788,16 +2600,32 @@ void x264_encoder_close ( x264_t *h )
|| h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
|| h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
- for( i=0; i<h->param.i_threads; i++ )
+ x264_lookahead_delete( h );
+
+ if( h->param.i_threads > 1 )
{
// don't strictly have to wait for the other threads, but it's simpler than canceling them
- if( h->thread[i]->b_thread_active )
+ for( i = 0; i < h->param.i_threads; i++ )
+ if( h->thread[i]->b_thread_active )
+ x264_pthread_join( h->thread[i]->thread_handle, NULL );
+ if( h->i_thread_frames > 1 )
{
- x264_pthread_join( h->thread[i]->thread_handle, NULL );
- assert( h->thread[i]->fenc->i_reference_count == 1 );
- x264_frame_delete( h->thread[i]->fenc );
+ for( i = 0; i < h->i_thread_frames; i++ )
+ {
+ if( h->thread[i]->b_thread_active )
+ {
+ assert( h->thread[i]->fenc->i_reference_count == 1 );
+ x264_frame_delete( h->thread[i]->fenc );
+ }
+ }
+
+ x264_t *thread_prev = h->thread[h->i_thread_phase];
+ x264_thread_sync_ratecontrol( h, thread_prev, h );
+ x264_thread_sync_ratecontrol( thread_prev, thread_prev, h );
+ h->i_frame = thread_prev->i_frame + 1 - h->i_thread_frames;
}
}
+ h->i_frame++;
/* Slices used and PSNR */
for( i=0; i<5; i++ )
@@ -1806,17 +2634,17 @@ void x264_encoder_close ( x264_t *h )
static const char *slice_name[] = { "P", "B", "I", "SP", "SI" };
int i_slice = slice_order[i];
- if( h->stat.i_slice_count[i_slice] > 0 )
+ if( h->stat.i_frame_count[i_slice] > 0 )
{
- const int i_count = h->stat.i_slice_count[i_slice];
+ const int i_count = h->stat.i_frame_count[i_slice];
if( h->param.analyse.b_psnr )
{
x264_log( h, X264_LOG_INFO,
- "slice %s:%-5d Avg QP:%5.2f size:%6.0f PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
+ "frame %s:%-5d Avg QP:%5.2f size:%6.0f PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
slice_name[i_slice],
i_count,
- h->stat.f_slice_qp[i_slice] / i_count,
- (double)h->stat.i_slice_size[i_slice] / i_count,
+ h->stat.f_frame_qp[i_slice] / i_count,
+ (double)h->stat.i_frame_size[i_slice] / i_count,
h->stat.f_psnr_mean_y[i_slice] / i_count, h->stat.f_psnr_mean_u[i_slice] / i_count, h->stat.f_psnr_mean_v[i_slice] / i_count,
h->stat.f_psnr_average[i_slice] / i_count,
x264_psnr( h->stat.i_ssd_global[i_slice], i_count * i_yuv_size ) );
@@ -1824,15 +2652,15 @@ void x264_encoder_close ( x264_t *h )
else
{
x264_log( h, X264_LOG_INFO,
- "slice %s:%-5d Avg QP:%5.2f size:%6.0f\n",
+ "frame %s:%-5d Avg QP:%5.2f size:%6.0f\n",
slice_name[i_slice],
i_count,
- h->stat.f_slice_qp[i_slice] / i_count,
- (double)h->stat.i_slice_size[i_slice] / i_count );
+ h->stat.f_frame_qp[i_slice] / i_count,
+ (double)h->stat.i_frame_size[i_slice] / i_count );
}
}
}
- if( h->param.i_bframe && h->stat.i_slice_count[SLICE_TYPE_P] )
+ if( h->param.i_bframe && h->stat.i_frame_count[SLICE_TYPE_P] )
{
char *p = buf;
int den = 0;
@@ -1852,17 +2680,17 @@ void x264_encoder_close ( x264_t *h )
}
/* MB types used */
- if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+ if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 )
{
int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I];
- double i_count = h->stat.i_slice_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
+ double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
x264_log( h, X264_LOG_INFO, "mb I %s\n", buf );
}
- if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+ if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
{
int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P];
- double i_count = h->stat.i_slice_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
+ double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P];
x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
x264_log( h, X264_LOG_INFO,
@@ -1875,10 +2703,10 @@ void x264_encoder_close ( x264_t *h )
i_mb_size[PIXEL_4x4] / (i_count*4),
i_mb_count[P_SKIP] / i_count );
}
- if( h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+ if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
{
int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B];
- double i_count = h->stat.i_slice_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
+ double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
double i_mb_list_count;
int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B];
int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
@@ -1911,7 +2739,7 @@ void x264_encoder_close ( x264_t *h )
x264_ratecontrol_summary( h );
- if( h->stat.i_slice_count[SLICE_TYPE_I] + h->stat.i_slice_count[SLICE_TYPE_P] + h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+ if( h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
{
#define SUM3(p) (p[SLICE_TYPE_I] + p[SLICE_TYPE_P] + p[SLICE_TYPE_B])
#define SUM3b(p,o) (p[SLICE_TYPE_I][o] + p[SLICE_TYPE_P][o] + p[SLICE_TYPE_B][o])
@@ -1919,35 +2747,76 @@ void x264_encoder_close ( x264_t *h )
int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 )
+ SUM3b( h->stat.i_mb_count, I_16x16 );
int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM);
- const int i_count = h->stat.i_slice_count[SLICE_TYPE_I] +
- h->stat.i_slice_count[SLICE_TYPE_P] +
- h->stat.i_slice_count[SLICE_TYPE_B];
+ const int i_count = h->stat.i_frame_count[SLICE_TYPE_I] +
+ h->stat.i_frame_count[SLICE_TYPE_P] +
+ h->stat.i_frame_count[SLICE_TYPE_B];
int64_t i_mb_count = i_count * h->mb.i_mb_count;
float fps = (float) h->param.i_fps_num / h->param.i_fps_den;
- float f_bitrate = fps * SUM3(h->stat.i_slice_size) / i_count / 125;
+ float f_bitrate = fps * SUM3(h->stat.i_frame_size) / i_count / 125;
if( h->pps->b_transform_8x8_mode )
{
- x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%% inter:%.1f%%\n",
- 100. * i_i8x8 / i_intra,
- 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
+ buf[0] = 0;
+ if( h->stat.i_mb_count_8x8dct[0] )
+ sprintf( buf, " inter:%.1f%%", 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
+ x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
}
if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
- && h->stat.i_slice_count[SLICE_TYPE_B] )
+ && h->stat.i_frame_count[SLICE_TYPE_B] )
{
- x264_log( h, X264_LOG_INFO, "direct mvs spatial:%.1f%% temporal:%.1f%%\n",
- h->stat.i_direct_frames[1] * 100. / h->stat.i_slice_count[SLICE_TYPE_B],
- h->stat.i_direct_frames[0] * 100. / h->stat.i_slice_count[SLICE_TYPE_B] );
+ x264_log( h, X264_LOG_INFO, "direct mvs spatial:%.1f%% temporal:%.1f%%\n",
+ h->stat.i_direct_frames[1] * 100. / h->stat.i_frame_count[SLICE_TYPE_B],
+ h->stat.i_direct_frames[0] * 100. / h->stat.i_frame_count[SLICE_TYPE_B] );
}
- x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra:%.1f%% %.1f%% %.1f%% inter:%.1f%% %.1f%% %.1f%%\n",
+ buf[0] = 0;
+ if( i_mb_count != i_all_intra )
+ sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%",
+ h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
+ h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra) ),
+ h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
+ x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra: %.1f%% %.1f%% %.1f%%%s\n",
h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4),
h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra ),
- h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra ),
- h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
- h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra) ),
- h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
+ h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra ), buf );
+
+ int64_t fixed_pred_modes[3][9] = {{0}};
+ int64_t sum_pred_modes[3] = {0};
+ for( i = 0; i <= I_PRED_16x16_DC_128; i++ )
+ {
+ fixed_pred_modes[0][x264_mb_pred_mode16x16_fix[i]] += h->stat.i_mb_pred_mode[0][i];
+ sum_pred_modes[0] += h->stat.i_mb_pred_mode[0][i];
+ }
+ if( sum_pred_modes[0] )
+ x264_log( h, X264_LOG_INFO, "i16 v,h,dc,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
+ fixed_pred_modes[0][0] * 100.0 / sum_pred_modes[0],
+ fixed_pred_modes[0][1] * 100.0 / sum_pred_modes[0],
+ fixed_pred_modes[0][2] * 100.0 / sum_pred_modes[0],
+ fixed_pred_modes[0][3] * 100.0 / sum_pred_modes[0] );
+ for( i = 1; i <= 2; i++ )
+ {
+ for( j = 0; j <= I_PRED_8x8_DC_128; j++ )
+ {
+ fixed_pred_modes[i][x264_mb_pred_mode4x4_fix(j)] += h->stat.i_mb_pred_mode[i][j];
+ sum_pred_modes[i] += h->stat.i_mb_pred_mode[i][j];
+ }
+ if( sum_pred_modes[i] )
+ x264_log( h, X264_LOG_INFO, "i%d v,h,dc,ddl,ddr,vr,hd,vl,hu: %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", (3-i)*4,
+ fixed_pred_modes[i][0] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][1] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][2] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][3] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][4] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][5] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][6] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][7] * 100.0 / sum_pred_modes[i],
+ fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] );
+ }
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
+ x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%%\n",
+ h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
for( i_list = 0; i_list < 2; i_list++ )
{
@@ -1967,7 +2836,7 @@ void x264_encoder_close ( x264_t *h )
continue;
for( i = 0; i <= i_max; i++ )
p += sprintf( p, " %4.1f%%", 100. * h->stat.i_mb_count_ref[i_slice][i_list][i] / i_den );
- x264_log( h, X264_LOG_INFO, "ref %c L%d %s\n", "PB"[i_slice], i_list, buf );
+ x264_log( h, X264_LOG_INFO, "ref %c L%d:%s\n", "PB"[i_slice], i_list, buf );
}
}
@@ -1989,7 +2858,7 @@ void x264_encoder_close ( x264_t *h )
f_bitrate );
}
else
- x264_log( h, X264_LOG_INFO, "kb/s:%.1f\n", f_bitrate );
+ x264_log( h, X264_LOG_INFO, "kb/s:%.2f\n", f_bitrate );
}
/* rc */
@@ -2002,26 +2871,17 @@ void x264_encoder_close ( x264_t *h )
free( h->param.rc.psz_stat_in );
x264_cqm_delete( h );
+ x264_free( h->nal_buffer );
+ x264_analyse_free_costs( h );
- if( h->param.i_threads > 1)
- h = h->thread[ h->i_thread_phase % h->param.i_threads ];
+ if( h->i_thread_frames > 1)
+ h = h->thread[h->i_thread_phase];
/* frames */
- for( i = 0; h->frames.current[i]; i++ )
- {
- assert( h->frames.current[i]->i_reference_count == 1 );
- x264_frame_delete( h->frames.current[i] );
- }
- for( i = 0; h->frames.next[i]; i++ )
- {
- assert( h->frames.next[i]->i_reference_count == 1 );
- x264_frame_delete( h->frames.next[i] );
- }
- for( i = 0; h->frames.unused[i]; i++ )
- {
- assert( h->frames.unused[i]->i_reference_count == 0 );
- x264_frame_delete( h->frames.unused[i] );
- }
+ x264_frame_delete_list( h->frames.unused[0] );
+ x264_frame_delete_list( h->frames.unused[1] );
+ x264_frame_delete_list( h->frames.current );
+ x264_frame_delete_list( h->frames.blank_unused );
h = h->thread[0];
@@ -2029,21 +2889,50 @@ void x264_encoder_close ( x264_t *h )
{
x264_frame_t **frame;
- for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+ if( !h->param.b_sliced_threads || i == 0 )
{
+ for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+ {
+ assert( (*frame)->i_reference_count > 0 );
+ (*frame)->i_reference_count--;
+ if( (*frame)->i_reference_count == 0 )
+ x264_frame_delete( *frame );
+ }
+ frame = &h->thread[i]->fdec;
assert( (*frame)->i_reference_count > 0 );
(*frame)->i_reference_count--;
if( (*frame)->i_reference_count == 0 )
x264_frame_delete( *frame );
+ x264_macroblock_cache_end( h->thread[i] );
}
- frame = &h->thread[i]->fdec;
- assert( (*frame)->i_reference_count > 0 );
- (*frame)->i_reference_count--;
- if( (*frame)->i_reference_count == 0 )
- x264_frame_delete( *frame );
-
- x264_macroblock_cache_end( h->thread[i] );
+ x264_free( h->thread[i]->scratch_buffer );
x264_free( h->thread[i]->out.p_bitstream );
+ x264_free( h->thread[i]->out.nal);
x264_free( h->thread[i] );
}
}
+
+/****************************************************************************
+ * x264_encoder_delayed_frames:
+ ****************************************************************************/
+int x264_encoder_delayed_frames( x264_t *h )
+{
+ int delayed_frames = 0;
+ int i;
+ if( h->i_thread_frames > 1 )
+ {
+ for( i=0; i<h->i_thread_frames; i++ )
+ delayed_frames += h->thread[i]->b_thread_active;
+ h = h->thread[h->i_thread_phase];
+ }
+ for( i=0; h->frames.current[i]; i++ )
+ delayed_frames++;
+ x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+ x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+ delayed_frames += h->lookahead->ifbuf.i_size + h->lookahead->next.i_size + h->lookahead->ofbuf.i_size;
+ x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+ x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+ x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+ return delayed_frames;
+}
diff --git a/encoder/lookahead.c b/encoder/lookahead.c
new file mode 100644
index 0000000..b66eedc
--- /dev/null
+++ b/encoder/lookahead.c
@@ -0,0 +1,244 @@
+/*****************************************************************************
+ * lookahead.c: Lookahead slicetype decisions for x264
+ *****************************************************************************
+ * Lookahead.c and associated modifications:
+ * Copyright (C) 2008 Avail Media
+ *
+ * Authors: Michael Kazmier <mkazmier at availmedia.com>
+ * Alex Giladi <agiladi at availmedia.com>
+ * Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+/* LOOKAHEAD (threaded and non-threaded mode)
+ *
+ * Lookahead types:
+ * [1] Slice type / scene cut;
+ *
+ * In non-threaded mode, we run the existing slicetype decision code as it was.
+ * In threaded mode, we run in a separate thread, that lives between the calls
+ * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
+ * the number of frames specified in rc_lookahead. Recommended setting is
+ * # of bframes + # of threads.
+ */
+#include "common/common.h"
+#include "common/cpu.h"
+#include "analyse.h"
+
+static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
+{
+ int i = count;
+ while( i-- )
+ {
+ assert( dst->i_size < dst->i_max_size );
+ assert( src->i_size );
+ dst->list[ dst->i_size++ ] = x264_frame_shift( src->list );
+ src->i_size--;
+ }
+ if( count )
+ {
+ x264_pthread_cond_broadcast( &dst->cv_fill );
+ x264_pthread_cond_broadcast( &src->cv_empty );
+ }
+}
+
+static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
+{
+ if( h->lookahead->last_nonb )
+ x264_frame_push_unused( h, h->lookahead->last_nonb );
+ h->lookahead->last_nonb = new_nonb;
+ new_nonb->i_reference_count++;
+}
+
+#ifdef HAVE_PTHREAD
+static void x264_lookahead_slicetype_decide( x264_t *h )
+{
+ x264_stack_align( x264_slicetype_decide, h );
+
+ x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
+
+ x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+ while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size )
+ x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex );
+
+ x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+ x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 );
+ x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+
+ /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
+ x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+ x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+}
+
+static void x264_lookahead_thread( x264_t *h )
+{
+ int shift;
+#ifdef HAVE_MMX
+ if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+ x264_cpu_mask_misalign_sse();
+#endif
+ while( !h->lookahead->b_exit_thread )
+ {
+ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+ x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+ shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
+ x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
+ x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+ if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length )
+ {
+ while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
+ x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
+ x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+ }
+ else
+ {
+ x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+ x264_lookahead_slicetype_decide( h );
+ }
+ } /* end of input frames */
+ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+ x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+ x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
+ x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+ x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+ while( h->lookahead->next.i_size )
+ x264_lookahead_slicetype_decide( h );
+ x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+ h->lookahead->b_thread_active = 0;
+ x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill );
+ x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+}
+#endif
+
+int x264_lookahead_init( x264_t *h, int i_slicetype_length )
+{
+ x264_lookahead_t *look;
+ CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) );
+ int i;
+ for( i = 0; i < h->param.i_threads; i++ )
+ h->thread[i]->lookahead = look;
+
+ look->i_last_keyframe = - h->param.i_keyint_max;
+ look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
+ && !h->param.rc.b_stat_read;
+ look->i_slicetype_length = i_slicetype_length;
+
+ /* init frame lists */
+ if( x264_synch_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
+ x264_synch_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
+ x264_synch_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
+ goto fail;
+
+ if( !h->param.i_sync_lookahead )
+ return 0;
+
+ x264_t *look_h = h->thread[h->param.i_threads];
+ *look_h = *h;
+ if( x264_macroblock_cache_init( look_h ) )
+ goto fail;
+
+ if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
+ goto fail;
+ look->b_thread_active = 1;
+
+ return 0;
+fail:
+ x264_free( look );
+ return -1;
+}
+
+void x264_lookahead_delete( x264_t *h )
+{
+ if( h->param.i_sync_lookahead )
+ {
+ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+ h->lookahead->b_exit_thread = 1;
+ x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+ x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+ x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
+ x264_macroblock_cache_end( h->thread[h->param.i_threads] );
+ x264_free( h->thread[h->param.i_threads]->scratch_buffer );
+ x264_free( h->thread[h->param.i_threads] );
+ }
+ x264_synch_frame_list_delete( &h->lookahead->ifbuf );
+ x264_synch_frame_list_delete( &h->lookahead->next );
+ if( h->lookahead->last_nonb )
+ x264_frame_push_unused( h, h->lookahead->last_nonb );
+ x264_synch_frame_list_delete( &h->lookahead->ofbuf );
+ x264_free( h->lookahead );
+}
+
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
+{
+ if( h->param.i_sync_lookahead )
+ x264_synch_frame_list_push( &h->lookahead->ifbuf, frame );
+ else
+ x264_synch_frame_list_push( &h->lookahead->next, frame );
+}
+
+int x264_lookahead_is_empty( x264_t *h )
+{
+ int b_empty;
+ x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+ x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+ b_empty = !h->lookahead->next.i_size && !h->lookahead->ofbuf.i_size;
+ x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+ x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+ return b_empty;
+}
+
+static void x264_lookahead_encoder_shift( x264_t *h )
+{
+ if( !h->lookahead->ofbuf.i_size )
+ return;
+ int i_frames = h->lookahead->ofbuf.list[0]->i_bframes + 1;
+ while( i_frames-- )
+ {
+ x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) );
+ h->lookahead->ofbuf.i_size--;
+ }
+ x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty );
+}
+
+void x264_lookahead_get_frames( x264_t *h )
+{
+ if( h->param.i_sync_lookahead )
+ { /* We have a lookahead thread, so get frames from there */
+ x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+ while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active )
+ x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
+ x264_lookahead_encoder_shift( h );
+ x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+ }
+ else
+ { /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */
+
+ if( h->frames.current[0] || !h->lookahead->next.i_size )
+ return;
+
+ x264_stack_align( x264_slicetype_decide, h );
+ x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
+ x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 );
+
+ /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
+ x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+ x264_lookahead_encoder_shift( h );
+ }
+}
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 1f346e0..e4edb8a 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -27,8 +27,8 @@
/* These chroma DC functions don't have assembly versions and are only used here. */
-#define ZIG(i,y,x) level[i] = dct[x][y];
-static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
+#define ZIG(i,y,x) level[i] = dct[x*2+y];
+static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
{
ZIG(0,0,0)
ZIG(1,0,1)
@@ -38,11 +38,11 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
#undef ZIG
#define IDCT_DEQUANT_START \
- int d0 = dct[0][0] + dct[0][1]; \
- int d1 = dct[1][0] + dct[1][1]; \
- int d2 = dct[0][0] - dct[0][1]; \
- int d3 = dct[1][0] - dct[1][1]; \
- int dmf = dequant_mf[i_qp%6][0][0]; \
+ int d0 = dct[0] + dct[1]; \
+ int d1 = dct[2] + dct[3]; \
+ int d2 = dct[0] - dct[1]; \
+ int d3 = dct[2] - dct[3]; \
+ int dmf = dequant_mf[i_qp%6][0]; \
int qbits = i_qp/6 - 5; \
if( qbits > 0 ) \
{ \
@@ -50,50 +50,62 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
qbits = 0; \
}
-static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
IDCT_DEQUANT_START
- dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
- dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
- dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
- dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
+ dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
+ dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
+ dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
+ dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
}
-static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
{
IDCT_DEQUANT_START
- dct[0][0] = (d0 + d1) * dmf >> -qbits;
- dct[0][1] = (d0 - d1) * dmf >> -qbits;
- dct[1][0] = (d2 + d3) * dmf >> -qbits;
- dct[1][1] = (d2 - d3) * dmf >> -qbits;
+ out[0] = (d0 + d1) * dmf >> -qbits;
+ out[1] = (d0 - d1) * dmf >> -qbits;
+ out[2] = (d2 + d3) * dmf >> -qbits;
+ out[3] = (d2 - d3) * dmf >> -qbits;
}
-static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
+static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
{
- int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
- int d1 = dct4x4[2][0][0] + dct4x4[3][0][0];
- int d2 = dct4x4[0][0][0] - dct4x4[1][0][0];
- int d3 = dct4x4[2][0][0] - dct4x4[3][0][0];
- d[0][0] = d0 + d1;
- d[1][0] = d2 + d3;
- d[0][1] = d0 - d1;
- d[1][1] = d2 - d3;
- dct4x4[0][0][0] = 0;
- dct4x4[1][0][0] = 0;
- dct4x4[2][0][0] = 0;
- dct4x4[3][0][0] = 0;
+ int d0 = dct4x4[0][0] + dct4x4[1][0];
+ int d1 = dct4x4[2][0] + dct4x4[3][0];
+ int d2 = dct4x4[0][0] - dct4x4[1][0];
+ int d3 = dct4x4[2][0] - dct4x4[3][0];
+ d[0] = d0 + d1;
+ d[2] = d2 + d3;
+ d[1] = d0 - d1;
+ d[3] = d2 - d3;
+ dct4x4[0][0] = 0;
+ dct4x4[1][0] = 0;
+ dct4x4[2][0] = 0;
+ dct4x4[3][0] = 0;
}
-static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+static inline void dct2x2dc_dconly( int16_t d[4] )
+{
+ int d0 = d[0] + d[1];
+ int d1 = d[2] + d[3];
+ int d2 = d[0] - d[1];
+ int d3 = d[2] - d[3];
+ d[0] = d0 + d1;
+ d[2] = d2 + d3;
+ d[1] = d0 - d1;
+ d[3] = d2 - d3;
+}
+
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[16], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
if( h->mb.b_trellis )
- return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
+ return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, 0, idx );
else
return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}
-static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[64], int i_qp, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
if( h->mb.b_trellis )
@@ -118,12 +130,11 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
if( h->mb.b_lossless )
{
- h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
- nz = array_non_zero( h->dct.luma4x4[idx] );
+ nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
h->mb.i_cbp_luma |= nz<<(idx>>2);
return;
@@ -144,8 +155,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
#define STORE_8x8_NNZ(idx,nz)\
{\
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
+ M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
+ M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
+}
+
+#define CLEAR_16x16_NNZ \
+{\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
}
void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
@@ -155,12 +174,11 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
- DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
if( h->mb.b_lossless )
{
- h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
- nz = array_non_zero( h->dct.luma8x8[idx] );
+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
STORE_8x8_NNZ(idx,nz);
h->mb.i_cbp_luma |= nz<<idx;
return;
@@ -186,8 +204,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
- DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
int i, nz;
int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
@@ -199,10 +217,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
int oe = block_idx_xy_fenc[i];
int od = block_idx_xy_fdec[i];
- h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
- dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0];
- h->dct.luma4x4[i][0] = 0;
- nz = array_non_zero( h->dct.luma4x4[i] );
+ nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
h->mb.i_cbp_luma |= nz;
}
@@ -217,8 +232,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
- dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
- dct4x4[i][0][0] = 0;
+ dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
+ dct4x4[i][0] = 0;
/* quant/scan/dequant */
nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
@@ -237,15 +252,12 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
if( decimate_score < 6 )
{
h->mb.i_cbp_luma = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ CLEAR_16x16_NNZ
}
h->dctf.dct4x4dc( dct_dc4x4 );
if( h->mb.b_trellis )
- nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
+ nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
else
nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
@@ -259,7 +271,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
if( h->mb.i_cbp_luma )
for( i = 0; i < 16; i++ )
- dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
+ dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
}
/* put pixels to fdec */
@@ -269,12 +281,120 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
+static inline int idct_dequant_round_2x2_dc( int16_t ref[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
+{
+ int16_t out[4];
+ idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
+ return ((ref[0] ^ (out[0]+32))
+ | (ref[1] ^ (out[1]+32))
+ | (ref[2] ^ (out[2]+32))
+ | (ref[3] ^ (out[3]+32))) >> 6;
+}
+
+/* Round down coefficients losslessly in DC-only chroma blocks.
+ * Unlike luma blocks, this can't be done with a lookup table or
+ * other shortcut technique because of the interdependencies
+ * between the coefficients due to the chroma DC transform. */
+static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, int16_t dct2x2[4] )
+{
+ int16_t dct2x2_orig[4];
+ int coeff;
+ int nz = 0;
+
+ /* If the QP is too high, there's no benefit to rounding optimization. */
+ if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
+ return 1;
+
+ idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ dct2x2_orig[0] += 32;
+ dct2x2_orig[1] += 32;
+ dct2x2_orig[2] += 32;
+ dct2x2_orig[3] += 32;
+
+ /* If the DC coefficients already round to zero, terminate early. */
+ if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
+ return 0;
+
+ /* Start with the highest frequency coefficient... is this the best option? */
+ for( coeff = 3; coeff >= 0; coeff-- )
+ {
+ int sign = dct2x2[coeff] < 0 ? -1 : 1;
+ int level = dct2x2[coeff];
+
+ if( !level )
+ continue;
+
+ while( level )
+ {
+ dct2x2[coeff] = level - sign;
+ if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+ break;
+ level -= sign;
+ }
+
+ nz |= level;
+ dct2x2[coeff] = level;
+ }
+
+ return !!nz;
+}
+
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch, nz, nz_dc;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+ ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
h->mb.i_cbp_chroma = 0;
+ /* Early termination: check variance of chroma residual before encoding.
+ * Don't bother trying early termination at low QPs.
+ * Values are experimentally derived. */
+ if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
+ {
+ int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+ int ssd[2];
+ int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+ score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
+ if( score < thresh*4 )
+ {
+ h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[25]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[26]] = 0;
+ for( ch = 0; ch < 2; ch++ )
+ {
+ if( ssd[ch] > thresh )
+ {
+ h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
+ dct2x2dc_dconly( dct2x2 );
+ if( h->mb.b_trellis )
+ nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
+ else
+ nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<
+ 1 );
+
+ if( nz_dc )
+ {
+ if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+ continue;
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
+ h->mb.i_cbp_chroma = 1;
+ }
+ }
+ }
+ return;
+ }
+ }
+
for( ch = 0; ch < 2; ch++ )
{
uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
@@ -282,8 +402,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
int i_decimate_score = 0;
int nz_ac = 0;
- DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
if( h->mb.b_lossless )
{
@@ -291,10 +410,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
- h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
- h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
- h->dct.luma4x4[16+i+ch*4][0] = 0;
- nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
+ nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
h->mb.i_cbp_chroma |= nz;
}
@@ -308,7 +424,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
for( i = 0; i < 4; i++ )
{
if( h->mb.b_trellis )
- nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
+ nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
else
nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
@@ -323,7 +439,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
}
if( h->mb.b_trellis )
- nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+ nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
else
nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
@@ -338,9 +454,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
if( !nz_dc ) /* Whole block is empty */
continue;
+ if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+ {
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
+ continue;
+ }
/* DC-only */
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
}
else
@@ -387,15 +508,25 @@ static void x264_macroblock_encode_pskip( x264_t *h )
{
h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE,
h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
- mvx, mvy, 16, 16 );
+ mvx, mvy, 16, 16, &h->sh.weight[0][0] );
h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
mvx, mvy, 8, 8 );
+ if( h->sh.weight[0][1].weightfn )
+ h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ &h->sh.weight[0][1], 8 );
+
h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
mvx, mvy, 8, 8 );
+
+ if( h->sh.weight[0][2].weightfn )
+ h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ &h->sh.weight[0][2], 8 );
}
x264_macroblock_encode_skip( h );
@@ -529,16 +660,16 @@ void x264_macroblock_encode( x264_t *h )
}
else if( h->mb.i_type == I_8x8 )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
h->mb.b_transform_8x8 = 1;
/* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
/* In RD mode, restore the now-overwritten DCT data. */
if( h->mb.i_skip_intra == 2 )
@@ -565,10 +696,10 @@ void x264_macroblock_encode( x264_t *h )
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
/* In RD mode, restore the now-overwritten DCT data. */
if( h->mb.i_skip_intra == 2 )
@@ -581,7 +712,7 @@ void x264_macroblock_encode( x264_t *h )
if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
+ M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst, i, i_mode );
@@ -606,27 +737,25 @@ void x264_macroblock_encode( x264_t *h )
{
int x = 8*(i8x8&1);
int y = 8*(i8x8>>1);
- h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
- nz = array_non_zero( h->dct.luma8x8[i8x8] );
STORE_8x8_NNZ(i8x8,nz);
h->mb.i_cbp_luma |= nz << i8x8;
}
else
for( i4x4 = 0; i4x4 < 16; i4x4++ )
{
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
+ nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
- nz = array_non_zero( h->dct.luma4x4[i4x4] );
h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
h->mb.i_cbp_luma |= nz << (i4x4>>2);
}
}
else if( h->mb.b_transform_8x8 )
{
- DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[1] += h->mb.b_noise_reduction * 4;
@@ -634,7 +763,7 @@ void x264_macroblock_encode( x264_t *h )
for( idx = 0; idx < 4; idx++ )
{
if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
+ h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
if( nz )
@@ -655,10 +784,7 @@ void x264_macroblock_encode( x264_t *h )
if( i_decimate_mb < 6 && b_decimate )
{
h->mb.i_cbp_luma = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ CLEAR_16x16_NNZ
}
else
{
@@ -677,7 +803,7 @@ void x264_macroblock_encode( x264_t *h )
}
else
{
- DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[0] += h->mb.b_noise_reduction * 16;
@@ -692,7 +818,7 @@ void x264_macroblock_encode( x264_t *h )
idx = i8x8 * 4 + i4x4;
if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
+ h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
@@ -727,10 +853,7 @@ void x264_macroblock_encode( x264_t *h )
if( i_decimate_mb < 6 )
{
h->mb.i_cbp_luma = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ CLEAR_16x16_NNZ
}
else
{
@@ -775,7 +898,7 @@ void x264_macroblock_encode( x264_t *h )
{
if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
!(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
- *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+ M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
&& h->mb.cache.ref[0][x264_scan8[0]] == 0 )
{
h->mb.i_type = P_SKIP;
@@ -796,9 +919,9 @@ void x264_macroblock_encode( x264_t *h )
*****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
- DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
- DECLARE_ALIGNED_16( int16_t dctscan[16] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
+ ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
+ ALIGNED_ARRAY_16( int16_t, dctscan,[16] );
int i_qp = h->mb.i_qp;
int mvp[2];
@@ -816,7 +939,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
/* Motion compensation */
h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE,
h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
- mvp[0], mvp[1], 16, 16 );
+ mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
}
for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
@@ -852,6 +975,11 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
mvp[0], mvp[1], 8, 8 );
+
+ if( h->sh.weight[0][1+ch].weightfn )
+ h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ &h->sh.weight[0][1+ch], 8 );
}
/* there is almost never a termination during chroma, but we can't avoid the check entirely */
@@ -928,15 +1056,15 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
int nnz8x8 = 0;
int ch, nz;
- x264_mb_mc_8x8( h, i8 );
+ if( !h->mb.b_skip_mc )
+ x264_mb_mc_8x8( h, i8 );
if( h->mb.b_lossless )
{
int i4;
if( h->mb.b_transform_8x8 )
{
- h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
- nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
+ nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
STORE_8x8_NNZ(i8,nnz8x8);
}
else
@@ -944,28 +1072,27 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
for( i4 = i8*4; i4 < i8*4+4; i4++ )
{
int nz;
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
+ nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
- nz = array_non_zero( h->dct.luma4x4[i4] );
h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
nnz8x8 |= nz;
}
}
for( ch = 0; ch < 2; ch++ )
{
+ int16_t dc;
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
- h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
- h->dct.luma4x4[16+i8+ch*4][0] = 0;
- h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
+ nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
+ h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
}
}
else
{
if( h->mb.b_transform_8x8 )
{
- DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
if( nnz8x8 )
@@ -991,7 +1118,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
int i4;
int i_decimate_8x8 = 0;
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
{
@@ -1020,15 +1147,15 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
for( ch = 0; ch < 2; ch++ )
{
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
- dct4x4[0][0] = 0;
+ dct4x4[0] = 0;
if( h->mb.b_trellis )
- nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
+ nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
else
nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
@@ -1054,21 +1181,18 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
int i_qp = h->mb.i_qp;
uint8_t *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
uint8_t *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
- const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]];
- const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int nz;
- h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
+ /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
if( h->mb.b_lossless )
{
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
- h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
+ nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
+ h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
}
else
{
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 7b9f08a..25beb18 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -27,7 +27,7 @@
#include "common/macroblock.h"
extern const int x264_lambda2_tab[52];
-extern const int x264_lambda_tab[52];
+extern const uint8_t x264_lambda_tab[52];
void x264_rdo_init( void );
@@ -45,7 +45,7 @@ void x264_predict_lossless_16x16( x264_t *h, int i_mode );
void x264_macroblock_encode ( x264_t *h );
void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
-void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
+void x264_macroblock_write_cavlc ( x264_t *h );
void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
@@ -56,10 +56,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra );
-int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra, int idx );
-int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+ int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma );
+int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+ int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx );
+int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
int i_qp, int b_intra, int idx );
void x264_noise_reduction_update( x264_t *h );
diff --git a/encoder/me.c b/encoder/me.c
index f13e84b..f58a6a8 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -32,7 +32,7 @@
* and refine_* are run only on the winner.
* the subme=8,9 values are much higher because any amount of satd search makes
* up its time by reducing the number of qpel-rd iterations. */
-static const int subpel_iterations[][4] =
+static const uint8_t subpel_iterations[][4] =
{{0,0,0,0},
{1,1,0,0},
{0,1,1,0},
@@ -42,10 +42,11 @@ static const int subpel_iterations[][4] =
{0,0,2,2},
{0,0,2,2},
{0,0,4,10},
+ {0,0,4,10},
{0,0,4,10}};
/* (x-1)%6 */
-static const int mod6m1[8] = {5,0,1,2,3,4,5,0};
+static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
@@ -58,7 +59,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV( mx, my )\
{\
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
- &p_fref[(my)*stride+(mx)], stride )\
+ &p_fref_w[(my)*stride+(mx)], stride )\
+ BITS_MVD(mx,my);\
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}
@@ -66,7 +67,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_HPEL( mx, my ) \
{ \
int stride2 = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
@@ -74,7 +75,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
- uint8_t *pix_base = p_fref + bmx + bmy*stride;\
+ uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
@@ -87,7 +88,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
{\
- uint8_t *pix_base = p_fref + bmx + bmy*stride;\
+ uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
@@ -102,7 +103,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
- uint8_t *pix_base = p_fref + omx + omy*stride;\
+ uint8_t *pix_base = p_fref_w + omx + omy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
@@ -122,9 +123,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
- p_fref + (m0x) + (m0y)*stride,\
- p_fref + (m1x) + (m1y)*stride,\
- p_fref + (m2x) + (m2y)*stride,\
+ p_fref_w + (m0x) + (m0y)*stride,\
+ p_fref_w + (m1x) + (m1y)*stride,\
+ p_fref_w + (m2x) + (m2y)*stride,\
stride, costs );\
costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
costs[1] += p_cost_mvx[(m1x)<<2];\
@@ -180,8 +181,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
uint8_t *p_fenc = m->p_fenc[0];
- uint8_t *p_fref = m->p_fref[0];
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ uint8_t *p_fref_w = m->p_fref_w;
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
int i, j;
int dir;
@@ -194,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
- const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
- const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
@@ -210,7 +211,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
COST_MV_HPEL( bmx, bmy );
for( i = 0; i < i_mvc; i++ )
{
- if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
+ if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
{
int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
@@ -451,8 +452,8 @@ me_hex2:
/* hexagon grid */
omx = bmx; omy = bmy;
- const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
- const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
+ const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
+ const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
i = 1;
do
{
@@ -477,7 +478,7 @@ me_hex2:
else
{
int dir = 0;
- uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride;
+ uint8_t *pix_base = p_fref_w + omx + (omy-4*i)*stride;
int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
@@ -535,7 +536,7 @@ me_hex2:
}
}
} while( ++i <= i_me_range/4 );
- if( bmy <= mv_y_max )
+ if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min )
goto me_hex2;
break;
}
@@ -561,15 +562,14 @@ me_hex2:
* because sum(abs(diff)) >= abs(diff(sum)). */
uint16_t *sums_base = m->integral;
/* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
- * unlike the similar case in ratecontrol.c, this is not a problem because it is not used for any
- * SSE instructions and the only loss is a tiny bit of performance. */
- DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
- DECLARE_ALIGNED_16( int enc_dc[4] );
+ * this is not a problem because it is not used for any SSE instructions. */
+ ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+ ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
- uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+ uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
@@ -587,7 +587,7 @@ me_hex2:
mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
- int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+ int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
+ BITS_MVD( bmx, bmy );
for( my = min_y; my <= max_y; my++ )
{
@@ -599,7 +599,7 @@ me_hex2:
cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
for( i=0; i<xn-2; i+=3 )
{
- uint8_t *ref = p_fref+min_x+my*stride;
+ uint8_t *ref = p_fref_w+min_x+my*stride;
int sads[3];
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( j=0; j<3; j++ )
@@ -609,8 +609,8 @@ me_hex2:
{
COPY1_IF_LT( bsad, sad );
mvsads[nmvsad].sad = sad + ycost;
- mvsads[nmvsad].mx = min_x+xs[i+j];
- mvsads[nmvsad].my = my;
+ mvsads[nmvsad].mv[0] = min_x+xs[i+j];
+ mvsads[nmvsad].mv[1] = my;
nmvsad++;
}
}
@@ -618,14 +618,14 @@ me_hex2:
for( ; i<xn; i++ )
{
int mx = min_x+xs[i];
- int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride )
+ int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
+ cost_fpel_mvx[xs[i]];
if( sad < bsad*sad_thresh>>3 )
{
COPY1_IF_LT( bsad, sad );
mvsads[nmvsad].sad = sad + ycost;
- mvsads[nmvsad].mx = mx;
- mvsads[nmvsad].my = my;
+ mvsads[nmvsad].mv[0] = mx;
+ mvsads[nmvsad].mv[1] = my;
nmvsad++;
}
}
@@ -633,42 +633,47 @@ me_hex2:
}
limit = i_me_range / 2;
- if( nmvsad > limit*2 )
+ sad_thresh = bsad*sad_thresh>>3;
+ while( nmvsad > limit*2 && sad_thresh > bsad )
{
// halve the range if the domain is too large... eh, close enough
- bsad = bsad*(sad_thresh+8)>>4;
- for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+ sad_thresh = (sad_thresh + bsad) >> 1;
+ for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ );
for( j=i; j<nmvsad; j++ )
- if( mvsads[j].sad <= bsad )
+ {
+ uint32_t sad;
+ if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
{
- /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
- else
- mvsads[i++] = mvsads[j];
+ uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
+#ifdef WORDS_BIGENDIAN
+ mvsad >>= 32;
+#endif
+ sad = mvsad;
}
- nmvsad = i;
- }
- if( nmvsad > limit )
- {
- for( i=0; i<limit; i++ )
- {
- int bj = i;
- int bsad = mvsads[bj].sad;
- for( j=i+1; j<nmvsad; j++ )
- COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
- if( bj > i )
+ else
{
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
- else
- XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+ sad = mvsads[j].sad;
+ CP32( mvsads[i].mv, mvsads[j].mv );
+ mvsads[i].sad = sad;
}
+ i += (sad - (sad_thresh+1)) >> 31;
}
- nmvsad = limit;
+ nmvsad = i;
+ }
+ while( nmvsad > limit )
+ {
+ int bi = 0;
+ for( i=1; i<nmvsad; i++ )
+ if( mvsads[i].sad > mvsads[bi].sad )
+ bi = i;
+ nmvsad--;
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+ CP64( &mvsads[bi], &mvsads[nmvsad] );
+ else
+ mvsads[bi] = mvsads[nmvsad];
}
for( i=0; i<nmvsad; i++ )
- COST_MV( mvsads[i].mx, mvsads[i].my );
+ COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] );
}
else
{
@@ -719,8 +724,6 @@ me_hex2:
int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
}
- else if( m->mv[1] > h->mb.mv_max_spel[1] )
- m->mv[1] = h->mb.mv_max_spel[1];
}
#undef COST_MV
@@ -729,16 +732,21 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
- if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
+ if( m->i_pixel <= PIXEL_8x8 )
m->cost -= m->i_ref_cost;
-
+
refine_subpel( h, m, hpel, qpel, NULL, 1 );
}
+void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh )
+{
+ refine_subpel( h, m, 0, X264_MIN( 2, subpel_iterations[h->mb.i_subpel_refine][3] ), p_halfpel_thresh, 0 );
+}
+
#define COST_MV_SAD( mx, my ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
@@ -748,17 +756,23 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
if( b_refine_qpel || (dir^1) != odir ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
- h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
+ if( m->weight[1].weightfn ) \
+ m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \
+ &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \
if( cost < bcost ) \
{ \
- h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
+ if( m->weight[2].weightfn ) \
+ m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \
+ &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
} \
} \
if( cost < bcost ) \
@@ -774,12 +788,13 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
- const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
- const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
+ const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- DECLARE_ALIGNED_16( uint8_t pix[2][32*18] ); // really 17x17, but round up for alignment
+ ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] ); // really 17x17, but round up for alignment
int omx, omy;
int i;
@@ -791,8 +806,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
/* try the subpel component of the predicted mv */
if( hpel_iters && h->mb.i_subpel_refine < 3 )
{
- int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
- int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
+ int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 );
+ int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 );
if( (mx-bmx)|(my-bmy) )
COST_MV_SAD( mx, my );
}
@@ -804,8 +819,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
int costs[4];
int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
uint8_t *src0, *src1, *src2, *src3;
- src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 );
- src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
+ src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
+ src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
src1 = src0 + stride;
src3 = src2 + 1;
h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
@@ -819,9 +834,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
if( !b_refine_qpel )
{
- /* check for mvrange */
- if( bmy > h->mb.mv_max_spel[1] )
- bmy = h->mb.mv_max_spel[1];
bcost = COST_MAX;
COST_MV_SATD( bmx, bmy, -1 );
}
@@ -845,6 +857,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
bdir = -1;
for( i = qpel_iters; i > 0; i-- )
{
+ if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] )
+ break;
odir = bdir;
omx = bmx;
omy = bmy;
@@ -856,88 +870,32 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
break;
}
- /* check for mvrange */
- if( bmy > h->mb.mv_max_spel[1] )
- {
- bmy = h->mb.mv_max_spel[1];
- bcost = COST_MAX;
- COST_MV_SATD( bmx, bmy, -1 );
- }
-
m->cost = bcost;
m->mv[0] = bmx;
m->mv[1] = bmy;
m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];
}
-#define BIME_CACHE( dx, dy ) \
+#define BIME_CACHE( dx, dy, list ) \
{ \
+ x264_me_t *m = m##list;\
int i = 4 + 3*dx + dy; \
- stride0[i] = bw;\
- stride1[i] = bw;\
- src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
- src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
+ int mvx = om##list##x+dx;\
+ int mvy = om##list##y+dy;\
+ stride##list[i] = bw;\
+ src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
+ if( rd )\
+ {\
+ h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+ h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+ }\
}
-#define BIME_CACHE2(a,b) \
- BIME_CACHE(a,b) \
- BIME_CACHE(-(a),-(b))
-
#define SATD_THRESH 17/16
-#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
-if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
-{ \
- int cost; \
- int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
- int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
- visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
- h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
- cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
- + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
- + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
- if( rd ) \
- { \
- if( cost < bcost * SATD_THRESH ) \
- { \
- uint64_t costrd; \
- if( cost < bcost ) \
- bcost = cost; \
- *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \
- *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \
- costrd = x264_rd_cost_part( h, i_lambda2, i8, m0->i_pixel ); \
- if( costrd < bcostrd ) \
- {\
- bcostrd = costrd;\
- bm0x = m0x; \
- bm0y = m0y; \
- bm1x = m1x; \
- bm1y = m1y; \
- }\
- } \
- } \
- else if( cost < bcost ) \
- { \
- bcost = cost; \
- bm0x = m0x; \
- bm0y = m0y; \
- bm1x = m1x; \
- bm1y = m1y; \
- } \
-}
-
-#define CHECK_BIDIR(a,b,c,d) \
- COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d)
-
-#define CHECK_BIDIR2(a,b,c,d) \
- CHECK_BIDIR(a,b,c,d) \
- CHECK_BIDIR(-(a),-(b),-(c),-(d))
-
-#define CHECK_BIDIR8(a,b,c,d) \
- CHECK_BIDIR2(a,b,c,d) \
- CHECK_BIDIR2(b,c,d,a) \
- CHECK_BIDIR2(c,d,a,b) \
- CHECK_BIDIR2(d,a,b,c)
+/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
+ * other than making its iteration count not a compile-time constant. */
+int x264_iter_kludge = 0;
static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
{
@@ -949,15 +907,22 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
- const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
- const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
- const int16_t *p_cost_m1y = m1->p_cost_mv - x264_clip3( m1->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
- DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
- DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
+ const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
+ const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
+ const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
+ ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
+ ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
uint8_t *src0[9];
uint8_t *src1[9];
+ uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
+ uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
+ const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
+ const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
int stride0[9];
int stride1[9];
int bm0x = m0->mv[0], om0x = bm0x;
@@ -966,19 +931,31 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
int bm1y = m1->mv[1], om1y = bm1y;
int bcost = COST_MAX;
int pass = 0;
+ int j;
+ int mc_list0 = 1, mc_list1 = 1;
uint64_t bcostrd = COST_MAX64;
-
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
-
- if( bm0y > h->mb.mv_max_spel[1] - 8 ||
- bm1y > h->mb.mv_max_spel[1] - 8 )
+ ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
+ /* all permutations of an offset in up to 2 of the dimensions */
+ static const int8_t dia4d[33][4] = {
+ {0,0,0,0},
+ {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
+ {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0},
+ {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0},
+ {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1},
+ {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0},
+ {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0},
+ {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1},
+ {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0},
+ };
+
+ if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 ||
+ bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ||
+ bm0x < h->mb.mv_min_spel[0] + 8 || bm1x < h->mb.mv_min_spel[0] + 8 ||
+ bm0x > h->mb.mv_max_spel[0] - 8 || bm1x > h->mb.mv_max_spel[0] - 8 )
return;
- h->mc.memzero_aligned( visited, sizeof(visited) );
-
- BIME_CACHE( 0, 0 );
- CHECK_BIDIR( 0, 0, 0, 0 );
+ h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
for( pass = 0; pass < 8; pass++ )
{
@@ -986,27 +963,57 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
/* doesn't do chroma ME. this probably doesn't matter, as the gains
* from bidir ME are the same with and without chroma ME. */
- BIME_CACHE2( 1, 0 );
- BIME_CACHE2( 0, 1 );
- BIME_CACHE2( 1, 1 );
- BIME_CACHE2( 1,-1 );
+ if( mc_list0 )
+ for( j = x264_iter_kludge; j < 9; j++ )
+ BIME_CACHE( square1[j][0], square1[j][1], 0 );
+
+ if( mc_list1 )
+ for( j = x264_iter_kludge; j < 9; j++ )
+ BIME_CACHE( square1[j][0], square1[j][1], 1 );
- CHECK_BIDIR8( 0, 0, 0, 1 );
- CHECK_BIDIR8( 0, 0, 1, 1 );
- CHECK_BIDIR2( 0, 1, 0, 1 );
- CHECK_BIDIR2( 1, 0, 1, 0 );
- CHECK_BIDIR8( 0, 0,-1, 1 );
- CHECK_BIDIR2( 0,-1, 0, 1 );
- CHECK_BIDIR2(-1, 0, 1, 0 );
+ for( j = !!pass; j < 33; j++ )
+ {
+ int m0x = dia4d[j][0] + om0x;
+ int m0y = dia4d[j][1] + om0y;
+ int m1x = dia4d[j][2] + om1x;
+ int m1y = dia4d[j][3] + om1y;
+ if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) )
+ {
+ int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y);
+ int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y);
+ visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
+ h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight );
+ int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
+ + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
+ if( rd )
+ {
+ if( cost < bcost * SATD_THRESH )
+ {
+ bcost = X264_MIN( cost, bcost );
+ M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
+ M32( cache0_mv2 ) = pack16to32_mask(m0x,m0y);
+ M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
+ M32( cache1_mv2 ) = pack16to32_mask(m1x,m1y);
+ h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+ h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
+ COPY5_IF_LT( bcostrd, costrd, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+ }
+ }
+ else
+ COPY5_IF_LT( bcost, cost, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+ }
+ }
- if( om0x == bm0x && om0y == bm0y && om1x == bm1x && om1y == bm1y )
+ mc_list0 = (om0x-bm0x)|(om0y-bm0y);
+ mc_list1 = (om1x-bm1x)|(om1y-bm1y);
+ if( !mc_list0 && !mc_list1 )
break;
om0x = bm0x;
om0y = bm0y;
om1x = bm1x;
om1y = bm1y;
- BIME_CACHE( 0, 0 );
}
m0->mv[0] = bm0x;
@@ -1022,7 +1029,11 @@ void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_w
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
{
+ /* Motion compensation is done as part of bidir_rd; don't repeat
+ * it in encoding. */
+ h->mb.b_skip_mc = 1;
x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 );
+ h->mb.b_skip_mc = 0;
}
#undef COST_MV_SATD
@@ -1030,9 +1041,8 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
{ \
if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
{ \
- int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
- dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+ dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
+ p_cost_mvx[mx] + p_cost_mvy[my]; \
COPY1_IF_LT( bsatd, dst ); \
} \
@@ -1045,7 +1055,13 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
if( satd <= bsatd * SATD_THRESH ) \
{ \
uint64_t cost; \
- *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
+ M32( cache_mv ) = pack16to32_mask(mx,my); \
+ M32( cache_mv2 ) = pack16to32_mask(mx,my); \
+ if( m->i_pixel <= PIXEL_8x8 )\
+ {\
+ h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
+ h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
+ }\
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
@@ -1057,29 +1073,38 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
- const int16_t *p_cost_mvx, *p_cost_mvy;
- const int bw = x264_pixel_size[m->i_pixel].w>>2;
- const int bh = x264_pixel_size[m->i_pixel].h>>2;
+ const uint16_t *p_cost_mvx, *p_cost_mvy;
+ const int bw = x264_pixel_size[m->i_pixel].w;
+ const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel;
+ const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
- uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
+ uint64_t bcost = COST_MAX64;
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx, omy, pmx, pmy, i, j;
unsigned bsatd;
- int satd = 0;
+ int satd;
int dir = -2;
- int satds[8];
+ int i8 = i4>>2;
+
+ uint8_t *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
+ uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+
+ h->mb.b_skip_mc = 1;
if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
- x264_mb_predict_mv( h, i_list, i4, bw, m->mvp );
+ x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp );
pmx = m->mvp[0];
pmy = m->mvp[1];
p_cost_mvx = m->p_cost_mv - pmx;
p_cost_mvy = m->p_cost_mv - pmy;
COST_MV_SATD( bmx, bmy, bsatd, 0 );
- COST_MV_RD( bmx, bmy, 0, 0, 0 );
+ if( m->i_pixel != PIXEL_16x16 )
+ COST_MV_RD( bmx, bmy, 0, 0, 0 )
+ else
+ bcost = m->cost;
/* check the predicted mv */
if( (bmx != pmx || bmy != pmy)
@@ -1087,7 +1112,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
&& pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
{
COST_MV_SATD( pmx, pmy, satd, 0 );
- COST_MV_RD( pmx, pmy, satd, 0,0 );
+ COST_MV_RD ( pmx, pmy, satd, 0, 0 );
/* The hex motion search is guaranteed to not repeat the center candidate,
* so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
if( bmx == pmx && bmy == pmy )
@@ -1097,12 +1122,22 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
}
}
+ if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 ||
+ bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 )
+ {
+ h->mb.b_skip_mc = 0;
+ return;
+ }
+
/* subpel hex search, same pattern as ME HEX. */
dir = -2;
omx = bmx;
omy = bmy;
- for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 );
- for( j=0; j<6; j++ ) COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j );
+ for( j=0; j<6; j++ )
+ {
+ COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 );
+ COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j );
+ }
if( dir != -2 )
{
@@ -1110,29 +1145,35 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
for( i = 1; i < 10; i++ )
{
const int odir = mod6m1[dir+1];
- if( bmy > h->mb.mv_max_spel[1] - 2 ||
- bmy < h->mb.mv_min_spel[1] - 2 )
+ if( bmy < h->mb.mv_min_spel[1] + 3 ||
+ bmy > h->mb.mv_max_spel[1] - 3 )
break;
dir = -2;
omx = bmx;
omy = bmy;
- for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 );
- for( j=0; j<3; j++ ) COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j );
+ for( j=0; j<3; j++ )
+ {
+ COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 );
+ COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j );
+ }
if( dir == -2 )
break;
}
}
- /* square refine, same as pattern as ME HEX. */
+ /* square refine, same pattern as ME HEX. */
omx = bmx;
omy = bmy;
- for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 );
- for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 );
+ for( i=0; i<8; i++ )
+ {
+ COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 );
+ COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 );
+ }
- bmy = x264_clip3( bmy, h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
m->cost = bcost;
m->mv[0] = bmx;
m->mv[1] = bmy;
- x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) );
- x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+ x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
+ x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+ h->mb.b_skip_mc = 0;
}
diff --git a/encoder/me.h b/encoder/me.h
index 3910f74..2f19e61 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -29,28 +29,32 @@
typedef struct
{
+ /* aligning the first member is a gcc hack to force the struct to be
+ * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
/* input */
- int i_pixel; /* PIXEL_WxH */
- int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+ ALIGNED_16( int i_pixel ); /* PIXEL_WxH */
+ uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
+ const x264_weight_t *weight;
uint8_t *p_fref[6];
+ uint8_t *p_fref_w;
uint8_t *p_fenc[3];
uint16_t *integral;
int i_stride[2];
- DECLARE_ALIGNED_4( int16_t mvp[2] );
+ ALIGNED_4( int16_t mvp[2] );
/* output */
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
- DECLARE_ALIGNED_4( int16_t mv[2] );
-} DECLARE_ALIGNED_16( x264_me_t );
+ ALIGNED_4( int16_t mv[2] );
+} ALIGNED_16( x264_me_t );
typedef struct {
int sad;
- int16_t mx, my;
+ int16_t mv[2];
} mvsad_t;
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
@@ -58,6 +62,7 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], i
{ x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
+void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
@@ -84,13 +89,30 @@ if((y)<(x))\
(c)=(d);\
}
-#define COPY4_IF_LT(x,y,a,b,c,d,f,e)\
+#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
if((y)<(x))\
{\
(x)=(y);\
(a)=(b);\
(c)=(d);\
- (f)=(e);\
+ (e)=(f);\
+}
+
+#define COPY5_IF_LT(x,y,a,b,c,d,e,f,g,h)\
+if((y)<(x))\
+{\
+ (x)=(y);\
+ (a)=(b);\
+ (c)=(d);\
+ (e)=(f);\
+ (g)=(h);\
+}
+
+#define COPY2_IF_GT(x,y,a,b)\
+if((y)>(x))\
+{\
+ (x)=(y);\
+ (a)=(b);\
}
#endif
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 2dd34d0..63b3be6 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -27,30 +27,34 @@
#define _ISOC99_SOURCE
#undef NDEBUG // always check asserts, the speed effect is far too small to disable them
#include <math.h>
-#include <limits.h>
-#include <assert.h>
#include "common/common.h"
#include "common/cpu.h"
#include "ratecontrol.h"
+#include "me.h"
typedef struct
{
int pict_type;
+ int frame_type;
int kept_as_ref;
- float qscale;
+ double qscale;
int mv_bits;
int tex_bits;
int misc_bits;
uint64_t expected_bits; /*total expected bits up to the current frame (current one excluded)*/
double expected_vbv;
- float new_qscale;
+ double new_qscale;
int new_qp;
int i_count;
int p_count;
int s_count;
float blurred_complexity;
char direct_mode;
+ int16_t weight[2];
+ int16_t i_weight_denom;
+ int refcount[16];
+ int refs;
} ratecontrol_entry_t;
typedef struct
@@ -58,6 +62,7 @@ typedef struct
double coeff;
double count;
double decay;
+ double offset;
} predictor_t;
struct x264_ratecontrol_t
@@ -70,6 +75,7 @@ struct x264_ratecontrol_t
double fps;
double bitrate;
double rate_tolerance;
+ double qcompress;
int nmb; /* number of macroblocks in a frame */
int qp_constant[5];
@@ -80,6 +86,7 @@ struct x264_ratecontrol_t
float f_qpm; /* qp for current macroblock: precise float for AQ */
float qpa_rc; /* average of macroblocks' qp before aq */
float qpa_aq; /* average of macroblocks' qp after aq */
+ float qp_novbv; /* QP for the current frame if 1-pass VBV was disabled. */
int qp_force;
/* VBV stuff */
@@ -88,6 +95,7 @@ struct x264_ratecontrol_t
double buffer_fill; /* planned buffer, if all in-progress frames hit their bit budget */
double buffer_rate; /* # of bits added to buffer_fill after each frame */
predictor_t *pred; /* predict frame size from satd */
+ int single_frame_vbv;
/* ABR stuff */
int last_satd;
@@ -105,6 +113,10 @@ struct x264_ratecontrol_t
/* 2pass stuff */
FILE *p_stat_file_out;
char *psz_stat_file_tmpname;
+ FILE *p_mbtree_stat_file_out;
+ char *psz_mbtree_stat_file_tmpname;
+ char *psz_mbtree_stat_file_name;
+ FILE *p_mbtree_stat_file_in;
int num_entries; /* number of ratecontrol_entry_ts */
ratecontrol_entry_t *entry; /* FIXME: copy needed data and free this once init is done */
@@ -117,12 +129,16 @@ struct x264_ratecontrol_t
double lmin[5]; /* min qscale by frame type */
double lmax[5];
double lstep; /* max change (multiply) in qscale per frame */
+ uint16_t *qp_buffer[2]; /* Global buffers for converting MB-tree quantizer data. */
+ int qpbuf_pos; /* In order to handle pyramid reordering, QP buffer acts as a stack.
+ * This value is the current position (0 or 1). */
/* MBRC stuff */
double frame_size_estimated;
double frame_size_planned;
- predictor_t *row_pred;
- predictor_t row_preds[5];
+ double slice_size_planned;
+ predictor_t (*row_pred)[2];
+ predictor_t row_preds[5][2];
predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
int bframes; /* # consecutive B-frames before this P-frame */
int bframe_bits; /* total cost of those frames */
@@ -137,10 +153,19 @@ static int parse_zones( x264_t *h );
static int init_pass2(x264_t *);
static float rate_estimate_qscale( x264_t *h );
static void update_vbv( x264_t *h, int bits );
-static void update_vbv_plan( x264_t *h );
+static void update_vbv_plan( x264_t *h, int overhead );
static double predict_size( predictor_t *p, double q, double var );
static void update_predictor( predictor_t *p, double q, double var, double bits );
+#define CMP_OPT_FIRST_PASS( opt, param_val )\
+{\
+ if( ( p = strstr( opts, opt "=" ) ) && sscanf( p, opt "=%d" , &i ) && param_val != i )\
+ {\
+ x264_log( h, X264_LOG_ERROR, "different " opt " setting than first pass (%d vs %d)\n", param_val, i );\
+ return -1;\
+ }\
+}
+
/* Terminology:
* qp = h.264's quantizer
* qscale = linearized quantizer = Lagrange multiplier
@@ -167,82 +192,90 @@ static inline double qscale2bits(ratecontrol_entry_t *rce, double qscale)
+ rce->misc_bits;
}
+static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
+{
+ int w = i ? 8 : 16;
+ int shift = i ? 6 : 8;
+ int stride = frame->i_stride[i];
+ int offset = h->mb.b_interlaced
+ ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
+ : w * (mb_x + mb_y * stride);
+ int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+ stride <<= h->mb.b_interlaced;
+ uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
+ uint32_t sum = (uint32_t)res;
+ uint32_t sqr = res >> 32;
+ return sqr - (sum * sum >> shift);
+}
+
// Find the total AC energy of the block in all planes.
-static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
+static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
/* This function contains annoying hacks because GCC has a habit of reordering emms
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
- unsigned int var = 0, i;
- for( i = 0; i < 3; i++ )
- {
- int w = i ? 8 : 16;
- int stride = frame->i_stride[i];
- int offset = h->mb.b_interlaced
- ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
- : w * (mb_x + mb_y * stride);
- int pix = i ? PIXEL_8x8 : PIXEL_16x16;
- stride <<= h->mb.b_interlaced;
- var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
- }
- var = X264_MAX(var,1);
+ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 2 );
x264_emms();
return var;
}
-static const float log2_lut[128] = {
- 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
- 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
- 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
- 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
- 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
- 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
- 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
- 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
- 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
- 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
- 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
- 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
- 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
- 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
- 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
- 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
-};
-
-static const uint8_t exp2_lut[64] = {
- 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47,
- 50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104,
- 108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
- 177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
-};
-
-static int x264_exp2fix8( float x )
-{
- int i, f;
- x += 8;
- if( x <= 0 ) return 0;
- if( x >= 16 ) return 0xffff;
- i = x;
- f = (x-i)*64;
- return (exp2_lut[f]+256) << i >> 8;
-}
-
void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
{
/* constants chosen to result in approximately the same overall bitrate as without AQ.
* FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
- float strength = h->param.rc.f_aq_strength * 1.0397;
int mb_x, mb_y;
+ float strength;
+ float avg_adj = 0.f;
+ /* Need to init it anyways for MB tree. */
+ if( h->param.rc.f_aq_strength == 0 )
+ {
+ int mb_xy;
+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
+ if( h->frames.b_have_lowres )
+ for( mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
+ frame->i_inv_qscale_factor[mb_xy] = 256;
+ return;
+ }
+
+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
+ {
+ for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
+ for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
+ {
+ uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
+ float qp_adj = x264_log2( energy + 2 );
+ qp_adj *= qp_adj;
+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
+ avg_adj += qp_adj;
+ }
+ avg_adj /= h->mb.i_mb_count;
+ strength = h->param.rc.f_aq_strength * avg_adj * (1.f / 6000.f);
+ }
+ else
+ strength = h->param.rc.f_aq_strength * 1.0397f;
+
for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
{
- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
- int lz = x264_clz( energy );
- float qp_adj = strength * (log2_lut[(energy<<lz>>24)&0x7f] - lz + 16.573f);
- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
+ float qp_adj;
+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
+ {
+ qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
+ qp_adj = strength * (qp_adj - avg_adj);
+ }
+ else
+ {
+ uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
+ qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
+ }
+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
+ frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
if( h->frames.b_have_lowres )
- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj*(-1.f/6.f));
+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
}
}
@@ -257,22 +290,113 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
void x264_adaptive_quant( x264_t *h )
{
x264_emms();
- h->mb.i_qp = x264_clip3( h->rc->f_qpm + h->fenc->f_qp_offset[h->mb.i_mb_xy] + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
- /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
- * to lower the bit cost of the qp_delta. */
- if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
- h->mb.i_qp = h->mb.i_last_qp;
+ /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
+ float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
+ h->mb.i_qp = x264_clip3( h->rc->f_qpm + qp_offset + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+}
+
+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
+{
+ x264_ratecontrol_t *rc = h->rc;
+ uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
+ int i;
+
+ if( rc->entry[frame->i_frame].kept_as_ref )
+ {
+ uint8_t i_type;
+ if( rc->qpbuf_pos < 0 )
+ {
+ do
+ {
+ rc->qpbuf_pos++;
+
+ if( !fread( &i_type, 1, 1, rc->p_mbtree_stat_file_in ) )
+ goto fail;
+ if( fread( rc->qp_buffer[rc->qpbuf_pos], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_in ) != h->mb.i_mb_count )
+ goto fail;
+
+ if( i_type != i_type_actual && rc->qpbuf_pos == 1 )
+ {
+ x264_log(h, X264_LOG_ERROR, "MB-tree frametype %d doesn't match actual frametype %d.\n", i_type, i_type_actual);
+ return -1;
+ }
+ } while( i_type != i_type_actual );
+ }
+
+ for( i = 0; i < h->mb.i_mb_count; i++ )
+ {
+ frame->f_qp_offset[i] = ((float)(int16_t)endian_fix16( rc->qp_buffer[rc->qpbuf_pos][i] )) * (1/256.0);
+ if( h->frames.b_have_lowres )
+ frame->i_inv_qscale_factor[i] = x264_exp2fix8(frame->f_qp_offset[i]);
+ }
+ rc->qpbuf_pos--;
+ }
+ else
+ x264_adaptive_quant_frame( h, frame );
+ return 0;
+fail:
+ x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
+ return -1;
+}
+
+int x264_reference_build_list_optimal( x264_t *h )
+{
+ ratecontrol_entry_t *rce = h->rc->rce;
+ x264_frame_t *frames[16];
+ x264_weight_t weights[16][3];
+ int refcount[16];
+ int ref, i;
+
+ if( rce->refs != h->i_ref0 )
+ return -1;
+
+ memcpy( frames, h->fref0, sizeof(frames) );
+ memcpy( refcount, rce->refcount, sizeof(refcount) );
+ memcpy( weights, h->fenc->weight, sizeof(weights) );
+ memset( &h->fenc->weight[1][0], 0, sizeof(x264_weight_t[15][3]) );
+
+ /* For now don't reorder ref 0; it seems to lower quality
+ in most cases due to skips. */
+ for( ref = 1; ref < h->i_ref0; ref++ )
+ {
+ int max = -1;
+ int bestref = 1;
+
+ for( i = 1; i < h->i_ref0; i++ )
+ if( !frames[i]->b_duplicate || frames[i]->i_frame != h->fref0[ref-1]->i_frame )
+ /* Favor lower POC as a tiebreaker. */
+ COPY2_IF_GT( max, refcount[i], bestref, i );
+
+ /* FIXME: If there are duplicates from frames other than ref0 then it is possible
+ * that the optimal ordering doesnt place every duplicate. */
+
+ refcount[bestref] = -1;
+ h->fref0[ref] = frames[bestref];
+ memcpy( h->fenc->weight[ref], weights[bestref], sizeof(weights[bestref]) );
+ }
+
+ return 0;
+}
+
+static char *x264_strcat_filename( char *input, char *suffix )
+{
+ char *output = x264_malloc( strlen( input ) + strlen( suffix ) + 1 );
+ if( !output )
+ return NULL;
+ strcpy( output, input );
+ strcat( output, suffix );
+ return output;
}
int x264_ratecontrol_new( x264_t *h )
{
x264_ratecontrol_t *rc;
- int i;
+ int i, j;
x264_emms();
- rc = h->rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
- memset( rc, 0, h->param.i_threads * sizeof(x264_ratecontrol_t) );
+ CHECKED_MALLOCZERO( h->rc, h->param.i_threads * sizeof(x264_ratecontrol_t) );
+ rc = h->rc;
rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
@@ -283,6 +407,14 @@ int x264_ratecontrol_new( x264_t *h )
else
rc->fps = 25.0;
+ if( h->param.rc.b_mb_tree )
+ {
+ h->param.rc.f_pb_factor = 1;
+ rc->qcompress = 1;
+ }
+ else
+ rc->qcompress = h->param.rc.f_qcompress;
+
rc->bitrate = h->param.rc.i_bitrate * 1000.;
rc->rate_tolerance = h->param.rc.f_rate_tolerance;
rc->nmb = h->mb.i_mb_count;
@@ -304,8 +436,16 @@ int x264_ratecontrol_new( x264_t *h )
}
else if( h->param.rc.i_vbv_max_bitrate == 0 )
{
- x264_log( h, X264_LOG_DEBUG, "VBV maxrate unspecified, assuming CBR\n" );
- h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+ if( h->param.rc.i_rc_method == X264_RC_ABR )
+ {
+ x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
+ h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+ }
+ else
+ {
+ x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
+ h->param.rc.i_vbv_buffer_size = 0;
+ }
}
}
if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
@@ -314,16 +454,18 @@ int x264_ratecontrol_new( x264_t *h )
else if( h->param.rc.i_vbv_max_bitrate > 0 &&
h->param.rc.i_vbv_buffer_size > 0 )
{
- if( h->param.rc.i_vbv_buffer_size < 3 * h->param.rc.i_vbv_max_bitrate / rc->fps )
+ if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
{
- h->param.rc.i_vbv_buffer_size = 3 * h->param.rc.i_vbv_max_bitrate / rc->fps;
- x264_log( h, X264_LOG_WARNING, "VBV buffer size too small, using %d kbit\n",
+ h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
+ x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
h->param.rc.i_vbv_buffer_size );
}
if( h->param.rc.f_vbv_buffer_init > 1. )
h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
+ rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
+ h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
* 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
@@ -352,17 +494,19 @@ int x264_ratecontrol_new( x264_t *h )
rc->accum_p_norm = .01;
rc->accum_p_qp = ABR_INIT_QP * rc->accum_p_norm;
/* estimated ratio that produces a reasonable QP for the first I-frame */
- rc->cplxr_sum = .01 * pow( 7.0e5, h->param.rc.f_qcompress ) * pow( h->mb.i_mb_count, 0.5 );
+ rc->cplxr_sum = .01 * pow( 7.0e5, rc->qcompress ) * pow( h->mb.i_mb_count, 0.5 );
rc->wanted_bits_window = 1.0 * rc->bitrate / rc->fps;
rc->last_non_b_pict_type = SLICE_TYPE_I;
}
if( h->param.rc.i_rc_method == X264_RC_CRF )
{
- /* arbitrary rescaling to make CRF somewhat similar to QP */
+ /* Arbitrary rescaling to make CRF somewhat similar to QP.
+ * Try to compensate for MB-tree's effects as well. */
double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
- rc->rate_factor_constant = pow( base_cplx, 1 - h->param.rc.f_qcompress )
- / qp2qscale( h->param.rc.f_rf_constant );
+ double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
+ rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
+ / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
}
rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
@@ -370,11 +514,12 @@ int x264_ratecontrol_new( x264_t *h )
rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, 51 );
rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, 51 );
+ h->mb.ip_offset = rc->ip_offset + 0.5;
rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
rc->last_qscale = qp2qscale(26);
- rc->pred = x264_malloc( 5*sizeof(predictor_t) );
- rc->pred_b_from_p = x264_malloc( sizeof(predictor_t) );
+ CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
+ CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
for( i = 0; i < 5; i++ )
{
rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
@@ -383,9 +528,14 @@ int x264_ratecontrol_new( x264_t *h )
rc->pred[i].coeff= 2.0;
rc->pred[i].count= 1.0;
rc->pred[i].decay= 0.5;
- rc->row_preds[i].coeff= .25;
- rc->row_preds[i].count= 1.0;
- rc->row_preds[i].decay= 0.5;
+ rc->pred[i].offset= 0.0;
+ for( j = 0; j < 2; j++ )
+ {
+ rc->row_preds[i][j].coeff= .25;
+ rc->row_preds[i][j].count= 1.0;
+ rc->row_preds[i][j].decay= 0.5;
+ rc->row_preds[i][j].offset= 0.0;
+ }
}
*rc->pred_b_from_p = rc->pred[0];
@@ -408,35 +558,47 @@ int x264_ratecontrol_new( x264_t *h )
x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n");
return -1;
}
+ if( h->param.rc.b_mb_tree )
+ {
+ char *mbtree_stats_in = x264_strcat_filename( h->param.rc.psz_stat_in, ".mbtree" );
+ if( !mbtree_stats_in )
+ return -1;
+ rc->p_mbtree_stat_file_in = fopen( mbtree_stats_in, "rb" );
+ x264_free( mbtree_stats_in );
+ if( !rc->p_mbtree_stat_file_in )
+ {
+ x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n");
+ return -1;
+ }
+ }
/* check whether 1st pass options were compatible with current options */
if( !strncmp( stats_buf, "#options:", 9 ) )
{
- int i;
+ int i, j;
char *opts = stats_buf;
stats_in = strchr( stats_buf, '\n' );
if( !stats_in )
return -1;
*stats_in = '\0';
stats_in++;
-
- if( ( p = strstr( opts, "bframes=" ) ) && sscanf( p, "bframes=%d", &i )
- && h->param.i_bframe != i )
+ if( sscanf( opts, "#options: %dx%d", &i, &j ) != 2 )
{
- x264_log( h, X264_LOG_ERROR, "different number of B-frames than 1st pass (%d vs %d)\n",
- h->param.i_bframe, i );
+ x264_log( h, X264_LOG_ERROR, "resolution specified in stats file not valid\n" );
+ return -1;
+ }
+ else if( h->param.rc.b_mb_tree && (i != h->param.i_width || j != h->param.i_height) )
+ {
+ x264_log( h, X264_LOG_ERROR, "MB-tree doesn't support different resolution than 1st pass (%dx%d vs %dx%d)\n",
+ h->param.i_width, h->param.i_height, i, j );
return -1;
}
- /* since B-adapt doesn't (yet) take into account B-pyramid,
- * the converse is not a problem */
- if( strstr( opts, "b_pyramid=1" ) && !h->param.b_bframe_pyramid )
- x264_log( h, X264_LOG_WARNING, "1st pass used B-pyramid, 2nd doesn't\n" );
-
- if( ( p = strstr( opts, "keyint=" ) ) && sscanf( p, "keyint=%d", &i )
- && h->param.i_keyint_max != i )
- x264_log( h, X264_LOG_WARNING, "different keyint than 1st pass (%d vs %d)\n",
- h->param.i_keyint_max, i );
+ CMP_OPT_FIRST_PASS( "wpredp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) );
+ CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe );
+ CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid );
+ CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh );
+ CMP_OPT_FIRST_PASS( "keyint", h->param.i_keyint_max );
if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );
@@ -454,6 +616,9 @@ int x264_ratecontrol_new( x264_t *h )
x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" );
return -1;
}
+
+ if( h->param.rc.b_mb_tree && ( p = strstr( opts, "rc_lookahead=" ) ) && sscanf( p, "rc_lookahead=%d", &i ) )
+ h->param.rc.i_lookahead = i;
}
/* find number of pics */
@@ -479,8 +644,7 @@ int x264_ratecontrol_new( x264_t *h )
return -1;
}
- rc->entry = (ratecontrol_entry_t*) x264_malloc(rc->num_entries * sizeof(ratecontrol_entry_t));
- memset(rc->entry, 0, rc->num_entries * sizeof(ratecontrol_entry_t));
+ CHECKED_MALLOCZERO( rc->entry, rc->num_entries * sizeof(ratecontrol_entry_t) );
/* init all to skipped p frames */
for(i=0; i<rc->num_entries; i++)
@@ -502,6 +666,7 @@ int x264_ratecontrol_new( x264_t *h )
int e;
char *next;
float qp;
+ int ref;
next= strchr(p, ';');
if(next)
@@ -524,17 +689,56 @@ int x264_ratecontrol_new( x264_t *h )
&rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
&rce->s_count, &rce->direct_mode);
- switch(pict_type)
+ p = strstr( p, "ref:" );
+ if( !p )
+ goto parse_error;
+ p += 4;
+ for( ref = 0; ref < 16; ref++ )
{
- case 'I': rce->kept_as_ref = 1;
- case 'i': rce->pict_type = SLICE_TYPE_I; break;
- case 'P': rce->pict_type = SLICE_TYPE_P; break;
- case 'B': rce->kept_as_ref = 1;
- case 'b': rce->pict_type = SLICE_TYPE_B; break;
+ if( sscanf( p, " %d", &rce->refcount[ref] ) != 1 )
+ break;
+ p = strchr( p+1, ' ' );
+ if( !p )
+ goto parse_error;
+ }
+ rce->refs = ref;
+
+ /* find weights */
+ rce->i_weight_denom = -1;
+ char *w = strchr( p, 'w' );
+ if( w )
+ if( sscanf( w, "w:%hd,%hd,%hd", &rce->i_weight_denom, &rce->weight[0], &rce->weight[1] ) != 3 )
+ rce->i_weight_denom = -1;
+
+ if( pict_type != 'b' )
+ rce->kept_as_ref = 1;
+ switch( pict_type )
+ {
+ case 'I':
+ rce->frame_type = X264_TYPE_IDR;
+ rce->pict_type = SLICE_TYPE_I;
+ break;
+ case 'i':
+ rce->frame_type = X264_TYPE_I;
+ rce->pict_type = SLICE_TYPE_I;
+ break;
+ case 'P':
+ rce->frame_type = X264_TYPE_P;
+ rce->pict_type = SLICE_TYPE_P;
+ break;
+ case 'B':
+ rce->frame_type = X264_TYPE_BREF;
+ rce->pict_type = SLICE_TYPE_B;
+ break;
+ case 'b':
+ rce->frame_type = X264_TYPE_B;
+ rce->pict_type = SLICE_TYPE_B;
+ break;
default: e = -1; break;
}
if(e < 10)
{
+parse_error:
x264_log(h, X264_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
return -1;
}
@@ -556,10 +760,9 @@ int x264_ratecontrol_new( x264_t *h )
if( h->param.rc.b_stat_write )
{
char *p;
-
- rc->psz_stat_file_tmpname = x264_malloc( strlen(h->param.rc.psz_stat_out) + 6 );
- strcpy( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out );
- strcat( rc->psz_stat_file_tmpname, ".temp" );
+ rc->psz_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".temp" );
+ if( !rc->psz_stat_file_tmpname )
+ return -1;
rc->p_stat_file_out = fopen( rc->psz_stat_file_tmpname, "wb" );
if( rc->p_stat_file_out == NULL )
@@ -569,8 +772,31 @@ int x264_ratecontrol_new( x264_t *h )
}
p = x264_param2string( &h->param, 1 );
- fprintf( rc->p_stat_file_out, "#options: %s\n", p );
+ if( p )
+ fprintf( rc->p_stat_file_out, "#options: %s\n", p );
x264_free( p );
+ if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
+ {
+ rc->psz_mbtree_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree.temp" );
+ rc->psz_mbtree_stat_file_name = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree" );
+ if( !rc->psz_mbtree_stat_file_tmpname || !rc->psz_mbtree_stat_file_name )
+ return -1;
+
+ rc->p_mbtree_stat_file_out = fopen( rc->psz_mbtree_stat_file_tmpname, "wb" );
+ if( rc->p_mbtree_stat_file_out == NULL )
+ {
+ x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n");
+ return -1;
+ }
+ }
+ }
+
+ if( h->param.rc.b_mb_tree && (h->param.rc.b_stat_read || h->param.rc.b_stat_write) )
+ {
+ CHECKED_MALLOC( rc->qp_buffer[0], h->mb.i_mb_count * sizeof(uint16_t) );
+ if( h->param.i_bframe_pyramid && h->param.rc.b_stat_read )
+ CHECKED_MALLOC( rc->qp_buffer[1], h->mb.i_mb_count * sizeof(uint16_t) );
+ rc->qpbuf_pos = -1;
}
for( i=0; i<h->param.i_threads; i++ )
@@ -579,18 +805,20 @@ int x264_ratecontrol_new( x264_t *h )
if( i )
{
rc[i] = rc[0];
- memcpy( &h->thread[i]->param, &h->param, sizeof( x264_param_t ) );
+ h->thread[i]->param = h->param;
h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp;
}
}
return 0;
+fail:
+ return -1;
}
static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
{
int len = 0;
- char *tok, UNUSED *saveptr;
+ char *tok, UNUSED *saveptr=NULL;
z->param = NULL;
z->f_bitrate_factor = 1;
if( 3 <= sscanf(p, "%u,%u,q=%u%n", &z->i_start, &z->i_end, &z->i_qp, &len) )
@@ -607,8 +835,9 @@ static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
p += len;
if( !*p )
return 0;
- z->param = x264_malloc( sizeof(x264_param_t) );
+ CHECKED_MALLOC( z->param, sizeof(x264_param_t) );
memcpy( z->param, &h->param, sizeof(x264_param_t) );
+ z->param->param_free = x264_free;
while( (tok = strtok_r( p, ",", &saveptr )) )
{
char *val = strchr( tok, '=' );
@@ -625,6 +854,8 @@ static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
p = NULL;
}
return 0;
+fail:
+ return -1;
}
static int parse_zones( x264_t *h )
@@ -633,20 +864,21 @@ static int parse_zones( x264_t *h )
int i;
if( h->param.rc.psz_zones && !h->param.rc.i_zones )
{
- char *p, *tok, UNUSED *saveptr;
- char *psz_zones = x264_malloc( strlen(h->param.rc.psz_zones)+1 );
+ char *psz_zones, *p;
+ CHECKED_MALLOC( psz_zones, strlen( h->param.rc.psz_zones )+1 );
strcpy( psz_zones, h->param.rc.psz_zones );
h->param.rc.i_zones = 1;
for( p = psz_zones; *p; p++ )
h->param.rc.i_zones += (*p == '/');
- h->param.rc.zones = x264_malloc( h->param.rc.i_zones * sizeof(x264_zone_t) );
+ CHECKED_MALLOC( h->param.rc.zones, h->param.rc.i_zones * sizeof(x264_zone_t) );
p = psz_zones;
for( i = 0; i < h->param.rc.i_zones; i++ )
{
- tok = strtok_r( p, "/", &saveptr );
- if( !tok || parse_zone( h, &h->param.rc.zones[i], tok ) )
+ int i_tok = strcspn( p, "/" );
+ p[i_tok] = 0;
+ if( parse_zone( h, &h->param.rc.zones[i], p ) )
return -1;
- p = NULL;
+ p += i_tok + 1;
}
x264_free( psz_zones );
}
@@ -671,7 +903,7 @@ static int parse_zones( x264_t *h )
}
rc->i_zones = h->param.rc.i_zones + 1;
- rc->zones = x264_malloc( rc->i_zones * sizeof(x264_zone_t) );
+ CHECKED_MALLOC( rc->zones, rc->i_zones * sizeof(x264_zone_t) );
memcpy( rc->zones+1, h->param.rc.zones, (rc->i_zones-1) * sizeof(x264_zone_t) );
// default zone to fall back to if none of the others match
@@ -679,7 +911,7 @@ static int parse_zones( x264_t *h )
rc->zones[0].i_end = INT_MAX;
rc->zones[0].b_force_qp = 0;
rc->zones[0].f_bitrate_factor = 1;
- rc->zones[0].param = x264_malloc( sizeof(x264_param_t) );
+ CHECKED_MALLOC( rc->zones[0].param, sizeof(x264_param_t) );
memcpy( rc->zones[0].param, &h->param, sizeof(x264_param_t) );
for( i = 1; i < rc->i_zones; i++ )
{
@@ -689,6 +921,8 @@ static int parse_zones( x264_t *h )
}
return 0;
+fail:
+ return -1;
}
static x264_zone_t *get_zone( x264_t *h, int frame_num )
@@ -709,9 +943,10 @@ void x264_ratecontrol_summary( x264_t *h )
if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
{
double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
+ double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
- qscale2qp( pow( base_cplx, 1 - h->param.rc.f_qcompress )
- * rc->cplxr_sum / rc->wanted_bits_window ) );
+ qscale2qp( pow( base_cplx, 1 - rc->qcompress )
+ * rc->cplxr_sum / rc->wanted_bits_window ) - mbtree_offset );
}
}
@@ -719,11 +954,13 @@ void x264_ratecontrol_delete( x264_t *h )
{
x264_ratecontrol_t *rc = h->rc;
int i;
+ int b_regular_file;
if( rc->p_stat_file_out )
{
+ b_regular_file = x264_is_regular_file( rc->p_stat_file_out );
fclose( rc->p_stat_file_out );
- if( h->i_frame >= rc->num_entries )
+ if( h->i_frame >= rc->num_entries && b_regular_file )
if( rename( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out ) != 0 )
{
x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
@@ -731,16 +968,32 @@ void x264_ratecontrol_delete( x264_t *h )
}
x264_free( rc->psz_stat_file_tmpname );
}
+ if( rc->p_mbtree_stat_file_out )
+ {
+ b_regular_file = x264_is_regular_file( rc->p_mbtree_stat_file_out );
+ fclose( rc->p_mbtree_stat_file_out );
+ if( h->i_frame >= rc->num_entries && b_regular_file )
+ if( rename( rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ) != 0 )
+ {
+ x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
+ rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name );
+ }
+ x264_free( rc->psz_mbtree_stat_file_tmpname );
+ x264_free( rc->psz_mbtree_stat_file_name );
+ }
+ if( rc->p_mbtree_stat_file_in )
+ fclose( rc->p_mbtree_stat_file_in );
x264_free( rc->pred );
x264_free( rc->pred_b_from_p );
x264_free( rc->entry );
+ x264_free( rc->qp_buffer[0] );
+ x264_free( rc->qp_buffer[1] );
if( rc->zones )
{
x264_free( rc->zones[0].param );
- if( h->param.rc.psz_zones )
- for( i=1; i<rc->i_zones; i++ )
- if( rc->zones[i].param != rc->zones[0].param )
- x264_free( rc->zones[i].param );
+ for( i=1; i<rc->i_zones; i++ )
+ if( rc->zones[i].param != rc->zones[0].param && rc->zones[i].param->param_free )
+ rc->zones[i].param->param_free( rc->zones[i].param );
x264_free( rc->zones );
}
x264_free( rc );
@@ -775,7 +1028,7 @@ static void accum_p_qp_update( x264_t *h, float qp )
}
/* Before encoding a frame, choose a QP for it */
-void x264_ratecontrol_start( x264_t *h, int i_force_qp )
+void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
{
x264_ratecontrol_t *rc = h->rc;
ratecontrol_entry_t *rce = NULL;
@@ -808,15 +1061,11 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
{
memset( h->fdec->i_row_bits, 0, h->sps->i_mb_height * sizeof(int) );
rc->row_pred = &rc->row_preds[h->sh.i_type];
- update_vbv_plan( h );
+ update_vbv_plan( h, overhead );
}
if( h->sh.i_type != SLICE_TYPE_B )
- {
- rc->bframes = 0;
- while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) )
- rc->bframes++;
- }
+ rc->bframes = h->fenc->i_bframes;
if( i_force_qp )
{
@@ -847,6 +1096,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
}
}
+ q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+
rc->qpa_rc =
rc->qpa_aq = 0;
h->fdec->f_qp_avg_rc =
@@ -857,12 +1108,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
if( rce )
rce->new_qp = rc->qp;
- /* accum_p_qp needs to be here so that future frames can benefit from the
- * data before this frame is done. but this only works because threading
- * guarantees to not re-encode any frames. so the non-threaded case does
- * accum_p_qp later. */
- if( h->param.i_threads > 1 )
- accum_p_qp_update( h, rc->qp );
+ accum_p_qp_update( h, rc->qp );
if( h->sh.i_type != SLICE_TYPE_B )
rc->last_non_b_pict_type = h->sh.i_type;
@@ -873,27 +1119,36 @@ static double predict_row_size( x264_t *h, int y, int qp )
/* average between two predictors:
* absolute SATD, and scaled bit cost of the colocated row in the previous frame */
x264_ratecontrol_t *rc = h->rc;
- double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
+ double pred_s = predict_size( rc->row_pred[0], qp2qscale(qp), h->fdec->i_row_satd[y] );
double pred_t = 0;
- if( h->sh.i_type != SLICE_TYPE_I
- && h->fref0[0]->i_type == h->fdec->i_type
- && h->fref0[0]->i_row_satd[y] > 0
- && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
+ if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref0[0]->i_row_qp[y] )
+ {
+ if( h->sh.i_type == SLICE_TYPE_P
+ && h->fref0[0]->i_type == h->fdec->i_type
+ && h->fref0[0]->i_row_satd[y] > 0
+ && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
+ {
+ pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
+ * qp2qscale(h->fref0[0]->i_row_qp[y]) / qp2qscale(qp);
+ }
+ if( pred_t == 0 )
+ pred_t = pred_s;
+ return (pred_s + pred_t) / 2;
+ }
+ /* Our QP is lower than the reference! */
+ else
{
- pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
- * qp2qscale(h->fref0[0]->i_row_qp[y]) / qp2qscale(qp);
+ double pred_intra = predict_size( rc->row_pred[1], qp2qscale(qp), h->fdec->i_row_satds[0][0][y] );
+ /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
+ return pred_intra + pred_s;
}
- if( pred_t == 0 )
- pred_t = pred_s;
-
- return (pred_s + pred_t) / 2;
}
static double row_bits_so_far( x264_t *h, int y )
{
int i;
double bits = 0;
- for( i = 0; i <= y; i++ )
+ for( i = h->i_threadslice_start; i <= y; i++ )
bits += h->fdec->i_row_bits[i];
return bits;
}
@@ -902,7 +1157,7 @@ static double predict_row_size_sum( x264_t *h, int y, int qp )
{
int i;
double bits = row_bits_so_far(h, y);
- for( i = y+1; i < h->sps->i_mb_height; i++ )
+ for( i = y+1; i < h->i_threadslice_end; i++ )
bits += predict_row_size( h, i, qp );
return bits;
}
@@ -919,83 +1174,84 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
rc->qpa_rc += rc->f_qpm;
rc->qpa_aq += h->mb.i_qp;
- if( h->mb.i_mb_x != h->sps->i_mb_width - 1 || !rc->b_vbv)
+ if( h->mb.i_mb_x != h->sps->i_mb_width - 1 || !rc->b_vbv )
return;
h->fdec->i_row_qp[y] = rc->qpm;
- if( h->sh.i_type == SLICE_TYPE_B )
+ update_predictor( rc->row_pred[0], qp2qscale(rc->qpm), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
+ if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref0[0]->i_row_qp[y] )
+ update_predictor( rc->row_pred[1], qp2qscale(rc->qpm), h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
+
+ /* tweak quality based on difference from predicted size */
+ if( y < h->i_threadslice_end-1 )
{
- /* B-frames shouldn't use lower QP than their reference frames.
- * This code is a bit overzealous in limiting B-frame quantizers, but it helps avoid
- * underflows due to the fact that B-frames are not explicitly covered by VBV. */
- if( y < h->sps->i_mb_height-1 )
+ int prev_row_qp = h->fdec->i_row_qp[y];
+ int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
+ int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
+
+ /* B-frames shouldn't use lower QP than their reference frames. */
+ if( h->sh.i_type == SLICE_TYPE_B )
{
- int i_estimated;
- int avg_qp = X264_MAX(h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1])
- + rc->pb_offset * ((h->fenc->i_type == X264_TYPE_BREF) ? 0.5 : 1);
- rc->qpm = X264_MIN(X264_MAX( rc->qp, avg_qp), 51); //avg_qp could go higher than 51 due to pb_offset
- i_estimated = row_bits_so_far(h, y); //FIXME: compute full estimated size
- if (i_estimated > h->rc->frame_size_planned)
- x264_ratecontrol_set_estimated_size(h, i_estimated);
+ i_qp_min = X264_MAX( i_qp_min, X264_MAX( h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1] ) );
+ rc->qpm = X264_MAX( rc->qpm, i_qp_min );
}
- }
- else
- {
- update_predictor( rc->row_pred, qp2qscale(rc->qpm), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
- /* tweak quality based on difference from predicted size */
- if( y < h->sps->i_mb_height-1 && h->stat.i_slice_count[h->sh.i_type] > 0 )
+ float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
+ float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
+ float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
+ /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
+ float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
+ float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
+ int b1 = predict_row_size_sum( h, y, rc->qpm );
+
+ /* Assume that if this slice has become larger than expected,
+ * the other slices will have gotten equally larger. */
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+
+ /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
+ /* area at the top of the frame was measured inaccurately. */
+ if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
+ return;
+
+ if( h->sh.i_type != SLICE_TYPE_I )
+ rc_tol /= 2;
+
+ if( !rc->b_vbv_min_rate )
+ i_qp_min = X264_MAX( i_qp_min, h->sh.i_qp );
+
+ while( rc->qpm < i_qp_max
+ && ((b1 > rc->frame_size_planned + rc_tol) ||
+ (rc->buffer_fill - b1 < buffer_left_planned * 0.5) ||
+ (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
{
- int prev_row_qp = h->fdec->i_row_qp[y];
- int b0 = predict_row_size_sum( h, y, rc->qpm );
- int b1 = b0;
- int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
- int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
- float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
- float rc_tol = 1;
- float headroom = 0;
-
- /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
- /* area at the top of the frame was measured inaccurately. */
- if(row_bits_so_far(h,y) < 0.05 * rc->frame_size_planned)
- return;
-
- headroom = buffer_left_planned/rc->buffer_size;
- if(h->sh.i_type != SLICE_TYPE_I)
- headroom /= 2;
- rc_tol += headroom;
-
- if( !rc->b_vbv_min_rate )
- i_qp_min = X264_MAX( i_qp_min, h->sh.i_qp );
-
- while( rc->qpm < i_qp_max
- && (b1 > rc->frame_size_planned * rc_tol
- || (rc->buffer_fill - b1 < buffer_left_planned * 0.5)))
- {
- rc->qpm ++;
- b1 = predict_row_size_sum( h, y, rc->qpm );
- }
+ rc->qpm ++;
+ b1 = predict_row_size_sum( h, y, rc->qpm );
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+ }
- /* avoid VBV underflow */
- while( (rc->qpm < h->param.rc.i_qp_max)
- && (rc->buffer_fill - b1 < rc->buffer_size * 0.005))
- {
- rc->qpm ++;
- b1 = predict_row_size_sum( h, y, rc->qpm );
- }
+ while( rc->qpm > i_qp_min
+ && (rc->qpm > h->fdec->i_row_qp[0] || rc->single_frame_vbv)
+ && ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp)
+ || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
+ {
+ rc->qpm --;
+ b1 = predict_row_size_sum( h, y, rc->qpm );
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+ }
- while( rc->qpm > i_qp_min
- && rc->qpm > h->fdec->i_row_qp[0]
- && ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp)
- || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
- {
- rc->qpm --;
- b1 = predict_row_size_sum( h, y, rc->qpm );
- }
- x264_ratecontrol_set_estimated_size(h, b1);
+ /* avoid VBV underflow */
+ while( (rc->qpm < h->param.rc.i_qp_max)
+ && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
+ {
+ rc->qpm ++;
+ b1 = predict_row_size_sum( h, y, rc->qpm );
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
}
+
+ x264_ratecontrol_set_estimated_size(h, b1);
}
+
/* loses the fractional part of the frame-wise qp */
rc->f_qpm = rc->qpm;
}
@@ -1018,8 +1274,8 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
* So just calculate the average QP used so far. */
int i;
- h->param.rc.i_qp_constant = (h->stat.i_slice_count[SLICE_TYPE_P] == 0) ? 24
- : 1 + h->stat.f_slice_qp[SLICE_TYPE_P] / h->stat.i_slice_count[SLICE_TYPE_P];
+ h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 24
+ : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P];
rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, 51 );
rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, 51 );
@@ -1042,27 +1298,23 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
}
return X264_TYPE_AUTO;
}
- switch( rc->entry[frame_num].pict_type )
- {
- case SLICE_TYPE_I:
- return rc->entry[frame_num].kept_as_ref ? X264_TYPE_IDR : X264_TYPE_I;
-
- case SLICE_TYPE_B:
- return rc->entry[frame_num].kept_as_ref ? X264_TYPE_BREF : X264_TYPE_B;
-
- case SLICE_TYPE_P:
- default:
- return X264_TYPE_P;
- }
+ return rc->entry[frame_num].frame_type;
}
else
- {
return X264_TYPE_AUTO;
- }
+}
+
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
+{
+ ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
+ if( h->param.analyse.i_weighted_pred <= 0 )
+ return;
+ if( rce->i_weight_denom >= 0 )
+ SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0], rce->i_weight_denom, rce->weight[1] );
}
/* After encoding one frame, save stats and update ratecontrol state */
-void x264_ratecontrol_end( x264_t *h, int bits )
+int x264_ratecontrol_end( x264_t *h, int bits )
{
x264_ratecontrol_t *rc = h->rc;
const int *mbs = h->stat.frame.i_mb_count;
@@ -1090,8 +1342,8 @@ void x264_ratecontrol_end( x264_t *h, int bits )
( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
: '-';
- fprintf( rc->p_stat_file_out,
- "in:%d out:%d type:%c q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c;\n",
+ if( fprintf( rc->p_stat_file_out,
+ "in:%d out:%d type:%c q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:",
h->fenc->i_frame, h->i_frame,
c_type, rc->qpa_rc,
h->stat.frame.i_tex_bits,
@@ -1100,7 +1352,43 @@ void x264_ratecontrol_end( x264_t *h, int bits )
h->stat.frame.i_mb_count_i,
h->stat.frame.i_mb_count_p,
h->stat.frame.i_mb_count_skip,
- c_direct);
+ c_direct) < 0 )
+ goto fail;
+
+ /* Only write information for reference reordering once. */
+ int use_old_stats = h->param.rc.b_stat_read && rc->rce->refs > 1;
+ for( i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref0); i++ )
+ {
+ int refcount = use_old_stats ? rc->rce->refcount[i]
+ : h->param.b_interlaced ? h->stat.frame.i_mb_count_ref[0][i*2]
+ + h->stat.frame.i_mb_count_ref[0][i*2+1]
+ : h->stat.frame.i_mb_count_ref[0][i];
+ if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 )
+ goto fail;
+ }
+
+ if( h->sh.weight[0][0].weightfn )
+ {
+ if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32, h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
+ goto fail;
+ }
+
+ if( fprintf( rc->p_stat_file_out, ";\n") < 0 )
+ goto fail;
+
+ /* Don't re-write the data in multi-pass mode. */
+ if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
+ {
+ uint8_t i_type = h->sh.i_type;
+ int i;
+ /* Values are stored as big-endian FIX8.8 */
+ for( i = 0; i < h->mb.i_mb_count; i++ )
+ rc->qp_buffer[0][i] = endian_fix16( h->fenc->f_qp_offset[i]*256.0 );
+ if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
+ goto fail;
+ if( fwrite( rc->qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
+ goto fail;
+ }
}
if( rc->b_abr )
@@ -1116,9 +1404,6 @@ void x264_ratecontrol_end( x264_t *h, int bits )
rc->cplxr_sum *= rc->cbr_decay;
rc->wanted_bits_window += rc->bitrate / rc->fps;
rc->wanted_bits_window *= rc->cbr_decay;
-
- if( h->param.i_threads == 1 )
- accum_p_qp_update( h, rc->qpa_rc );
}
if( rc->b_2pass )
@@ -1131,20 +1416,20 @@ void x264_ratecontrol_end( x264_t *h, int bits )
if( h->sh.i_type == SLICE_TYPE_B )
{
rc->bframe_bits += bits;
- if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) )
+ if( h->fenc->b_last_minigop_bframe )
{
update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa_rc),
h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
- /* In some cases, such as completely blank scenes, pred_b_from_p can go nuts */
- /* Hackily cap the predictor coeff in case this happens. */
- /* FIXME FIXME FIXME */
- rc->pred_b_from_p->coeff = X264_MIN( rc->pred_b_from_p->coeff, 10. );
rc->bframe_bits = 0;
}
}
}
update_vbv( h, bits );
+ return 0;
+fail:
+ x264_log(h, X264_LOG_ERROR, "ratecontrol_end: stats file could not be written to\n");
+ return -1;
}
/****************************************************************************
@@ -1160,11 +1445,11 @@ static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor
double q;
x264_zone_t *zone = get_zone( h, frame_num );
- q = pow( rce->blurred_complexity, 1 - h->param.rc.f_qcompress );
+ q = pow( rce->blurred_complexity, 1 - rcc->qcompress );
// avoid NaN's in the rc_eq
if(!isfinite(q) || rce->tex_bits + rce->mv_bits == 0)
- q = rcc->last_qscale;
+ q = rcc->last_qscale_for[rce->pict_type];
else
{
rcc->last_rceq = q;
@@ -1252,17 +1537,28 @@ static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q)
static double predict_size( predictor_t *p, double q, double var )
{
- return p->coeff*var / (q*p->count);
+ return (p->coeff*var + p->offset) / (q*p->count);
}
static void update_predictor( predictor_t *p, double q, double var, double bits )
{
+ const double range = 1.5;
if( var < 10 )
return;
- p->count *= p->decay;
- p->coeff *= p->decay;
- p->count ++;
- p->coeff += bits*q / var;
+ double old_coeff = p->coeff / p->count;
+ double new_coeff = bits*q / var;
+ double new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
+ double new_offset = bits*q - new_coeff_clipped * var;
+ if( new_offset >= 0 )
+ new_coeff = new_coeff_clipped;
+ else
+ new_offset = 0;
+ p->count *= p->decay;
+ p->coeff *= p->decay;
+ p->offset *= p->decay;
+ p->count ++;
+ p->coeff += new_coeff;
+ p->offset += new_offset;
}
// update VBV after encoding a frame
@@ -1277,30 +1573,34 @@ static void update_vbv( x264_t *h, int bits )
if( !rcc->b_vbv )
return;
- rct->buffer_fill_final += rct->buffer_rate - bits;
+ rct->buffer_fill_final -= bits;
if( rct->buffer_fill_final < 0 )
- x264_log( h, X264_LOG_WARNING, "VBV underflow (%.0f bits)\n", rct->buffer_fill_final );
- rct->buffer_fill_final = x264_clip3f( rct->buffer_fill_final, 0, rct->buffer_size );
+ x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
+ rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
+ rct->buffer_fill_final += rct->buffer_rate;
+ rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
}
// provisionally update VBV according to the planned size of all frames currently in progress
-static void update_vbv_plan( x264_t *h )
+static void update_vbv_plan( x264_t *h, int overhead )
{
x264_ratecontrol_t *rcc = h->rc;
- rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
- if( h->param.i_threads > 1 )
+ rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
+ if( h->i_thread_frames > 1 )
{
int j = h->rc - h->thread[0]->rc;
int i;
- for( i=1; i<h->param.i_threads; i++ )
+ for( i=1; i<h->i_thread_frames; i++ )
{
- x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+ x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
double bits = t->rc->frame_size_planned;
if( !t->b_thread_active )
continue;
bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
- rcc->buffer_fill += rcc->buffer_rate - bits;
- rcc->buffer_fill = x264_clip3( rcc->buffer_fill, 0, rcc->buffer_size );
+ rcc->buffer_fill -= bits;
+ rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
+ rcc->buffer_fill += rcc->buffer_rate;
+ rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
}
}
}
@@ -1314,49 +1614,104 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
double q0 = q;
/* B-frames are not directly subject to VBV,
- * since they are controlled by the P-frames' QPs.
- * FIXME: in 2pass we could modify previous frames' QP too,
- * instead of waiting for the buffer to fill */
- if( rcc->b_vbv &&
- ( pict_type == SLICE_TYPE_P ||
- ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) )
- {
- if( rcc->buffer_fill/rcc->buffer_size < 0.5 )
- q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 );
- }
+ * since they are controlled by the P-frames' QPs. */
if( rcc->b_vbv && rcc->last_satd > 0 )
{
- /* Now a hard threshold to make sure the frame fits in VBV.
- * This one is mostly for I-frames. */
- double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
- double qf = 1.0;
- if( bits > rcc->buffer_fill/2 )
- qf = x264_clip3f( rcc->buffer_fill/(2*bits), 0.2, 1.0 );
- q /= qf;
- bits *= qf;
- if( bits < rcc->buffer_rate/2 )
- q *= bits*2/rcc->buffer_rate;
- q = X264_MAX( q0, q );
+ /* Lookahead VBV: raise the quantizer as necessary such that no frames in
+ * the lookahead overflow and such that the buffer is in a reasonable state
+ * by the end of the lookahead. */
+ if( h->param.rc.i_lookahead )
+ {
+ int j, iterations, terminate = 0;
+
+ /* Avoid an infinite loop. */
+ for( iterations = 0; iterations < 1000 && terminate != 3; iterations++ )
+ {
+ double frame_q[3];
+ double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+ double buffer_fill_cur = rcc->buffer_fill - cur_bits;
+ double target_fill;
+ frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
+ frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
+ frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor;
+
+ /* Loop over the planned future frames. */
+ for( j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
+ {
+ buffer_fill_cur += rcc->buffer_rate;
+ int i_type = h->fenc->i_planned_type[j];
+ int i_satd = h->fenc->i_planned_satd[j];
+ if( i_type == X264_TYPE_AUTO )
+ break;
+ i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
+ cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
+ buffer_fill_cur -= cur_bits;
+ }
+ /* Try to get to get the buffer at least 50% filled, but don't set an impossible goal. */
+ target_fill = X264_MIN( rcc->buffer_fill + j * rcc->buffer_rate * 0.5, rcc->buffer_size * 0.5 );
+ if( buffer_fill_cur < target_fill )
+ {
+ q *= 1.01;
+ terminate |= 1;
+ continue;
+ }
+ /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
+ target_fill = x264_clip3f( rcc->buffer_fill - j * rcc->buffer_rate * 0.5, rcc->buffer_size * 0.8, rcc->buffer_size );
+ if( rcc->b_vbv_min_rate && buffer_fill_cur > target_fill )
+ {
+ q /= 1.01;
+ terminate |= 2;
+ continue;
+ }
+ break;
+ }
+ }
+ /* Fallback to old purely-reactive algorithm: no lookahead. */
+ else
+ {
+ if( ( pict_type == SLICE_TYPE_P ||
+ ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) &&
+ rcc->buffer_fill/rcc->buffer_size < 0.5 )
+ {
+ q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 );
+ }
+
+ /* Now a hard threshold to make sure the frame fits in VBV.
+ * This one is mostly for I-frames. */
+ double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+ double qf = 1.0;
+ /* For small VBVs, allow the frame to use up the entire VBV. */
+ double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1;
+ /* For single-frame VBVs, request that the frame use up the entire VBV. */
+ double min_fill_factor = rcc->single_frame_vbv ? 1 : 2;
+
+ if( bits > rcc->buffer_fill/max_fill_factor )
+ qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
+ q /= qf;
+ bits *= qf;
+ if( bits < rcc->buffer_rate/min_fill_factor )
+ q *= bits*min_fill_factor/rcc->buffer_rate;
+ q = X264_MAX( q0, q );
+ }
/* Check B-frame complexity, and use up any bits that would
* overflow before the next P-frame. */
- if( h->sh.i_type == SLICE_TYPE_P )
+ if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
{
int nb = rcc->bframes;
+ double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
double pbbits = bits;
double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
double space;
-
- if( bbits > rcc->buffer_rate )
+ if( bbits > rcc->buffer_rate )
nb = 0;
pbbits += nb * bbits;
space = rcc->buffer_fill + (1+nb)*rcc->buffer_rate - rcc->buffer_size;
if( pbbits < space )
{
- q *= X264_MAX( pbbits / space,
- bits / (0.5 * rcc->buffer_size) );
+ q *= X264_MAX( pbbits / space, bits / (0.5 * rcc->buffer_size) );
}
q = X264_MAX( q0-5, q );
}
@@ -1389,9 +1744,9 @@ static float rate_estimate_qscale( x264_t *h )
int pict_type = h->sh.i_type;
double lmin = rcc->lmin[pict_type];
double lmax = rcc->lmax[pict_type];
- int64_t total_bits = 8*(h->stat.i_slice_size[SLICE_TYPE_I]
- + h->stat.i_slice_size[SLICE_TYPE_P]
- + h->stat.i_slice_size[SLICE_TYPE_B]);
+ int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I]
+ + h->stat.i_frame_size[SLICE_TYPE_P]
+ + h->stat.i_frame_size[SLICE_TYPE_B]);
if( rcc->b_2pass )
{
@@ -1434,14 +1789,20 @@ static float rate_estimate_qscale( x264_t *h )
else
q += rcc->pb_offset;
- rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
+ if( rcc->b_2pass && rcc->b_vbv )
+ rcc->frame_size_planned = qscale2bits( &rce, q );
+ else
+ rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
- rcc->last_satd = 0;
+
+ /* For row SATDs */
+ if( rcc->b_vbv )
+ rcc->last_satd = x264_rc_analyse_slice( h );
return qp2qscale(q);
}
else
{
- double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
+ double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
if( rcc->b_2pass )
{
@@ -1451,13 +1812,13 @@ static float rate_estimate_qscale( x264_t *h )
if( rcc->b_vbv )
{
- if( h->param.i_threads > 1 )
+ if( h->i_thread_frames > 1 )
{
int j = h->rc - h->thread[0]->rc;
int i;
- for( i=1; i<h->param.i_threads; i++ )
+ for( i=1; i<h->i_thread_frames; i++ )
{
- x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+ x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
double bits = t->rc->frame_size_planned;
if( !t->b_thread_active )
continue;
@@ -1468,16 +1829,16 @@ static float rate_estimate_qscale( x264_t *h )
}
else
{
- if( h->fenc->i_frame < h->param.i_threads )
+ if( h->fenc->i_frame < h->i_thread_frames )
predicted_bits += (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps;
else
- predicted_bits += (int64_t)(h->param.i_threads - 1) * rcc->bitrate / rcc->fps;
+ predicted_bits += (int64_t)(h->i_thread_frames - 1) * rcc->bitrate / rcc->fps;
}
diff = predicted_bits - (int64_t)rce.expected_bits;
q = rce.new_qscale;
q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
- if( ((h->fenc->i_frame + 1 - h->param.i_threads) >= rcc->fps) &&
+ if( ((h->fenc->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
(rcc->expected_bits_sum > 0))
{
/* Adjust quant based on the difference between
@@ -1505,7 +1866,7 @@ static float rate_estimate_qscale( x264_t *h )
expected_size = qscale2bits(&rce, q);
expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
}
- rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h );
+ rcc->last_satd = x264_rc_analyse_slice( h );
}
q = x264_clip3f( q, lmin, lmax );
}
@@ -1523,7 +1884,7 @@ static float rate_estimate_qscale( x264_t *h )
double wanted_bits, overflow=1, lmin, lmax;
- rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h );
+ rcc->last_satd = x264_rc_analyse_slice( h );
rcc->short_term_cplxsum *= 0.5;
rcc->short_term_cplxcount *= 0.5;
rcc->short_term_cplxsum += rcc->last_satd;
@@ -1544,7 +1905,7 @@ static float rate_estimate_qscale( x264_t *h )
}
else
{
- int i_frame_done = h->fenc->i_frame + 1 - h->param.i_threads;
+ int i_frame_done = h->fenc->i_frame + 1 - h->i_thread_frames;
q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
@@ -1578,10 +1939,11 @@ static float rate_estimate_qscale( x264_t *h )
q = x264_clip3f(q, lmin, lmax);
}
- else if( h->param.rc.i_rc_method == X264_RC_CRF )
+ else if( h->param.rc.i_rc_method == X264_RC_CRF && rcc->qcompress != 1 )
{
q = qp2qscale( ABR_INIT_QP ) / fabs( h->param.rc.f_ip_factor );
}
+ rcc->qp_novbv = qscale2qp(q);
//FIXME use get_diff_limited_q() ?
q = clip_qscale( h, pict_type, q );
@@ -1591,17 +1953,73 @@ static float rate_estimate_qscale( x264_t *h )
rcc->last_qscale = q;
if( !(rcc->b_2pass && !rcc->b_vbv) && h->fenc->i_frame == 0 )
- rcc->last_qscale_for[SLICE_TYPE_P] = q;
+ rcc->last_qscale_for[SLICE_TYPE_P] = q * fabs( h->param.rc.f_ip_factor );
- if( rcc->b_2pass && rcc->b_vbv)
+ if( rcc->b_2pass && rcc->b_vbv )
rcc->frame_size_planned = qscale2bits(&rce, q);
else
rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+
+ /* Always use up the whole VBV in this case. */
+ if( rcc->single_frame_vbv )
+ rcc->frame_size_planned = rcc->buffer_rate;
x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
return q;
}
}
+void x264_threads_distribute_ratecontrol( x264_t *h )
+{
+ int i, row, totalsize = 0;
+ if( h->rc->b_vbv )
+ for( row = 0; row < h->sps->i_mb_height; row++ )
+ totalsize += h->fdec->i_row_satd[row];
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ x264_t *t = h->thread[i];
+ x264_ratecontrol_t *rc = h->rc;
+ memcpy( t->rc, rc, sizeof(x264_ratecontrol_t) );
+ /* Calculate the planned slice size. */
+ if( h->rc->b_vbv && rc->frame_size_planned )
+ {
+ int size = 0;
+ for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
+ size += h->fdec->i_row_satd[row];
+ t->rc->slice_size_planned = size * rc->frame_size_planned / totalsize;
+ }
+ else
+ t->rc->slice_size_planned = 0;
+ }
+}
+
+void x264_threads_merge_ratecontrol( x264_t *h )
+{
+ int i, j, k;
+ x264_ratecontrol_t *rc = h->rc;
+ x264_emms();
+
+ for( i = 1; i < h->param.i_threads; i++ )
+ {
+ x264_ratecontrol_t *t = h->thread[i]->rc;
+ rc->qpa_rc += t->qpa_rc;
+ rc->qpa_aq += t->qpa_aq;
+ for( j = 0; j < 5; j++ )
+ for( k = 0; k < 2; k++ )
+ {
+ rc->row_preds[j][k].coeff += t->row_preds[j][k].coeff;
+ rc->row_preds[j][k].offset += t->row_preds[j][k].offset;
+ rc->row_preds[j][k].count += t->row_preds[j][k].count;
+ }
+ }
+ for( j = 0; j < 5; j++ )
+ for( k = 0; k < 2; k++ )
+ {
+ rc->row_preds[j][k].coeff /= h->param.i_threads;
+ rc->row_preds[j][k].offset /= h->param.i_threads;
+ rc->row_preds[j][k].count /= h->param.i_threads;
+ }
+}
+
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
{
if( cur != prev )
@@ -1621,6 +2039,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
COPY(short_term_cplxcount);
COPY(bframes);
COPY(prev_zone);
+ COPY(qpbuf_pos);
#undef COPY
}
if( cur != next )
@@ -1704,7 +2123,7 @@ static double count_expected_bits( x264_t *h )
return expected_bits;
}
-static void vbv_pass2( x264_t *h )
+static int vbv_pass2( x264_t *h )
{
/* for each interval of buffer_full .. underflow, uniformly increase the qp of all
* frames in the interval until either buffer is full at some intermediate frame or the
@@ -1712,7 +2131,7 @@ static void vbv_pass2( x264_t *h )
* Then do the converse to put bits back into overflow areas until target size is met */
x264_ratecontrol_t *rcc = h->rc;
- double *fills = x264_malloc((rcc->num_entries+1)*sizeof(double));
+ double *fills;
double all_available_bits = h->param.rc.i_bitrate * 1000. * rcc->num_entries / rcc->fps;
double expected_bits = 0;
double adjustment;
@@ -1722,6 +2141,7 @@ static void vbv_pass2( x264_t *h )
double qscale_max = qp2qscale(h->param.rc.i_qp_max);
int iterations = 0;
int adj_min, adj_max;
+ CHECKED_MALLOC( fills, (rcc->num_entries+1)*sizeof(double) );
fills++;
@@ -1753,7 +2173,7 @@ static void vbv_pass2( x264_t *h )
adj_max = fix_underflow(h, t0, t1, 1.001, qscale_min, qscale_max);
expected_bits = count_expected_bits(h);
- } while((expected_bits < .995*all_available_bits) && ((int)(expected_bits+.5) > (int)(prev_bits+.5)) );
+ } while((expected_bits < .995*all_available_bits) && ((int64_t)(expected_bits+.5) > (int64_t)(prev_bits+.5)) );
if (!adj_max)
x264_log( h, X264_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n");
@@ -1763,6 +2183,9 @@ static void vbv_pass2( x264_t *h )
rcc->entry[i].expected_vbv = rcc->buffer_size - fills[i];
x264_free(fills-1);
+ return 0;
+fail:
+ return -1;
}
static int init_pass2( x264_t *h )
@@ -1830,9 +2253,9 @@ static int init_pass2( x264_t *h )
rce->blurred_complexity = cplx_sum / weight_sum;
}
- qscale = x264_malloc(sizeof(double)*rcc->num_entries);
- if(filter_size > 1)
- blurred_qscale = x264_malloc(sizeof(double)*rcc->num_entries);
+ CHECKED_MALLOC( qscale, sizeof(double)*rcc->num_entries );
+ if( filter_size > 1 )
+ CHECKED_MALLOC( blurred_qscale, sizeof(double)*rcc->num_entries );
else
blurred_qscale = qscale;
@@ -1845,7 +2268,11 @@ static int init_pass2( x264_t *h )
expected_bits = 1;
for(i=0; i<rcc->num_entries; i++)
- expected_bits += qscale2bits(&rcc->entry[i], get_qscale(h, &rcc->entry[i], 1.0, i));
+ {
+ double q = get_qscale(h, &rcc->entry[i], 1.0, i);
+ expected_bits += qscale2bits(&rcc->entry[i], q);
+ rcc->last_qscale_for[rcc->entry[i].pict_type] = q;
+ }
step_mult = all_available_bits / expected_bits;
rate_factor = 0;
@@ -1862,6 +2289,7 @@ static int init_pass2( x264_t *h )
for(i=0; i<rcc->num_entries; i++)
{
qscale[i] = get_qscale(h, &rcc->entry[i], rate_factor, i);
+ rcc->last_qscale_for[rcc->entry[i].pict_type] = qscale[i];
}
/* fixed I/B qscale relative to P */
@@ -1914,7 +2342,8 @@ static int init_pass2( x264_t *h )
x264_free(blurred_qscale);
if(rcc->b_vbv)
- vbv_pass2(h);
+ if( vbv_pass2( h ) )
+ return -1;
expected_bits = count_expected_bits(h);
if(fabs(expected_bits/all_available_bits - 1.0) > 0.01)
@@ -1949,6 +2378,6 @@ static int init_pass2( x264_t *h )
}
return 0;
+fail:
+ return -1;
}
-
-
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index 3310d3c..5a8d088 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -29,16 +29,22 @@ void x264_ratecontrol_delete( x264_t * );
void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
void x264_adaptive_quant( x264_t * );
+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
+int x264_reference_build_list_optimal( x264_t *h );
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
-void x264_ratecontrol_start( x264_t *, int i_force_qp );
+void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
int x264_ratecontrol_slice_type( x264_t *, int i_frame );
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
void x264_ratecontrol_mb( x264_t *, int bits );
int x264_ratecontrol_qp( x264_t * );
-void x264_ratecontrol_end( x264_t *, int bits );
+int x264_ratecontrol_end( x264_t *, int bits );
void x264_ratecontrol_summary( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int x264_ratecontrol_get_estimated_size( x264_t const *);
int x264_rc_analyse_slice( x264_t *h );
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_threads_distribute_ratecontrol( x264_t *h );
+void x264_threads_merge_ratecontrol( x264_t *h );
#endif
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 480d71a..3ed4a47 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -52,10 +52,9 @@ static uint16_t cabac_size_5ones[128];
#undef x264_cabac_encode_decision_noup
#define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
#define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
-#define x264_cabac_encode_terminal(c) x264_cabac_size_decision_noup(c,276,0)
+#define x264_cabac_encode_terminal(c) ((c)->f8_bits_encoded += 7)
#define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
-#define x264_cabac_encode_flush(h,c)
#define x264_macroblock_write_cabac static x264_macroblock_size_cabac
#include "cabac.c"
@@ -105,7 +104,7 @@ static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
{
- DECLARE_ALIGNED_16(static uint8_t zero[16]);
+ ALIGNED_16(static uint8_t zero[16]);
int satd = 0;
uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
@@ -124,16 +123,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
}
- satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
+ satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
}
return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
}
static inline int ssd_mb( x264_t *h )
{
- return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
- + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
- + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
+ int chromassd = ssd_plane(h, PIXEL_8x8, 1, 0, 0) + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
+ chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+ return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chromassd;
}
static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
@@ -141,6 +140,7 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
int b_transform_bak = h->mb.b_transform_8x8;
int i_ssd;
int i_bits;
+ int type_bak = h->mb.i_type;
x264_macroblock_encode( h );
@@ -159,13 +159,12 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
}
else
{
- bs_t bs_tmp = h->out.bs;
- bs_tmp.i_bits_encoded = 0;
- x264_macroblock_size_cavlc( h, &bs_tmp );
- i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
+ x264_macroblock_size_cavlc( h );
+ i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
}
h->mb.b_transform_8x8 = b_transform_bak;
+ h->mb.i_type = type_bak;
return i_ssd + i_bits;
}
@@ -203,12 +202,11 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
{
uint64_t i_ssd, i_bits;
int i8 = i4 >> 2;
+ int chromassd;
if( i_pixel == PIXEL_16x16 )
{
- int type_bak = h->mb.i_type;
int i_cost = x264_rd_cost_mb( h, i_lambda2 );
- h->mb.i_type = type_bak;
return i_cost;
}
@@ -223,9 +221,10 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
if( i_pixel == PIXEL_8x16 )
x264_macroblock_encode_p8x8( h, i8+2 );
- i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 )
- + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
- + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
+ chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
+ + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
+ chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+ i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 ) + chromassd;
if( h->param.b_cabac )
{
@@ -357,31 +356,6 @@ void x264_rdo_init( void )
}
}
-// should the intra and inter lambdas be different?
-// I'm just matching the behaviour of deadzone quant.
-static const int lambda2_tab[2][52] = {
- // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
- { 46, 58, 73, 92, 117, 147,
- 185, 233, 294, 370, 466, 587,
- 740, 932, 1174, 1480, 1864, 2349,
- 2959, 3728, 4697, 5918, 7457, 9395,
- 11837, 14914, 18790, 23674, 29828, 37581,
- 47349, 59656, 75163, 94699, 119313, 150326,
- 189399, 238627, 300652, 378798, 477255, 601304,
- 757596, 954511, 1202608, 1515192, 1909022, 2405217,
- 3030384, 3818045, 4810435, 6060769 },
- // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
- { 27, 34, 43, 54, 68, 86,
- 108, 136, 172, 216, 273, 343,
- 433, 545, 687, 865, 1090, 1374,
- 1731, 2180, 2747, 3461, 4361, 5494,
- 6922, 8721, 10988, 13844, 17442, 21976,
- 27688, 34885, 43953, 55377, 69771, 87906,
- 110755, 139543, 175813, 221511, 279087, 351627,
- 443023, 558174, 703255, 886046, 1116348, 1406511,
- 1772093, 2232697, 2813022, 3544186 }
-};
-
typedef struct {
int64_t score;
int level_idx; // index into level_tree[]
@@ -540,7 +514,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
/* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
{
- int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
+ int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];
int predicted_coef = orig_coef - i_coef * signs[i];
int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
@@ -584,7 +558,14 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
}
- n.score += ssd;
+ if( j || i || dc )
+ n.score += ssd;
+ /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
+ else
+ {
+ d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15);
+ n.score += (int64_t)d*d * coef_weight[i];
+ }
/* save the node if it's better than any existing node with the same cabac ctx */
if( n.score < nodes_cur[node_ctx].score )
@@ -610,11 +591,13 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
}
j = bnode->level_idx;
- for( i = b_ac; i < i_coefs; i++ )
+ for( i = b_ac; j; i++ )
{
dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
j = level_tree[j].next;
}
+ for( ; i < i_coefs; i++ )
+ dct[zigzag[i]] = 0;
return 1;
}
@@ -622,32 +605,32 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra )
+ int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma )
{
- return quant_trellis_cabac( h, (int16_t*)dct,
+ return quant_trellis_cabac( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
- i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+ i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
}
-int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+ int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx )
{
int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
- return quant_trellis_cabac( h, (int16_t*)dct,
+ return quant_trellis_cabac( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
x264_dct4_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan4[h->mb.b_interlaced],
- i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
+ i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
}
-int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
int i_qp, int b_intra, int idx )
{
- return quant_trellis_cabac( h, (int16_t*)dct,
+ return quant_trellis_cabac( h, dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
x264_dct8_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan8[h->mb.b_interlaced],
- DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );
+ DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
}
diff --git a/encoder/set.c b/encoder/set.c
index 3103fcd..f79919b 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -24,9 +24,6 @@
#include <math.h>
#include "common/common.h"
-#ifndef _MSC_VER
-#include "config.h"
-#endif
#include "set.h"
#define bs_write_ue bs_write_ue_big
@@ -83,7 +80,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE;
else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
sps->i_profile_idc = PROFILE_HIGH;
- else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced )
+ else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->analyse.i_weighted_pred > 0 )
sps->i_profile_idc = PROFILE_MAIN;
else
sps->i_profile_idc = PROFILE_BASELINE;
@@ -97,11 +94,9 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
sps->b_constraint_set2 = 0;
sps->i_log2_max_frame_num = 4; /* at least 4 */
- while( (1 << sps->i_log2_max_frame_num) <= param->i_keyint_max )
- {
+ while( (1 << sps->i_log2_max_frame_num) <= param->i_keyint_max && sps->i_log2_max_frame_num < 10 )
sps->i_log2_max_frame_num++;
- }
- sps->i_log2_max_frame_num++; /* just in case */
+ sps->i_log2_max_frame_num++;
sps->i_poc_type = 0;
if( sps->i_poc_type == 0 )
@@ -185,19 +180,21 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
}
sps->vui.b_timing_info_present = 0;
- if( param->i_fps_num > 0 && param->i_fps_den > 0)
+ if( param->i_timebase_num > 0 && param->i_timebase_den > 0 )
{
sps->vui.b_timing_info_present = 1;
- sps->vui.i_num_units_in_tick = param->i_fps_den;
- sps->vui.i_time_scale = param->i_fps_num * 2;
- sps->vui.b_fixed_frame_rate = 1;
+ sps->vui.i_num_units_in_tick = param->i_timebase_num;
+ sps->vui.i_time_scale = param->i_timebase_den * 2;
+ sps->vui.b_fixed_frame_rate = !param->b_vfr_input;
}
- sps->vui.i_num_reorder_frames = param->b_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
+ sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
/* extra slot with pyramid so that we don't have to override the
* order of forgetting old pictures */
sps->vui.i_max_dec_frame_buffering =
- sps->i_num_ref_frames = X264_MIN(16, X264_MAX(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames));
+ sps->i_num_ref_frames = X264_MIN(16, X264_MAX3(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
+ param->i_bframe_pyramid ? 4 : 1 ));
+ sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
sps->vui.b_bitstream_restriction = 1;
if( sps->vui.b_bitstream_restriction )
@@ -213,6 +210,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
void x264_sps_write( bs_t *s, x264_sps_t *sps )
{
+ bs_realign( s );
bs_write( s, 8, sps->i_profile_idc );
bs_write( s, 1, sps->b_constraint_set0 );
bs_write( s, 1, sps->b_constraint_set1 );
@@ -362,6 +360,7 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
}
bs_rbsp_trailing( s );
+ bs_flush( s );
}
void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
@@ -372,13 +371,13 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
pps->i_sps_id = sps->i_id;
pps->b_cabac = param->b_cabac;
- pps->b_pic_order = 0;
+ pps->b_pic_order = param->b_interlaced;
pps->i_num_slice_groups = 1;
pps->i_num_ref_idx_l0_active = 1;
pps->i_num_ref_idx_l1_active = 1;
- pps->b_weighted_pred = 0;
+ pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0;
pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 : param->rc.i_qp_constant;
@@ -386,7 +385,7 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
pps->b_deblocking_filter_control = 1;
- pps->b_constrained_intra_pred = 0;
+ pps->b_constrained_intra_pred = param->b_constrained_intra;
pps->b_redundant_pic_cnt = 0;
pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 1 : 0;
@@ -426,6 +425,7 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
void x264_pps_write( bs_t *s, x264_pps_t *pps )
{
+ bs_realign( s );
bs_write_ue( s, pps->i_id );
bs_write_ue( s, pps->i_sps_id );
@@ -468,9 +468,29 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps )
}
bs_rbsp_trailing( s );
+ bs_flush( s );
}
-void x264_sei_version_write( x264_t *h, bs_t *s )
+void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
+{
+ int payload_size;
+
+ bs_realign( s );
+ bs_write( s, 8, 0x06 ); // payload_type = Recovery Point
+ payload_size = bs_size_ue( recovery_frame_cnt ) + 4;
+
+ bs_write( s, 8, (payload_size + 7) / 8);
+ bs_write_ue( s, recovery_frame_cnt ); // recovery_frame_cnt
+ bs_write( s, 1, 1 ); //exact_match_flag 1
+ bs_write( s, 1, 0 ); //broken_link_flag 0
+ bs_write( s, 2, 0 ); //changing_slice_group 0
+
+ bs_align_10( s );
+ bs_rbsp_trailing( s );
+ bs_flush( s );
+}
+
+int x264_sei_version_write( x264_t *h, bs_t *s )
{
int i;
// random ID number generated according to ISO-11578
@@ -479,14 +499,19 @@ void x264_sei_version_write( x264_t *h, bs_t *s )
0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
};
char *opts = x264_param2string( &h->param, 0 );
- char *version = x264_malloc( 200 + strlen(opts) );
+ char *version;
int length;
+ if( !opts )
+ return -1;
+ CHECKED_MALLOC( version, 200 + strlen( opts ) );
+
sprintf( version, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
"Copyleft 2003-2009 - http://www.videolan.org/x264.html - options: %s",
X264_BUILD, X264_VERSION, opts );
length = strlen(version)+1+16;
+ bs_realign( s );
bs_write( s, 8, 0x5 ); // payload_type = user_data_unregistered
// payload_size
for( i = 0; i <= length-255; i += 255 )
@@ -499,9 +524,14 @@ void x264_sei_version_write( x264_t *h, bs_t *s )
bs_write( s, 8, version[i] );
bs_rbsp_trailing( s );
+ bs_flush( s );
x264_free( opts );
x264_free( version );
+ return 0;
+fail:
+ x264_free( opts );
+ return -1;
}
const x264_level_t x264_levels[] =
@@ -536,7 +566,7 @@ int x264_validate_levels( x264_t *h, int verbose )
{
int ret = 0;
int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
- int dpb = mbs * 384 * h->sps->i_num_ref_frames;
+ int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
const x264_level_t *l = x264_levels;
@@ -550,7 +580,7 @@ int x264_validate_levels( x264_t *h, int verbose )
h->sps->i_mb_width, h->sps->i_mb_height, l->frame_size );
if( dpb > l->dpb )
ERROR( "DPB size (%d frames, %d bytes) > level limit (%d frames, %d bytes)\n",
- h->sps->i_num_ref_frames, dpb, (int)(l->dpb / (384*mbs)), l->dpb );
+ h->sps->vui.i_max_dec_frame_buffering, dpb, (int)(l->dpb / (384*mbs)), l->dpb );
#define CHECK( name, limit, val ) \
if( (val) > (limit) ) \
diff --git a/encoder/set.h b/encoder/set.h
index 3611c9a..125f7e1 100644
--- a/encoder/set.h
+++ b/encoder/set.h
@@ -28,7 +28,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
void x264_sps_write( bs_t *s, x264_sps_t *sps );
void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
void x264_pps_write( bs_t *s, x264_pps_t *pps );
-void x264_sei_version_write( x264_t *h, bs_t *s );
+void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
+int x264_sei_version_write( x264_t *h, bs_t *s );
int x264_validate_levels( x264_t *h, int verbose );
#endif
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 2c16429..057f6a6 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -3,8 +3,9 @@
*****************************************************************************
* Copyright (C) 2005-2008 x264 project
*
- * Authors: Loren Merritt <lorenm at u.washington.edu>
- * Jason Garrett-Glaser <darkshikari at gmail.com>
+ * Authors: Jason Garrett-Glaser <darkshikari at gmail.com>
+ * Loren Merritt <lorenm at u.washington.edu>
+ * Dylan Yudaken <dyudaken at gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,27 +23,227 @@
*****************************************************************************/
#include <math.h>
-#include <limits.h>
#include "common/common.h"
#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
+static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
+ x264_frame_t **frames, int p0, int p1, int b,
+ int b_intra_penalty );
static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
- a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
+ a->i_qp = X264_LOOKAHEAD_QP;
a->i_lambda = x264_lambda_tab[ a->i_qp ];
x264_mb_analyse_load_costs( h, a );
- h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
- h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
+ if( h->param.analyse.i_subpel_refine > 1 )
+ {
+ h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method );
+ h->mb.i_subpel_refine = 4;
+ }
+ else
+ {
+ h->mb.i_me_method = X264_ME_DIA;
+ h->mb.i_subpel_refine = 2;
+ }
h->mb.b_chroma_me = 0;
}
+/* makes a non-h264 weight (i.e. fix7), into an h264 weight */
+static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_weight_t *w )
+{
+ w->i_offset = offset;
+ w->i_denom = 7;
+ w->i_scale = weight_nonh264;
+ while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) )
+ {
+ w->i_denom--;
+ w->i_scale >>= 1;
+ }
+ w->i_scale = X264_MIN( w->i_scale, 127 );
+}
+
+void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
+{
+ int x,y;
+ uint32_t sad = 0;
+ uint64_t ssd = 0;
+ uint8_t *p = frame->plane[0];
+ int stride = frame->i_stride[0];
+ int width = frame->i_width[0];
+ int height = frame->i_lines[0];
+ for( y = 0; y < height>>4; y++, p += stride*16 )
+ for( x = 0; x < width; x += 16 )
+ {
+ uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
+ sad += (uint32_t)res;
+ ssd += res >> 32;
+ }
+ frame->i_pixel_sum = sad;
+ frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
+}
+
+static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
+{
+ int ref0_distance = fenc->i_frame - ref->i_frame - 1;
+ /* Note: this will never run during lookahead as weights_analyse is only called if no
+ * motion search has been done. */
+ if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
+ {
+ int i_stride = fenc->i_stride_lowres;
+ int i_lines = fenc->i_lines_lowres;
+ int i_width = fenc->i_width_lowres;
+ int i_mb_xy = 0;
+ int x,y;
+ uint8_t *p = dest;
+
+ for( y = 0; y < i_lines; y += 8, p += i_stride*8 )
+ for( x = 0; x < i_width; x += 8, i_mb_xy++ )
+ {
+ int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
+ int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
+ h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride,
+ mvx+(x<<2), mvy+(y<<2), 8, 8, weight_none );
+ }
+ x264_emms();
+ return dest;
+ }
+ x264_emms();
+ return ref->lowres[0];
+}
+
+static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, x264_weight_t *w )
+{
+ int x, y;
+ unsigned int cost = 0;
+ int i_stride = fenc->i_stride_lowres;
+ int i_lines = fenc->i_lines_lowres;
+ int i_width = fenc->i_width_lowres;
+ uint8_t *fenc_plane = fenc->lowres[0];
+ ALIGNED_ARRAY_8( uint8_t, buf,[8*8] );
+ int pixoff = 0;
+ int i_mb = 0;
+
+ if( w )
+ {
+ for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+ for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
+ {
+ w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
+ cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+ }
+ /* Add cost of weights in the slice header. */
+ int numslices;
+ if( h->param.i_slice_count )
+ numslices = h->param.i_slice_count;
+ else if( h->param.i_slice_max_mbs )
+ numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
+ else
+ numslices = 1;
+ /* FIXME: find a way to account for --slice-max-size?
+ * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
+ * Since using lowres frames, assume lambda = 1. */
+ cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
+ }
+ else
+ for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+ for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+ cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+ x264_emms();
+ return cost;
+}
+
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
+{
+ float fenc_mean, ref_mean, fenc_var, ref_var;
+ int i_off, offset_search;
+ int minoff, minscale, mindenom;
+ unsigned int minscore, origscore;
+ int i_delta_index = fenc->i_frame - ref->i_frame - 1;
+ /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
+ const float epsilon = 1.0/128.0;
+ float guess_scale;
+ int found;
+ x264_weight_t *weights = fenc->weight[0];
+
+ fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
+ ref_var = round( sqrt( ref->i_pixel_ssd ) );
+ fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+ ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+
+ //early termination
+ if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
+ {
+ SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+ return;
+ }
+
+ guess_scale = ref_var ? fenc_var/ref_var : 0;
+ x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[0] );
+
+ found = 0;
+ mindenom = weights[0].i_denom;
+ minscale = weights[0].i_scale;
+ minoff = 0;
+ offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
+
+ if( !fenc->b_intra_calculated )
+ {
+ x264_mb_analysis_t a;
+ x264_lowres_context_init( h, &a );
+ x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
+ }
+ uint8_t *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
+ origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0 );
+
+ if( !minscore )
+ {
+ SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+ return;
+ }
+
+ // This gives a slight improvement due to rounding errors but only tests
+ // one offset on lookahead.
+ // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
+ for( i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
+ {
+ SET_WEIGHT( weights[0], 1, minscale, mindenom, i_off );
+ unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0] );
+ COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
+ }
+ x264_emms();
+
+ /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
+ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+ if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
+ {
+ SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+ return;
+ }
+ else
+ SET_WEIGHT( weights[0], 1, minscale, mindenom, minoff );
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn )
+ fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
+
+ if( weights[0].weightfn && b_lookahead )
+ {
+ //scale lowres in lookahead for slicetype_frame_cost
+ uint8_t *src = ref->buffer_lowres[0];
+ uint8_t *dst = h->mb.p_weight_buf[0];
+ int width = ref->i_width_lowres + PADH*2;
+ int height = ref->i_lines_lowres + PADV*2;
+ x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
+ width, height, &weights[0] );
+ fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
+ }
+}
+
static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
- x264_frame_t **frames, int p0, int p1, int b,
- int dist_scale_factor, int do_search[2] )
+ x264_frame_t **frames, int p0, int p1, int b,
+ int dist_scale_factor, int do_search[2], const x264_weight_t *w )
{
x264_frame_t *fref0 = frames[p0];
x264_frame_t *fref1 = frames[p1];
@@ -53,21 +254,22 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
const int i_mb_stride = h->sps->i_mb_width;
const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
const int i_stride = fenc->i_stride_lowres;
- const int i_pel_offset = 8 * ( i_mb_x + i_mb_y * i_stride );
+ const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride);
const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
int16_t (*fenc_mvs[2])[2] = { &frames[b]->lowres_mvs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mvs[1][p1-b-1][i_mb_xy] };
int (*fenc_costs[2]) = { &frames[b]->lowres_mv_costs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
- DECLARE_ALIGNED_8( uint8_t pix1[9*FDEC_STRIDE] );
+ ALIGNED_ARRAY_8( uint8_t, pix1,[9*FDEC_STRIDE] );
uint8_t *pix2 = pix1+8;
x264_me_t m[2];
int i_bcost = COST_MAX;
int l, i;
+ int list_used = 0;
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );
- if( !p0 && !p1 && !b )
+ if( p0 == p1 )
goto lowres_intra_mb;
// no need for h->mb.mv_min[]
@@ -90,6 +292,9 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
(dst)[2] = &(src)[2][i_pel_offset]; \
(dst)[3] = &(src)[3][i_pel_offset]; \
}
+#define LOAD_WPELS_LUMA(dst,src) \
+ (dst) = &(src)[i_pel_offset];
+
#define CLIP_MV( mv ) \
{ \
mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
@@ -97,33 +302,54 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
}
#define TRY_BIDIR( mv0, mv1, penalty ) \
{ \
- int stride1 = 16, stride2 = 16; \
- uint8_t *src1, *src2; \
int i_cost; \
- src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
- (mv0)[0], (mv0)[1], 8, 8 ); \
- src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
- (mv1)[0], (mv1)[1], 8, 8 ); \
- h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
+ if( h->param.analyse.i_subpel_refine <= 1 ) \
+ { \
+ int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
+ int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
+ uint8_t *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
+ uint8_t *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
+ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
+ } \
+ else \
+ { \
+ int stride1 = 16, stride2 = 16; \
+ uint8_t *src1, *src2; \
+ src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
+ (mv0)[0], (mv0)[1], 8, 8, w ); \
+ src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
+ (mv1)[0], (mv1)[1], 8, 8, w ); \
+ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
+ } \
i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
- if( i_bcost > i_cost ) \
- i_bcost = i_cost; \
+ COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
}
m[0].i_pixel = PIXEL_8x8;
m[0].p_cost_mv = a->p_cost_mv;
m[0].i_stride[0] = i_stride;
m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
+ m[0].weight = w;
+ m[0].i_ref = 0;
LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres );
+ m[0].p_fref_w = m[0].p_fref[0];
+ if( w[0].weightfn )
+ LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );
if( b_bidir )
{
int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
- int dmv[2][2];
-
- h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
+ ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] );
+
+ m[1].i_pixel = PIXEL_8x8;
+ m[1].p_cost_mv = a->p_cost_mv;
+ m[1].i_stride[0] = i_stride;
+ m[1].p_fenc[0] = h->mb.pic.p_fenc[0];
+ m[1].i_ref = 0;
+ m[1].weight = weight_none;
LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
+ m[1].p_fref_w = m[1].p_fref[0];
dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
@@ -131,15 +357,16 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
dmv[1][1] = dmv[0][1] - mvr[1];
CLIP_MV( dmv[0] );
CLIP_MV( dmv[1] );
+ if( h->param.analyse.i_subpel_refine <= 1 )
+ M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */
TRY_BIDIR( dmv[0], dmv[1], 0 );
- if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] )
+ if( M64( dmv ) )
{
int i_cost;
h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
- if( i_bcost > i_cost )
- i_bcost = i_cost;
+ COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
}
}
@@ -149,13 +376,13 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
{
int i_mvc = 0;
int16_t (*fenc_mv)[2] = fenc_mvs[l];
- DECLARE_ALIGNED_4( int16_t mvc[4][2] );
+ ALIGNED_4( int16_t mvc[4][2] );
/* Reverse-order MV prediction. */
- *(uint32_t*)mvc[0] = 0;
- *(uint32_t*)mvc[1] = 0;
- *(uint32_t*)mvc[2] = 0;
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
+ M32( mvc[0] ) = 0;
+ M32( mvc[1] ) = 0;
+ M32( mvc[2] ) = 0;
+#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
if( i_mb_x < h->sps->i_mb_width - 1 )
MVC(fenc_mv[1]);
if( i_mb_y < h->sps->i_mb_height - 1 )
@@ -171,35 +398,39 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_me_search( h, &m[l], mvc, i_mvc );
m[l].cost -= 2; // remove mvcost from skip mbs
- if( *(uint32_t*)m[l].mv )
+ if( M32( m[l].mv ) )
m[l].cost += 5;
- *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+ CP32( fenc_mvs[l], m[l].mv );
*fenc_costs[l] = m[l].cost;
}
else
{
- *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+ CP32( m[l].mv, fenc_mvs[l] );
m[l].cost = *fenc_costs[l];
}
- i_bcost = X264_MIN( i_bcost, m[l].cost );
+ COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
}
- if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
+ if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
+ /* Store to width-2 bitfield. */
+ frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
+ frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
+
lowres_intra_mb:
/* forbid intra-mbs in B-frames, because it's rare and not worth checking */
/* FIXME: Should we still forbid them now that we cache intra scores? */
- if( !b_bidir )
+ if( !b_bidir || h->param.rc.b_mb_tree )
{
int i_icost, b_intra;
if( !fenc->b_intra_calculated )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
const int intra_penalty = 5;
- int satds[4];
+ int satds[3];
memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
for( i=0; i<8; i++ )
@@ -207,29 +438,30 @@ lowres_intra_mb:
pix++;
if( h->pixf.intra_mbcmp_x3_8x8c )
- {
h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
- h->predict_8x8c[I_PRED_CHROMA_P]( pix );
- satds[I_PRED_CHROMA_P] =
- h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
- }
else
{
- for( i=0; i<4; i++ )
+ for( i=0; i<3; i++ )
{
h->predict_8x8c[i]( pix );
satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
}
}
- i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
+ i_icost = X264_MIN3( satds[0], satds[1], satds[2] );
- h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
- for( i=3; i<9; i++ )
+ if( h->param.analyse.i_subpel_refine > 1 )
{
- int satd;
- h->predict_8x8[i]( pix, edge );
- satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ h->predict_8x8c[I_PRED_CHROMA_P]( pix );
+ int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
+ h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+ for( i=3; i<9; i++ )
+ {
+ int satd;
+ h->predict_8x8[i]( pix, edge );
+ satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ i_icost = X264_MIN( i_icost, satd );
+ }
}
i_icost += intra_penalty;
@@ -237,18 +469,25 @@ lowres_intra_mb:
}
else
i_icost = fenc->i_intra_cost[i_mb_xy];
- b_intra = i_icost < i_bcost;
- if( b_intra )
- i_bcost = i_icost;
- if( (i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
- && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1)
- || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+ if( !b_bidir )
{
- fenc->i_intra_mbs[b-p0] += b_intra;
- fenc->i_cost_est[0][0] += i_icost;
+ b_intra = i_icost < i_bcost;
+ if( b_intra )
+ i_bcost = i_icost;
+ if( (i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
+ && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1)
+ || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+ {
+ fenc->i_intra_mbs[b-p0] += b_intra;
+ fenc->i_cost_est[0][0] += i_icost;
+ if( h->param.rc.i_aq_mode )
+ fenc->i_cost_est_aq[0][0] += (i_icost * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
+ }
}
}
+ fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
+
return i_bcost;
}
#undef TRY_BIDIR
@@ -259,14 +498,15 @@ lowres_intra_mb:
h->sps->i_mb_width * h->sps->i_mb_height)
static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
- x264_frame_t **frames, int p0, int p1, int b,
- int b_intra_penalty )
+ x264_frame_t **frames, int p0, int p1, int b,
+ int b_intra_penalty )
{
+
int i_score = 0;
/* Don't use the AQ'd scores for slicetype decision. */
int i_score_aq = 0;
int do_search[2];
-
+ const x264_weight_t *w = weight_none;
/* Check whether we already evaluated this frame
* If we have tried this frame as P, then we have also tried
* the preceding frames as B. (is this still true?) */
@@ -283,13 +523,24 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
/* For each list, check to see whether we have lowres motion-searched this reference frame before. */
do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
- if( do_search[0] ) frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ if( do_search[0] )
+ {
+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
+ || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
+ {
+ x264_emms();
+ x264_weights_analyse( h, frames[b], frames[p0], 1 );
+ w = frames[b]->weight[0];
+ }
+ frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ }
if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
if( b == p1 )
{
frames[b]->i_intra_mbs[b-p0] = 0;
frames[b]->i_cost_est[0][0] = 0;
+ frames[b]->i_cost_est_aq[0][0] = 0;
}
if( p1 != p0 )
dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
@@ -299,14 +550,15 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
/* the edge mbs seem to reduce the predictive quality of the
* whole frame's score, but are needed for a spatial distribution. */
- if( h->param.rc.i_vbv_buffer_size || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+ if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
+ h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
{
for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
{
row_satd[ h->mb.i_mb_y ] = 0;
for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
int i_mb_cost_aq = i_mb_cost;
if( h->param.rc.i_aq_mode )
i_mb_cost_aq = (i_mb_cost_aq * frames[b]->i_inv_qscale_factor[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride] + 128) >> 8;
@@ -327,7 +579,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
for( h->mb.i_mb_y = h->sps->i_mb_height - 2; h->mb.i_mb_y > 0; h->mb.i_mb_y-- )
for( h->mb.i_mb_x = h->sps->i_mb_width - 2; h->mb.i_mb_x > 0; h->mb.i_mb_x-- )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
int i_mb_cost_aq = i_mb_cost;
if( h->param.rc.i_aq_mode )
i_mb_cost_aq = (i_mb_cost_aq * frames[b]->i_inv_qscale_factor[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride] + 128) >> 8;
@@ -337,7 +589,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
}
if( b != p1 )
- i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+ i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
else
frames[b]->b_intra_calculated = 1;
@@ -355,7 +607,250 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
return i_score;
}
-#define MAX_LENGTH (X264_BFRAME_MAX*4)
+/* If MB-tree changes the quantizers, we need to recalculate the frame cost without
+ * re-running lookahead. */
+static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b )
+{
+ int i_score = 0;
+ int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
+ float *qp_offset = IS_X264_TYPE_B(frames[b]->i_type) ? frames[b]->f_qp_offset_aq : frames[b]->f_qp_offset;
+ x264_emms();
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+ {
+ row_satd[ h->mb.i_mb_y ] = 0;
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
+ {
+ int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
+ int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy];
+ float qp_adj = qp_offset[i_mb_xy];
+ i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
+ row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+ if( (h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
+ h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1) ||
+ h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+ {
+ i_score += i_mb_cost;
+ }
+ }
+ }
+ return i_score;
+}
+
+static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
+{
+ int mb_index;
+ x264_emms();
+ float weightdelta = 0.0;
+ if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
+ weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
+
+ /* Allow the strength to be adjusted via qcompress, since the two
+ * concepts are very similar. */
+ float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
+ for( mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
+ {
+ int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8;
+ if( intra_cost )
+ {
+ int propagate_cost = frame->i_propagate_cost[mb_index];
+ float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
+ frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
+ }
+ }
+}
+
+static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced )
+{
+ uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
+ int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+ int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
+ int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
+ int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
+ int *buf = h->scratch_buffer;
+ uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+
+ /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
+ if( !referenced )
+ memset( frames[b]->i_propagate_cost, 0, h->sps->i_mb_width * sizeof(uint16_t) );
+
+ for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+ {
+ int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
+ h->mc.mbtree_propagate_cost( buf, propagate_cost,
+ frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+ frames[b]->i_inv_qscale_factor+mb_index, h->sps->i_mb_width );
+ if( referenced )
+ propagate_cost += h->sps->i_mb_width;
+ for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++, mb_index++ )
+ {
+ int propagate_amount = buf[h->mb.i_mb_x];
+ /* Don't propagate for an intra block. */
+ if( propagate_amount > 0 )
+ {
+ /* Access width-2 bitfield. */
+ int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
+ int list;
+ /* Follow the MVs to the previous frame(s). */
+ for( list = 0; list < 2; list++ )
+ if( (lists_used >> list)&1 )
+ {
+ int x = mvs[list][mb_index][0];
+ int y = mvs[list][mb_index][1];
+ int listamount = propagate_amount;
+ int mbx = (x>>5)+h->mb.i_mb_x;
+ int mby = (y>>5)+h->mb.i_mb_y;
+ int idx0 = mbx + mby*h->mb.i_mb_stride;
+ int idx1 = idx0 + 1;
+ int idx2 = idx0 + h->mb.i_mb_stride;
+ int idx3 = idx0 + h->mb.i_mb_stride + 1;
+ x &= 31;
+ y &= 31;
+ int idx0weight = (32-y)*(32-x);
+ int idx1weight = (32-y)*x;
+ int idx2weight = y*(32-x);
+ int idx3weight = y*x;
+
+ /* Apply bipred weighting. */
+ if( lists_used == 3 )
+ listamount = (listamount * bipred_weights[list] + 32) >> 6;
+
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
+
+ /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
+ * be counted. */
+ if( mbx < h->sps->i_mb_width-1 && mby < h->sps->i_mb_height-1 && mbx >= 0 && mby >= 0 )
+ {
+ CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
+ CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
+ CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
+ CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
+ }
+ else /* Check offsets individually */
+ {
+ if( mbx < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx >= 0 && mby >= 0 )
+ CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
+ if( mbx+1 < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx+1 >= 0 && mby >= 0 )
+ CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
+ if( mbx < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx >= 0 && mby+1 >= 0 )
+ CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
+ if( mbx+1 < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
+ CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
+ }
+ }
+ }
+ }
+ }
+
+ if( h->param.rc.i_vbv_buffer_size && referenced )
+ x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 );
+}
+
+static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
+{
+ int i, idx = !b_intra;
+ int last_nonb, cur_nonb = 1;
+ int bframes = 0;
+ if( b_intra )
+ x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 );
+
+ i = num_frames-1;
+ while( i > 0 && frames[i]->i_type == X264_TYPE_B )
+ i--;
+ last_nonb = i;
+
+ if( last_nonb < idx )
+ return;
+
+ memset( frames[last_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
+ while( i-- > idx )
+ {
+ cur_nonb = i;
+ while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 )
+ cur_nonb--;
+ if( cur_nonb < idx )
+ break;
+ x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, last_nonb, 0 );
+ memset( frames[cur_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
+ bframes = last_nonb - cur_nonb - 1;
+ if( h->param.i_bframe_pyramid && bframes > 1 )
+ {
+ int middle = (bframes + 1)/2 + cur_nonb;
+ x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, middle, 0 );
+ memset( frames[middle]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
+ while( i > cur_nonb )
+ {
+ int p0 = i > middle ? middle : cur_nonb;
+ int p1 = i < middle ? middle : last_nonb;
+ if( i != middle )
+ {
+ x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
+ x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 );
+ }
+ i--;
+ }
+ x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 );
+ }
+ else
+ {
+ while( i > cur_nonb )
+ {
+ x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
+ x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 );
+ i--;
+ }
+ }
+ x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 );
+ last_nonb = cur_nonb;
+ }
+
+ x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
+ if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
+ x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 );
+}
+
+static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
+{
+ int cost = x264_slicetype_frame_cost( h, a, frames, p0, p1, b, 0 );
+ if( h->param.rc.i_aq_mode )
+ {
+ if( h->param.rc.b_mb_tree )
+ return x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
+ else
+ return frames[b]->i_cost_est_aq[b-p0][p1-b];
+ }
+ return cost;
+}
+
+static void x264_vbv_lookahead( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int keyframe )
+{
+ int last_nonb = 0, cur_nonb = 1, next_nonb, i, idx = 0;
+ while( cur_nonb < num_frames && frames[cur_nonb]->i_type == X264_TYPE_B )
+ cur_nonb++;
+ next_nonb = keyframe ? last_nonb : cur_nonb;
+
+ while( cur_nonb <= num_frames )
+ {
+ /* P/I cost: This shouldn't include the cost of next_nonb */
+ if( next_nonb != cur_nonb )
+ {
+ int p0 = IS_X264_TYPE_I( frames[cur_nonb]->i_type ) ? cur_nonb : last_nonb;
+ frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, p0, cur_nonb, cur_nonb );
+ frames[next_nonb]->i_planned_type[idx] = frames[cur_nonb]->i_type;
+ idx++;
+ }
+ /* Handle the B-frames: coded order */
+ for( i = last_nonb+1; i < cur_nonb; i++, idx++ )
+ {
+ frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, last_nonb, cur_nonb, i );
+ frames[next_nonb]->i_planned_type[idx] = X264_TYPE_B;
+ }
+ last_nonb = cur_nonb;
+ cur_nonb++;
+ while( cur_nonb <= num_frames && frames[cur_nonb]->i_type == X264_TYPE_B )
+ cur_nonb++;
+ }
+ frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO;
+}
static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
{
@@ -380,8 +875,18 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
if( cost > threshold )
break;
- for( next_b = loc; next_b < next_p && cost < threshold; next_b++ )
- cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );
+ if( h->param.i_bframe_pyramid && next_p - cur_p > 2 )
+ {
+ int middle = cur_p + (next_p - cur_p)/2;
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, middle, 0 );
+ for( next_b = loc; next_b < middle && cost < threshold; next_b++ )
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, middle, next_b, 0 );
+ for( next_b = middle+1; next_b < next_p && cost < threshold; next_b++ )
+ cost += x264_slicetype_frame_cost( h, a, frames, middle, next_p, next_b, 0 );
+ }
+ else
+ for( next_b = loc; next_b < next_p && cost < threshold; next_b++ )
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );
loc = next_p + 1;
cur_p = next_p;
@@ -393,27 +898,24 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
/* Uses strings due to the fact that the speed of the control functions is
negligable compared to the cost of running slicetype_frame_cost, and because
it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[MAX_LENGTH] )
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, char (*best_paths)[X264_LOOKAHEAD_MAX] )
{
- char paths[X264_BFRAME_MAX+2][MAX_LENGTH] = {{0}};
- int num_paths = X264_MIN(max_bframes+1, length);
- int suffix_size, loc, path;
+ char paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX] = {{0}};
+ int num_paths = X264_MIN( max_bframes+1, length );
+ int path;
int best_cost = COST_MAX;
int best_path_index = 0;
- length = X264_MIN(length,MAX_LENGTH);
-
- /* Iterate over all currently possible paths and add suffixes to each one */
- for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
- {
- memcpy( paths[suffix_size], best_paths[length - (suffix_size + 1)], length - (suffix_size + 1) );
- for( loc = 0; loc < suffix_size; loc++ )
- strcat( paths[suffix_size], "B" );
- strcat( paths[suffix_size], "P" );
- }
- /* Calculate the actual cost of each of the current paths */
+ /* Iterate over all currently possible paths */
for( path = 0; path < num_paths; path++ )
{
+ /* Add suffixes to the current path */
+ int len = length - (path + 1);
+ memcpy( paths[path], best_paths[len % (X264_BFRAME_MAX+1)], len );
+ memset( paths[path]+len, 'B', path );
+ strcat( paths[path], "P" );
+
+ /* Calculate the actual cost of the current path */
int cost = x264_slicetype_path_cost( h, a, frames, paths[path], best_cost );
if( cost < best_cost )
{
@@ -423,19 +925,10 @@ static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
}
/* Store the best path. */
- memcpy( best_paths[length], paths[best_path_index], length );
-}
-
-static int x264_slicetype_path_search( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int bframes, int buffer )
-{
- char best_paths[MAX_LENGTH][MAX_LENGTH] = {"","P"};
- int n;
- for( n = 2; n < length-1; n++ )
- x264_slicetype_path( h, a, frames, n, bframes, buffer, best_paths );
- return strspn( best_paths[length-2], "B" );
+ memcpy( best_paths[length % (X264_BFRAME_MAX+1)], paths[best_path_index], length );
}
-static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1 )
+static int scenecut_internal( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int print )
{
x264_frame_t *frame = frames[p1];
x264_slicetype_frame_cost( h, a, frames, p0, p1, p1, 0 );
@@ -443,7 +936,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
int icost = frame->i_cost_est[0][0];
int pcost = frame->i_cost_est[p1-p0][0];
float f_bias;
- int i_gop_size = frame->i_frame - h->frames.i_last_idr;
+ int i_gop_size = frame->i_frame - h->lookahead->i_last_keyframe;
float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
/* magic numbers pulled out of thin air */
float f_thresh_min = f_thresh_max * h->param.i_keyint_min
@@ -452,7 +945,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
if( h->param.i_keyint_min == h->param.i_keyint_max )
f_thresh_min= f_thresh_max;
- if( i_gop_size < h->param.i_keyint_min / 4 )
+ if( i_gop_size < h->param.i_keyint_min / 4 || h->param.b_intra_refresh )
f_bias = f_thresh_min / 4;
else if( i_gop_size <= h->param.i_keyint_min )
f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min;
@@ -465,7 +958,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
}
res = pcost >= (1.0 - f_bias) * icost;
- if( res )
+ if( res && print )
{
int imb = frame->i_intra_mbs[p1-p0];
int pmb = NUM_MBS - imb;
@@ -477,144 +970,267 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
return res;
}
-static void x264_slicetype_analyse( x264_t *h )
+static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut, int num_frames )
+{
+ int curp0, curp1, i, maxp1 = p0 + 1;
+
+ /* Only do analysis during a normal scenecut check. */
+ if( real_scenecut && h->param.i_bframe )
+ {
+ /* Look ahead to avoid coding short flashes as scenecuts. */
+ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+ /* Don't analyse any more frames than the trellis would have covered. */
+ maxp1 += h->param.i_bframe;
+ else
+ maxp1++;
+ maxp1 = X264_MIN( maxp1, num_frames );
+
+ /* Where A and B are scenes: AAAAAABBBAAAAAA
+ * If BBB is shorter than (maxp1-p0), it is detected as a flash
+ * and not considered a scenecut. */
+ for( curp1 = p1; curp1 <= maxp1; curp1++ )
+ if( !scenecut_internal( h, a, frames, p0, curp1, 0 ) )
+ /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
+ for( i = curp1; i > p0; i-- )
+ frames[i]->b_scenecut = 0;
+
+ /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
+ * If each of BB ... EE are shorter than (maxp1-p0), they are
+ * detected as flashes and not considered scenecuts.
+ * Instead, the first F frame becomes a scenecut. */
+ for( curp0 = p0; curp0 < maxp1; curp0++ )
+ if( scenecut_internal( h, a, frames, curp0, maxp1, 0 ) )
+ /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
+ frames[curp0]->b_scenecut = 0;
+ }
+
+ /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
+ if( !frames[p1]->b_scenecut )
+ return 0;
+ return scenecut_internal( h, a, frames, p0, p1, real_scenecut );
+}
+
+void x264_slicetype_analyse( x264_t *h, int keyframe )
{
x264_mb_analysis_t a;
- x264_frame_t *frames[X264_BFRAME_MAX*4+3] = { NULL, };
- int num_frames;
- int keyint_limit;
- int j;
+ x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
+ int num_frames, orig_num_frames, keyint_limit, idr_frame_type, i, j;
int i_mb_count = NUM_MBS;
int cost1p0, cost2p0, cost1b1, cost2p1;
- int idr_frame_type;
+ int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX );
+ if( h->param.b_deterministic )
+ i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe );
assert( h->frames.b_have_lowres );
- if( !h->frames.last_nonb )
+ if( !h->lookahead->last_nonb )
return;
- frames[0] = h->frames.last_nonb;
- for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ )
- frames[j+1] = h->frames.next[j];
- keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1;
- num_frames = X264_MIN( j, keyint_limit );
- if( num_frames == 0 )
+ frames[0] = h->lookahead->last_nonb;
+ for( j = 0; j < i_max_search && h->lookahead->next.list[j]->i_type == X264_TYPE_AUTO; j++ )
+ frames[j+1] = h->lookahead->next.list[j];
+
+ if( !j )
return;
- x264_lowres_context_init( h, &a );
- idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+ keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_keyframe - 1;
+ orig_num_frames = num_frames = h->param.b_intra_refresh ? j : X264_MIN( j, keyint_limit );
- if( num_frames == 1 )
+ x264_lowres_context_init( h, &a );
+ idr_frame_type = frames[1]->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+
+ /* This is important psy-wise: if we have a non-scenecut keyframe,
+ * there will be significant visual artifacts if the frames just before
+ * go down in quality due to being referenced less, despite it being
+ * more RD-optimal. */
+ if( (h->param.analyse.b_psy && h->param.rc.b_mb_tree) || h->param.rc.i_vbv_buffer_size )
+ num_frames = j;
+ else if( num_frames == 1 )
{
-no_b_frames:
frames[1]->i_type = X264_TYPE_P;
- if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
+ if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames ) )
frames[1]->i_type = idr_frame_type;
return;
}
+ else if( num_frames == 0 )
+ {
+ frames[1]->i_type = idr_frame_type;
+ return;
+ }
- if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+ int num_bframes = 0;
+ int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+ int num_analysed_frames = num_frames;
+ int reset_start;
+ if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames ) )
{
- int num_bframes;
- int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
- if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
- {
- frames[1]->i_type = idr_frame_type;
- return;
- }
- num_bframes = x264_slicetype_path_search( h, &a, frames, num_frames, max_bframes, num_frames-max_bframes );
- assert(num_bframes < num_frames);
+ frames[1]->i_type = idr_frame_type;
+ return;
+ }
- for( j = 1; j < num_bframes+1; j++ )
+ if( h->param.i_bframe )
+ {
+ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
{
- if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
+ char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX] = {"","P"};
+ int n;
+
+ /* Perform the frametype analysis. */
+ for( n = 2; n < num_frames; n++ )
+ x264_slicetype_path( h, &a, frames, n, max_bframes, best_paths );
+ if( num_frames > 1 )
{
- frames[j]->i_type = X264_TYPE_P;
- return;
+ int best_path_index = (num_frames-1) % (X264_BFRAME_MAX+1);
+ num_bframes = strspn( best_paths[best_path_index], "B" );
+ /* Load the results of the analysis into the frame types. */
+ for( j = 1; j < num_frames; j++ )
+ frames[j]->i_type = best_paths[best_path_index][j-1] == 'B' ? X264_TYPE_B : X264_TYPE_P;
}
- frames[j]->i_type = X264_TYPE_B;
+ frames[num_frames]->i_type = X264_TYPE_P;
}
- frames[num_bframes+1]->i_type = X264_TYPE_P;
- }
- else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
- {
- cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
- if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
- goto no_b_frames;
+ else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
+ {
+ for( i = 0; i <= num_frames-2; )
+ {
+ cost2p1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+2, 1 );
+ if( frames[i+2]->i_intra_mbs[2] > i_mb_count / 2 )
+ {
+ frames[i+1]->i_type = X264_TYPE_P;
+ frames[i+2]->i_type = X264_TYPE_P;
+ i += 2;
+ continue;
+ }
- cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
- cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
- cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
+ cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
+ cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
+ cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
- if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
- goto no_b_frames;
+ if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
+ {
+ frames[i+1]->i_type = X264_TYPE_P;
+ i += 1;
+ continue;
+ }
- // arbitrary and untuned
- #define INTER_THRESH 300
- #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
- frames[1]->i_type = X264_TYPE_B;
+ // arbitrary and untuned
+ #define INTER_THRESH 300
+ #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
+ frames[i+1]->i_type = X264_TYPE_B;
- for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
+ for( j = i+2; j <= X264_MIN( i+h->param.i_bframe, num_frames-1 ); j++ )
+ {
+ int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-i-1), INTER_THRESH/10);
+ int pcost = x264_slicetype_frame_cost( h, &a, frames, i+0, j+1, j+1, 1 );
+ if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-i+1] > i_mb_count/3 )
+ break;
+ frames[j]->i_type = X264_TYPE_B;
+ }
+ frames[j]->i_type = X264_TYPE_P;
+ i = j;
+ }
+ frames[num_frames]->i_type = X264_TYPE_P;
+ num_bframes = 0;
+ while( num_bframes < num_frames && frames[num_bframes+1]->i_type == X264_TYPE_B )
+ num_bframes++;
+ }
+ else
{
- int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
- int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
+ num_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+ for( j = 1; j < num_frames; j++ )
+ frames[j]->i_type = (j%(num_bframes+1)) ? X264_TYPE_B : X264_TYPE_P;
+ frames[num_frames]->i_type = X264_TYPE_P;
+ }
- if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+ /* Check scenecut on the first minigop. */
+ for( j = 1; j < num_bframes+1; j++ )
+ if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames ) )
{
frames[j]->i_type = X264_TYPE_P;
+ num_analysed_frames = j;
break;
}
- else
- frames[j]->i_type = X264_TYPE_B;
- }
+
+ reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 );
}
else
{
- int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
- if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
- {
- frames[1]->i_type = idr_frame_type;
- return;
- }
+ for( j = 1; j <= num_frames; j++ )
+ frames[j]->i_type = X264_TYPE_P;
+ reset_start = !keyframe + 1;
+ num_bframes = 0;
+ }
+
+ /* Perform the actual macroblock tree analysis.
+ * Don't go farther than the maximum keyframe interval; this helps in short GOPs. */
+ if( h->param.rc.b_mb_tree )
+ x264_macroblock_tree( h, &a, frames, X264_MIN(num_frames, h->param.i_keyint_max), keyframe );
- for( j = 1; j < max_bframes+1; j++ )
+ /* Enforce keyframe limit. */
+ if( !h->param.b_intra_refresh )
+ for( j = 0; j < num_frames; j++ )
{
- if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
+ if( ((j-keyint_limit) % h->param.i_keyint_max) == 0 )
{
- frames[j]->i_type = X264_TYPE_P;
- return;
+ if( j && h->param.i_keyint_max > 1 )
+ frames[j]->i_type = X264_TYPE_P;
+ frames[j+1]->i_type = X264_TYPE_IDR;
+ reset_start = X264_MIN( reset_start, j+2 );
}
- frames[j]->i_type = X264_TYPE_B;
}
- frames[max_bframes+1]->i_type = X264_TYPE_P;
- }
+
+ if( h->param.rc.i_vbv_buffer_size )
+ x264_vbv_lookahead( h, &a, frames, num_frames, keyframe );
+
+ /* Restore frametypes for all frames that haven't actually been decided yet. */
+ for( j = reset_start; j <= num_frames; j++ )
+ frames[j]->i_type = X264_TYPE_AUTO;
}
void x264_slicetype_decide( x264_t *h )
{
+ x264_frame_t *frames[X264_BFRAME_MAX+2];
x264_frame_t *frm;
int bframes;
+ int brefs;
int i;
- if( h->frames.next[0] == NULL )
+ if( !h->lookahead->next.i_size )
return;
if( h->param.rc.b_stat_read )
{
/* Use the frame types from the first pass */
- for( i = 0; h->frames.next[i] != NULL; i++ )
- h->frames.next[i]->i_type =
- x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
+ for( i = 0; i < h->lookahead->next.i_size; i++ )
+ h->lookahead->next.list[i]->i_type =
+ x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame );
}
else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
- || h->param.i_scenecut_threshold )
- x264_slicetype_analyse( h );
+ || h->param.i_scenecut_threshold
+ || h->param.rc.b_mb_tree
+ || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) )
+ x264_slicetype_analyse( h, 0 );
- for( bframes = 0;; bframes++ )
+ for( bframes = 0, brefs = 0;; bframes++ )
{
- frm = h->frames.next[bframes];
+ frm = h->lookahead->next.list[bframes];
+ if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL &&
+ brefs == h->param.i_bframe_pyramid )
+ {
+ frm->i_type = X264_TYPE_B;
+ x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s \n",
+ frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid] );
+ }
+ /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
+ smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
+ else if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL &&
+ brefs && h->param.i_frame_reference <= (brefs+3) )
+ {
+ frm->i_type = X264_TYPE_B;
+ x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s and %d reference frames\n",
+ frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid], h->param.i_frame_reference );
+ }
/* Limit GOP size */
- if( frm->i_frame - h->frames.i_last_idr >= h->param.i_keyint_max )
+ if( (!h->param.b_intra_refresh || frm->i_frame == 0) && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_max )
{
if( frm->i_type == X264_TYPE_AUTO )
frm->i_type = X264_TYPE_IDR;
@@ -624,19 +1240,17 @@ void x264_slicetype_decide( x264_t *h )
if( frm->i_type == X264_TYPE_IDR )
{
/* Close GOP */
+ h->lookahead->i_last_keyframe = frm->i_frame;
+ frm->b_keyframe = 1;
if( bframes > 0 )
{
bframes--;
- h->frames.next[bframes]->i_type = X264_TYPE_P;
- }
- else
- {
- h->i_frame_num = 0;
+ h->lookahead->next.list[bframes]->i_type = X264_TYPE_P;
}
}
- if( bframes == h->param.i_bframe
- || h->frames.next[bframes+1] == NULL )
+ if( bframes == h->param.i_bframe ||
+ !h->lookahead->next.list[bframes+1] )
{
if( IS_X264_TYPE_B( frm->i_type ) )
x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" );
@@ -645,50 +1259,149 @@ void x264_slicetype_decide( x264_t *h )
frm->i_type = X264_TYPE_P;
}
- if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_B;
+ if( frm->i_type == X264_TYPE_BREF )
+ brefs++;
+
+ if( frm->i_type == X264_TYPE_AUTO )
+ frm->i_type = X264_TYPE_B;
+
else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
}
+
+ if( bframes )
+ h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1;
+ h->lookahead->next.list[bframes]->i_bframes = bframes;
+
+ /* insert a bref into the sequence */
+ if( h->param.i_bframe_pyramid && bframes > 1 && !brefs )
+ {
+ h->lookahead->next.list[bframes/2]->i_type = X264_TYPE_BREF;
+ brefs++;
+ }
+
+ /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
+ if( h->param.rc.i_rc_method != X264_RC_CQP )
+ {
+ x264_mb_analysis_t a;
+ int p0, p1, b;
+ p1 = b = bframes + 1;
+
+ x264_lowres_context_init( h, &a );
+
+ frames[0] = h->lookahead->last_nonb;
+ memcpy( &frames[1], h->lookahead->next.list, (bframes+1) * sizeof(x264_frame_t*) );
+ if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) )
+ p0 = bframes + 1;
+ else // P
+ p0 = 0;
+
+ x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+
+ if( (p0 != p1 || bframes) && h->param.rc.i_vbv_buffer_size )
+ {
+ /* We need the intra costs for row SATDs. */
+ x264_slicetype_frame_cost( h, &a, frames, b, b, b, 0 );
+
+ /* We need B-frame costs for row SATDs. */
+ p0 = 0;
+ for( b = 1; b <= bframes; b++ )
+ {
+ if( frames[b]->i_type == X264_TYPE_B )
+ for( p1 = b; frames[p1]->i_type == X264_TYPE_B; )
+ p1++;
+ else
+ p1 = bframes + 1;
+ x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+ if( frames[b]->i_type == X264_TYPE_BREF )
+ p0 = b;
+ }
+ }
+ }
+
+ /* Analyse for weighted P frames */
+ if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
+ && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ {
+ x264_emms();
+ x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
+ }
+
+ /* shift sequence to coded order.
+ use a small temporary list to avoid shifting the entire next buffer around */
+ int i_coded = h->lookahead->next.list[0]->i_frame;
+ if( bframes )
+ {
+ int index[] = { brefs+1, 1 };
+ for( i = 0; i < bframes; i++ )
+ {
+ int idx = index[h->lookahead->next.list[i]->i_type == X264_TYPE_BREF]++;
+ frames[idx] = h->lookahead->next.list[i];
+ frames[idx]->i_reordered_pts = h->lookahead->next.list[idx]->i_pts;
+ }
+ frames[0] = h->lookahead->next.list[bframes];
+ frames[0]->i_reordered_pts = h->lookahead->next.list[0]->i_pts;
+ memcpy( h->lookahead->next.list, frames, (bframes+1) * sizeof(x264_frame_t*) );
+ }
+ for( i = 0; i <= bframes; i++ )
+ h->lookahead->next.list[i]->i_coded = i_coded++;
}
int x264_rc_analyse_slice( x264_t *h )
{
- x264_mb_analysis_t a;
- x264_frame_t *frames[X264_BFRAME_MAX*4+2] = { NULL, };
int p0=0, p1, b;
int cost;
-
- x264_lowres_context_init( h, &a );
+ x264_emms();
if( IS_X264_TYPE_I(h->fenc->i_type) )
- {
p1 = b = 0;
- }
- else if( X264_TYPE_P == h->fenc->i_type )
- {
- p1 = 0;
- while( h->frames.current[p1] && IS_X264_TYPE_B( h->frames.current[p1]->i_type ) )
- p1++;
- p1++;
- b = p1;
- }
+ else if( h->fenc->i_type == X264_TYPE_P )
+ p1 = b = h->fenc->i_bframes + 1;
else //B
{
p1 = (h->fref1[0]->i_poc - h->fref0[0]->i_poc)/2;
- b = (h->fref1[0]->i_poc - h->fenc->i_poc)/2;
- frames[p1] = h->fref1[0];
+ b = (h->fenc->i_poc - h->fref0[0]->i_poc)/2;
}
- frames[p0] = h->fref0[0];
- frames[b] = h->fenc;
+ /* We don't need to assign p0/p1 since we are not performing any real analysis here. */
+ x264_frame_t **frames = &h->fenc - b;
- cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+ /* cost should have been already calculated by x264_slicetype_decide */
+ cost = frames[b]->i_cost_est[b-p0][p1-b];
+ assert( cost >= 0 );
+ if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
+ {
+ cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
+ if( b && h->param.rc.i_vbv_buffer_size )
+ x264_slicetype_frame_cost_recalculate( h, frames, b, b, b );
+ }
/* In AQ, use the weighted score instead. */
- if( h->param.rc.i_aq_mode )
- cost = frames[b]->i_cost_est[b-p0][p1-b];
+ else if( h->param.rc.i_aq_mode )
+ cost = frames[b]->i_cost_est_aq[b-p0][p1-b];
h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
h->fdec->i_satd = cost;
memcpy( h->fdec->i_row_satd, h->fenc->i_row_satd, h->sps->i_mb_height * sizeof(int) );
+ if( !IS_X264_TYPE_I(h->fenc->i_type) )
+ memcpy( h->fdec->i_row_satds[0][0], h->fenc->i_row_satds[0][0], h->sps->i_mb_height * sizeof(int) );
+
+ if( h->param.b_intra_refresh && h->param.rc.i_vbv_buffer_size && h->fenc->i_type == X264_TYPE_P )
+ {
+ int x, y;
+ int ip_factor = 256 * h->param.rc.f_ip_factor; /* fix8 */
+ for( y = 0; y < h->sps->i_mb_height; y++ )
+ {
+ int mb_xy = y * h->mb.i_mb_stride;
+ for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
+ {
+ int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
+ int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
+ int diff = intra_cost - inter_cost;
+ h->fdec->i_row_satd[y] += diff;
+ cost += diff;
+ }
+ }
+ }
+
return cost;
}
diff --git a/extras/avisynth_c.h b/extras/avisynth_c.h
new file mode 100644
index 0000000..27e8270
--- /dev/null
+++ b/extras/avisynth_c.h
@@ -0,0 +1,661 @@
+// Avisynth C Interface Version 0.20
+// Copyright 2003 Kevin Atkinson
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// As a special exception, I give you permission to link to the
+// Avisynth C interface with independent modules that communicate with
+// the Avisynth C interface solely through the interfaces defined in
+// avisynth_c.h, regardless of the license terms of these independent
+// modules, and to copy and distribute the resulting combined work
+// under terms of your choice, provided that every copy of the
+// combined work is accompanied by a complete copy of the source code
+// of the Avisynth C interface and Avisynth itself (with the version
+// used to produce the combined work), being distributed under the
+// terms of the GNU General Public License plus this exception. An
+// independent module is a module which is not derived from or based
+// on Avisynth C Interface, such as 3rd-party filters, import and
+// export plugins, or graphical user interfaces.
+
+#ifndef __AVISYNTH_C__
+#define __AVISYNTH_C__
+
+#ifdef __cplusplus
+# define EXTERN_C extern "C"
+#else
+# define EXTERN_C
+#endif
+
+#define AVSC_USE_STDCALL 1
+
+#ifndef AVSC_USE_STDCALL
+# define AVSC_CC __cdecl
+#else
+# define AVSC_CC __stdcall
+#endif
+
+#define AVSC_EXPORT EXTERN_C __declspec(dllexport)
+#define AVSC_INLINE static __inline
+#ifdef AVISYNTH_C_EXPORTS
+# define AVSC_API(ret) EXTERN_C __declspec(dllexport) ret AVSC_CC
+#else
+# define AVSC_API(ret) EXTERN_C __declspec(dllimport) ret AVSC_CC
+#endif
+
+typedef unsigned char BYTE;
+#ifdef __GNUC__
+typedef long long int INT64;
+#else
+typedef __int64 INT64;
+#endif
+
+
+/////////////////////////////////////////////////////////////////////
+//
+// Constants
+//
+
+#ifndef __AVISYNTH_H__
+enum { AVISYNTH_INTERFACE_VERSION = 2 };
+#endif
+
+enum {AVS_SAMPLE_INT8 = 1<<0,
+ AVS_SAMPLE_INT16 = 1<<1,
+ AVS_SAMPLE_INT24 = 1<<2,
+ AVS_SAMPLE_INT32 = 1<<3,
+ AVS_SAMPLE_FLOAT = 1<<4};
+
+enum {AVS_PLANAR_Y=1<<0,
+ AVS_PLANAR_U=1<<1,
+ AVS_PLANAR_V=1<<2,
+ AVS_PLANAR_ALIGNED=1<<3,
+ AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED,
+ AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED,
+ AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED};
+
+ // Colorspace properties.
+enum {AVS_CS_BGR = 1<<28,
+ AVS_CS_YUV = 1<<29,
+ AVS_CS_INTERLEAVED = 1<<30,
+ AVS_CS_PLANAR = 1<<31};
+
+ // Specific colorformats
+enum {
+ AVS_CS_UNKNOWN = 0,
+ AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
+ AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
+ AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED,
+ AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR, // y-v-u, planar
+ AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR, // y-u-v, planar
+ AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR // same as above
+};
+
+enum {
+ AVS_IT_BFF = 1<<0,
+ AVS_IT_TFF = 1<<1,
+ AVS_IT_FIELDBASED = 1<<2};
+
+enum {
+ AVS_FILTER_TYPE=1,
+ AVS_FILTER_INPUT_COLORSPACE=2,
+ AVS_FILTER_OUTPUT_TYPE=9,
+ AVS_FILTER_NAME=4,
+ AVS_FILTER_AUTHOR=5,
+ AVS_FILTER_VERSION=6,
+ AVS_FILTER_ARGS=7,
+ AVS_FILTER_ARGS_INFO=8,
+ AVS_FILTER_ARGS_DESCRIPTION=10,
+ AVS_FILTER_DESCRIPTION=11};
+
+enum { //SUBTYPES
+ AVS_FILTER_TYPE_AUDIO=1,
+ AVS_FILTER_TYPE_VIDEO=2,
+ AVS_FILTER_OUTPUT_TYPE_SAME=3,
+ AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4};
+
+enum {
+ AVS_CACHE_NOTHING=0,
+ AVS_CACHE_RANGE=1 };
+
+#define AVS_FRAME_ALIGN 16
+
+typedef struct AVS_Clip AVS_Clip;
+typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment;
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_VideoInfo
+//
+
+// AVS_VideoInfo is laid out identically to VideoInfo
+typedef struct AVS_VideoInfo {
+ int width, height; // width=0 means no video
+ unsigned fps_numerator, fps_denominator;
+ int num_frames;
+
+ int pixel_type;
+
+ int audio_samples_per_second; // 0 means no audio
+ int sample_type;
+ INT64 num_audio_samples;
+ int nchannels;
+
+ // Imagetype properties
+
+ int image_type;
+} AVS_VideoInfo;
+
+// useful functions of the above
+AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p)
+ { return (p->width!=0); }
+
+AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p)
+ { return (p->audio_samples_per_second!=0); }
+
+AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p)
+ { return !!(p->pixel_type&AVS_CS_BGR); }
+
+AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p)
+ { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties
+
+AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p)
+ { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; }
+
+AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p)
+ { return !!(p->pixel_type&AVS_CS_YUV ); }
+
+AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p)
+ { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; }
+
+AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p)
+ { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); }
+
+AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space)
+ { return ((p->pixel_type & c_space) == c_space); }
+
+AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property)
+ { return ((p->pixel_type & property)==property ); }
+
+AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p)
+ { return !!(p->pixel_type & AVS_CS_PLANAR); }
+
+AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p)
+ { return !!(p->image_type & AVS_IT_FIELDBASED); }
+
+AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p)
+ { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); }
+
+AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p)
+ { return !!(p->image_type & AVS_IT_BFF); }
+
+AVSC_INLINE int avs_is_tff(const AVS_VideoInfo * p)
+ { return !!(p->image_type & AVS_IT_TFF); }
+
+AVSC_INLINE int avs_bits_per_pixel(const AVS_VideoInfo * p)
+{
+ switch (p->pixel_type) {
+ case AVS_CS_BGR24: return 24;
+ case AVS_CS_BGR32: return 32;
+ case AVS_CS_YUY2: return 16;
+ case AVS_CS_YV12:
+ case AVS_CS_I420: return 12;
+ default: return 0;
+ }
+}
+AVSC_INLINE int avs_bytes_from_pixels(const AVS_VideoInfo * p, int pixels)
+ { return pixels * (avs_bits_per_pixel(p)>>3); } // Will work on planar images, but will return only luma planes
+
+AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p)
+ { return avs_bytes_from_pixels(p,p->width); } // Also only returns first plane on planar images
+
+AVSC_INLINE int avs_bmp_size(const AVS_VideoInfo * vi)
+ { if (avs_is_planar(vi)) {int p = vi->height * ((avs_row_size(vi)+3) & ~3); p+=p>>1; return p; } return vi->height * ((avs_row_size(vi)+3) & ~3); }
+
+AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p)
+ { return p->audio_samples_per_second; }
+
+
+AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p)
+{
+ switch (p->sample_type) {
+ case AVS_SAMPLE_INT8: return sizeof(signed char);
+ case AVS_SAMPLE_INT16: return sizeof(signed short);
+ case AVS_SAMPLE_INT24: return 3;
+ case AVS_SAMPLE_INT32: return sizeof(signed int);
+ case AVS_SAMPLE_FLOAT: return sizeof(float);
+ default: return 0;
+ }
+}
+AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p)
+ { return p->nchannels*avs_bytes_per_channel_sample(p);}
+
+AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames)
+ { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); }
+
+AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples)
+ { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); }
+
+AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes)
+ { return bytes / avs_bytes_per_audio_sample(p); }
+
+AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples)
+ { return samples * avs_bytes_per_audio_sample(p); }
+
+AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p)
+ { return p->nchannels; }
+
+AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p)
+ { return p->sample_type;}
+
+// useful mutator
+AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property)
+ { p->image_type|=property; }
+
+AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property)
+ { p->image_type&=~property; }
+
+AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased)
+ { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; }
+
+AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator)
+{
+ unsigned x=numerator, y=denominator;
+ while (y) { // find gcd
+ unsigned t = x%y; x = y; y = t;
+ }
+ p->fps_numerator = numerator/x;
+ p->fps_denominator = denominator/x;
+}
+
+AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y)
+{
+ return (x->pixel_type == y->pixel_type)
+ || (avs_is_yv12(x) && avs_is_yv12(y));
+}
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_VideoFrame
+//
+
+// VideoFrameBuffer holds information about a memory block which is used
+// for video data. For efficiency, instances of this class are not deleted
+// when the refcount reaches zero; instead they're stored in a linked list
+// to be reused. The instances are deleted when the corresponding AVS
+// file is closed.
+
+// AVS_VideoFrameBuffer is laid out identically to VideoFrameBuffer
+// DO NOT USE THIS STRUCTURE DIRECTLY
+typedef struct AVS_VideoFrameBuffer {
+ BYTE * data;
+ int data_size;
+ // sequence_number is incremented every time the buffer is changed, so
+ // that stale views can tell they're no longer valid.
+ long sequence_number;
+
+ long refcount;
+} AVS_VideoFrameBuffer;
+
+// VideoFrame holds a "window" into a VideoFrameBuffer.
+
+// AVS_VideoFrame is laid out identically to IVideoFrame
+// DO NOT USE THIS STRUCTURE DIRECTLY
+typedef struct AVS_VideoFrame {
+ int refcount;
+ AVS_VideoFrameBuffer * vfb;
+ int offset, pitch, row_size, height, offsetU, offsetV, pitchUV; // U&V offsets are from top of picture.
+} AVS_VideoFrame;
+
+// Access functions for AVS_VideoFrame
+AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) {
+ return p->pitch;}
+
+AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) {
+ switch (plane) {
+ case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV;}
+ return p->pitch;}
+
+AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) {
+ return p->row_size; }
+
+AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) {
+ int r;
+ switch (plane) {
+ case AVS_PLANAR_U: case AVS_PLANAR_V:
+ if (p->pitchUV) return p->row_size>>1;
+ else return 0;
+ case AVS_PLANAR_U_ALIGNED: case AVS_PLANAR_V_ALIGNED:
+ if (p->pitchUV) {
+ int r = ((p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)) )>>1; // Aligned rowsize
+ if (r < p->pitchUV)
+ return r;
+ return p->row_size>>1;
+ } else return 0;
+ case AVS_PLANAR_Y_ALIGNED:
+ r = (p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize
+ if (r <= p->pitch)
+ return r;
+ return p->row_size;
+ }
+ return p->row_size;
+}
+
+AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) {
+ return p->height;}
+
+AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) {
+ switch (plane) {
+ case AVS_PLANAR_U: case AVS_PLANAR_V:
+ if (p->pitchUV) return p->height>>1;
+ return 0;
+ }
+ return p->height;}
+
+AVSC_INLINE const BYTE* avs_get_read_ptr(const AVS_VideoFrame * p) {
+ return p->vfb->data + p->offset;}
+
+AVSC_INLINE const BYTE* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane)
+{
+ switch (plane) {
+ case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
+ case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
+ default: return p->vfb->data + p->offset;}
+}
+
+AVSC_INLINE int avs_is_writable(const AVS_VideoFrame * p) {
+ return (p->refcount == 1 && p->vfb->refcount == 1);}
+
+AVSC_INLINE BYTE* avs_get_write_ptr(const AVS_VideoFrame * p)
+{
+ if (avs_is_writable(p)) {
+ ++p->vfb->sequence_number;
+ return p->vfb->data + p->offset;
+ } else
+ return 0;
+}
+
+AVSC_INLINE BYTE* avs_get_write_ptr_p(const AVS_VideoFrame * p, int plane)
+{
+ if (plane==AVS_PLANAR_Y && avs_is_writable(p)) {
+ ++p->vfb->sequence_number;
+ return p->vfb->data + p->offset;
+ } else if (plane==AVS_PLANAR_Y) {
+ return 0;
+ } else {
+ switch (plane) {
+ case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
+ case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
+ default: return p->vfb->data + p->offset;
+ }
+ }
+}
+
+
+AVSC_API(void) avs_release_video_frame(AVS_VideoFrame *);
+// makes a shallow copy of a video frame
+AVSC_API(AVS_VideoFrame *) avs_copy_video_frame(AVS_VideoFrame *);
+
+AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f)
+ {avs_release_video_frame(f);}
+AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f)
+ {return avs_copy_video_frame(f);}
+
+
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_Value
+//
+
+// Treat AVS_Value as a fat pointer. That is, use avs_copy_value
+// and avs_release_value appropriately, as you would if AVS_Value were
+// a pointer.
+
+// To maintain source code compatibility with future versions of the
+// avisynth_c API don't use the AVS_Value directly. Use the helper
+// functions below.
+
+// AVS_Value is laid out identically to AVSValue
+typedef struct AVS_Value AVS_Value;
+struct AVS_Value {
+ short type; // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong
+ // for some function e'rror
+ short array_size;
+ union {
+ void * clip; // do not use directly, use avs_take_clip
+ char boolean;
+ int integer;
+ float floating_pt;
+ const char * string;
+ const AVS_Value * array;
+ } d;
+};
+
+// An AVS_Value should be initialized with avs_void.
+// It should also be set to avs_void after the value is released
+// with avs_release_value. Consider it the equivalent of setting
+// a pointer to NULL
+static const AVS_Value avs_void = {'v'};
+
+AVSC_API(void) avs_copy_value(AVS_Value * dest, AVS_Value src);
+AVSC_API(void) avs_release_value(AVS_Value);
+
+AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; }
+AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; }
+AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; }
+AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; }
+AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; }
+AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; }
+AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; }
+AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; }
+
+AVSC_API(AVS_Clip *) avs_take_clip(AVS_Value, AVS_ScriptEnvironment *);
+AVSC_API(void) avs_set_to_clip(AVS_Value *, AVS_Clip *);
+
+AVSC_INLINE int avs_as_bool(AVS_Value v)
+ { return v.d.boolean; }
+AVSC_INLINE int avs_as_int(AVS_Value v)
+ { return v.d.integer; }
+AVSC_INLINE const char * avs_as_string(AVS_Value v)
+ { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; }
+AVSC_INLINE double avs_as_float(AVS_Value v)
+ { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; }
+AVSC_INLINE const char * avs_as_error(AVS_Value v)
+ { return avs_is_error(v) ? v.d.string : 0; }
+AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v)
+ { return v.d.array; }
+AVSC_INLINE int avs_array_size(AVS_Value v)
+ { return avs_is_array(v) ? v.array_size : 1; }
+AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index)
+ { return avs_is_array(v) ? v.d.array[index] : v; }
+
+// only use these functions on an AVS_Value that does not already have
+// an active value. Remember, treat AVS_Value as a fat pointer.
+AVSC_INLINE AVS_Value avs_new_value_bool(int v0)
+ { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 0 : 1; return v; }
+AVSC_INLINE AVS_Value avs_new_value_int(int v0)
+ { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; }
+AVSC_INLINE AVS_Value avs_new_value_string(const char * v0)
+ { AVS_Value v; v.type = 's'; v.d.string = v0; return v; }
+AVSC_INLINE AVS_Value avs_new_value_float(float v0)
+ { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;}
+AVSC_INLINE AVS_Value avs_new_value_error(const char * v0)
+ { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; }
+AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0)
+ { AVS_Value v; avs_set_to_clip(&v, v0); return v; }
+AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size)
+ { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = size; return v; }
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_Clip
+//
+
+AVSC_API(void) avs_release_clip(AVS_Clip *);
+AVSC_API(AVS_Clip *) avs_copy_clip(AVS_Clip *);
+
+AVSC_API(const char *) avs_clip_get_error(AVS_Clip *); // return 0 if no error
+
+AVSC_API(const AVS_VideoInfo *) avs_get_video_info(AVS_Clip *);
+
+AVSC_API(int) avs_get_version(AVS_Clip *);
+
+AVSC_API(AVS_VideoFrame *) avs_get_frame(AVS_Clip *, int n);
+// The returned video frame must be released with avs_release_video_frame
+
+AVSC_API(int) avs_get_parity(AVS_Clip *, int n);
+// return field parity if field_based, else parity of first field in frame
+
+AVSC_API(int) avs_get_audio(AVS_Clip *, void * buf,
+ INT64 start, INT64 count);
+// start and count are in samples
+
+AVSC_API(int) avs_set_cache_hints(AVS_Clip *,
+ int cachehints, int frame_range);
+
+// This is the callback type used by avs_add_function
+typedef AVS_Value (AVSC_CC * AVS_ApplyFunc)
+ (AVS_ScriptEnvironment *, AVS_Value args, void * user_data);
+
+typedef struct AVS_FilterInfo AVS_FilterInfo;
+struct AVS_FilterInfo
+{
+ // these members should not be modified outside of the AVS_ApplyFunc callback
+ AVS_Clip * child;
+ AVS_VideoInfo vi;
+ AVS_ScriptEnvironment * env;
+ AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n);
+ int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n);
+ int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf,
+ INT64 start, INT64 count);
+ int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints,
+ int frame_range);
+ void (AVSC_CC * free_filter)(AVS_FilterInfo *);
+
+ // Should be set when ever there is an error to report.
+ // It is cleared before any of the above methods are called
+ const char * error;
+ // this is to store whatever and may be modified at will
+ void * user_data;
+};
+
+// Create a new filter
+// fi is set to point to the AVS_FilterInfo so that you can
+// modify it once it is initialized.
+// store_child should generally be set to true. If it is not
+// set then ALL methods (the function pointers) must be defined.
+// If it is set then you do not need to worry about freeing the child
+// clip.
+AVSC_API(AVS_Clip *) avs_new_c_filter(AVS_ScriptEnvironment * e,
+ AVS_FilterInfo * * fi,
+ AVS_Value child, int store_child);
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_ScriptEnvironment
+//
+
+// For GetCPUFlags. These are backwards-compatible with those in VirtualDub.
+enum {
+ /* slowest CPU to support extension */
+ AVS_CPU_FORCE = 0x01, // N/A
+ AVS_CPU_FPU = 0x02, // 386/486DX
+ AVS_CPU_MMX = 0x04, // P55C, K6, PII
+ AVS_CPU_INTEGER_SSE = 0x08, // PIII, Athlon
+ AVS_CPU_SSE = 0x10, // PIII, Athlon XP/MP
+ AVS_CPU_SSE2 = 0x20, // PIV, Hammer
+ AVS_CPU_3DNOW = 0x40, // K6-2
+ AVS_CPU_3DNOW_EXT = 0x80, // Athlon
+ AVS_CPU_X86_64 = 0xA0, // Hammer (note: equiv. to 3DNow + SSE2,
+ // which only Hammer will have anyway)
+};
+
+
+AVSC_API(long) avs_get_cpu_flags(AVS_ScriptEnvironment *);
+AVSC_API(int) avs_check_version(AVS_ScriptEnvironment *, int version);
+
+AVSC_API(char *) avs_save_string(AVS_ScriptEnvironment *, const char* s, int length);
+AVSC_API(char *) avs_sprintf(AVS_ScriptEnvironment *, const char * fmt, ...);
+
+AVSC_API(char *) avs_vsprintf(AVS_ScriptEnvironment *, const char * fmt, void* val);
+ // note: val is really a va_list; I hope everyone typedefs va_list to a pointer
+
+AVSC_API(int) avs_add_function(AVS_ScriptEnvironment *,
+ const char * name, const char * params,
+ AVS_ApplyFunc apply, void * user_data);
+
+AVSC_API(int) avs_function_exists(AVS_ScriptEnvironment *, const char * name);
+
+AVSC_API(AVS_Value) avs_invoke(AVS_ScriptEnvironment *, const char * name,
+ AVS_Value args, const char** arg_names);
+// The returned value must be released with avs_release_value
+
+AVSC_API(AVS_Value) avs_get_var(AVS_ScriptEnvironment *, const char* name);
+// The returned value must be released with avs_release_value
+
+AVSC_API(int) avs_set_var(AVS_ScriptEnvironment *, const char* name, AVS_Value val);
+
+AVSC_API(int) avs_set_global_var(AVS_ScriptEnvironment *, const char* name, const AVS_Value val);
+
+//void avs_push_context(AVS_ScriptEnvironment *, int level=0);
+//void avs_pop_context(AVS_ScriptEnvironment *);
+
+AVSC_API(AVS_VideoFrame *) avs_new_video_frame_a(AVS_ScriptEnvironment *,
+ const AVS_VideoInfo * vi, int align);
+// align should be at least 16
+
+AVSC_INLINE
+AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env,
+ const AVS_VideoInfo * vi)
+ {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
+
+AVSC_INLINE
+AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env,
+ const AVS_VideoInfo * vi)
+ {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
+
+
+AVSC_API(int) avs_make_writable(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf);
+
+AVSC_API(void) avs_bit_blt(AVS_ScriptEnvironment *, BYTE* dstp, int dst_pitch, const BYTE* srcp, int src_pitch, int row_size, int height);
+
+typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env);
+AVSC_API(void) avs_at_exit(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data);
+
+AVSC_API(AVS_VideoFrame *) avs_subframe(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height);
+// The returned video frame must be released
+
+AVSC_API(int) avs_set_memory_max(AVS_ScriptEnvironment *, int mem);
+
+AVSC_API(int) avs_set_working_dir(AVS_ScriptEnvironment *, const char * newdir);
+
+// avisynth.dll exports this; it's a way to use it as a library, without
+// writing an AVS script or without going through AVIFile.
+AVSC_API(AVS_ScriptEnvironment *) avs_create_script_environment(int version);
+
+// this symbol is the entry point for the plugin and must
+// be defined
+AVSC_EXPORT
+const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env);
+
+
+AVSC_API(void) avs_delete_script_environment(AVS_ScriptEnvironment *);
+
+
+AVSC_API(AVS_VideoFrame *) avs_subframe_planar(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV);
+// The returned video frame must be released
+
+#endif
diff --git a/extras/getopt.c b/extras/getopt.c
index d2dbd30..434efe7 100644
--- a/extras/getopt.c
+++ b/extras/getopt.c
@@ -202,11 +202,7 @@ static char *posixly_correct;
# if HAVE_STRING_H
# include <string.h>
# else
-# ifdef _MSC_VER
-# include <string.h>
-# else
-# include <strings.h>
-# endif
+# include <strings.h>
# endif
/* Avoid depending on library functions or files
@@ -984,10 +980,7 @@ getopt (argc, argv, optstring)
0);
}
-#ifdef _MSC_VER
-
-int
-getopt_long (argc, argv, optstring, long_options, opt_index)
+int getopt_long (argc, argv, optstring, long_options, opt_index)
int argc;
char *const *argv;
const char *optstring;
@@ -997,8 +990,6 @@ getopt_long (argc, argv, optstring, long_options, opt_index)
return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0);
}
-#endif
-
#endif /* Not ELIDE_CODE. */
#ifdef TEST
diff --git a/input/avs.c b/input/avs.c
new file mode 100644
index 0000000..522f8fe
--- /dev/null
+++ b/input/avs.c
@@ -0,0 +1,316 @@
+/*****************************************************************************
+ * avs.c: x264 avisynth input module
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include <windows.h>
+
+/* the AVS interface currently uses __declspec to link function declarations to their definitions in the dll.
+ this has a side effect of preventing program execution if the avisynth dll is not found,
+ so define __declspec(dllimport) to nothing and work around this */
+#undef __declspec
+#define __declspec(i)
+#undef EXTERN_C
+
+#ifdef HAVE_AVISYNTH_C_H
+#include <avisynth_c.h>
+#else
+#include "extras/avisynth_c.h"
+#endif
+
+/* AVS uses a versioned interface to control backwards compatibility */
+/* YV12 support is required */
+#define AVS_INTERFACE_YV12 2
+/* when AVS supports other planar colorspaces, a workaround is required */
+#define AVS_INTERFACE_OTHER_PLANAR 5
+
+/* maximum size of the sequence of filters to try on non script files */
+#define AVS_MAX_SEQUENCE 5
+
+#define LOAD_AVS_FUNC(name, continue_on_fail) \
+{\
+ h->func.name = (void*)GetProcAddress( h->library, #name );\
+ if( !continue_on_fail && !h->func.name )\
+ goto fail;\
+}
+
+typedef struct
+{
+ AVS_Clip *clip;
+ AVS_ScriptEnvironment *env;
+ HMODULE library;
+ int num_frames;
+ /* declare function pointers for the utilized functions to be loaded without __declspec,
+ as the avisynth header does not compensate for this type of usage */
+ struct
+ {
+ const char *(__stdcall *avs_clip_get_error)( AVS_Clip *clip );
+ AVS_ScriptEnvironment *(__stdcall *avs_create_script_environment)( int version );
+ void (__stdcall *avs_delete_script_environment)( AVS_ScriptEnvironment *env );
+ AVS_VideoFrame *(__stdcall *avs_get_frame)( AVS_Clip *clip, int n );
+ int (__stdcall *avs_get_version)( AVS_Clip *clip );
+ const AVS_VideoInfo *(__stdcall *avs_get_video_info)( AVS_Clip *clip );
+ int (__stdcall *avs_function_exists)( AVS_ScriptEnvironment *env, const char *name );
+ AVS_Value (__stdcall *avs_invoke)( AVS_ScriptEnvironment *env, const char *name,
+ AVS_Value args, const char **arg_names );
+ void (__stdcall *avs_release_clip)( AVS_Clip *clip );
+ void (__stdcall *avs_release_value)( AVS_Value value );
+ void (__stdcall *avs_release_video_frame)( AVS_VideoFrame *frame );
+ AVS_Clip *(__stdcall *avs_take_clip)( AVS_Value, AVS_ScriptEnvironment *env );
+ } func;
+} avs_hnd_t;
+
+/* load the library and functions we require from it */
+static int avs_load_library( avs_hnd_t *h )
+{
+ h->library = LoadLibrary( "avisynth" );
+ if( !h->library )
+ return -1;
+ LOAD_AVS_FUNC( avs_clip_get_error, 0 );
+ LOAD_AVS_FUNC( avs_create_script_environment, 0 );
+ LOAD_AVS_FUNC( avs_delete_script_environment, 1 );
+ LOAD_AVS_FUNC( avs_get_frame, 0 );
+ LOAD_AVS_FUNC( avs_get_version, 0 );
+ LOAD_AVS_FUNC( avs_get_video_info, 0 );
+ LOAD_AVS_FUNC( avs_function_exists, 0 );
+ LOAD_AVS_FUNC( avs_invoke, 0 );
+ LOAD_AVS_FUNC( avs_release_clip, 0 );
+ LOAD_AVS_FUNC( avs_release_value, 0 );
+ LOAD_AVS_FUNC( avs_release_video_frame, 0 );
+ LOAD_AVS_FUNC( avs_take_clip, 0 );
+ return 0;
+fail:
+ FreeLibrary( h->library );
+ return -1;
+}
+
+/* generate a filter sequence to try based on the filename extension */
+static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] )
+{
+ int i=0, j;
+ const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 };
+ if( !strcasecmp( filename_ext, "avi" ) )
+ filter[i++] = "AVISource";
+ if( !strcasecmp( filename_ext, "d2v" ) )
+ filter[i++] = "MPEG2Source";
+ if( !strcasecmp( filename_ext, "dga" ) )
+ filter[i++] = "AVCSource";
+ for( j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ )
+ filter[i++] = all_purpose[j];
+}
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ FILE *fh = fopen( psz_filename, "r" );
+ if( !fh )
+ return -1;
+ else if( !x264_is_regular_file( fh ) )
+ {
+ fprintf( stderr, "avs [error]: AVS input is incompatible with non-regular file `%s'\n", psz_filename );
+ return -1;
+ }
+ fclose( fh );
+
+ avs_hnd_t *h = malloc( sizeof(avs_hnd_t) );
+ if( !h )
+ return -1;
+ if( avs_load_library( h ) )
+ {
+ fprintf( stderr, "avs [error]: failed to load avisynth\n" );
+ return -1;
+ }
+ h->env = h->func.avs_create_script_environment( AVS_INTERFACE_YV12 );
+ if( !h->env )
+ {
+ fprintf( stderr, "avs [error]: failed to initiate avisynth\n" );
+ return -1;
+ }
+ AVS_Value arg = avs_new_value_string( psz_filename );
+ AVS_Value res;
+ char *filename_ext = get_filename_extension( psz_filename );
+
+ if( !strcasecmp( filename_ext, "avs" ) )
+ {
+ res = h->func.avs_invoke( h->env, "Import", arg, NULL );
+ if( avs_is_error( res ) )
+ {
+ fprintf( stderr, "avs [error]: %s\n", avs_as_string( res ) );
+ return -1;
+ }
+ /* check if the user is using a multi-threaded script and apply distributor if necessary.
+ adapted from avisynth's vfw interface */
+ AVS_Value mt_test = h->func.avs_invoke( h->env, "GetMTMode", avs_new_value_bool( 0 ), NULL );
+ int mt_mode = avs_is_int( mt_test ) ? avs_as_int( mt_test ) : 0;
+ h->func.avs_release_value( mt_test );
+ if( mt_mode > 0 && mt_mode < 5 )
+ {
+ AVS_Value temp = h->func.avs_invoke( h->env, "Distributor", res, NULL );
+ h->func.avs_release_value( res );
+ res = temp;
+ }
+ }
+ else /* non script file */
+ {
+ /* cycle through known source filters to find one that works */
+ const char *filter[AVS_MAX_SEQUENCE+1] = { 0 };
+ avs_build_filter_sequence( filename_ext, filter );
+ int i;
+ for( i = 0; filter[i]; i++ )
+ {
+ fprintf( stderr, "avs [info]: trying %s... ", filter[i] );
+ if( !h->func.avs_function_exists( h->env, filter[i] ) )
+ {
+ fprintf( stderr, "not found\n" );
+ continue;
+ }
+ if( !strncasecmp( filter[i], "FFmpegSource", 12 ) )
+ {
+ fprintf( stderr, "indexing... " );
+ fflush( stderr );
+ }
+ res = h->func.avs_invoke( h->env, filter[i], arg, NULL );
+ if( !avs_is_error( res ) )
+ {
+ fprintf( stderr, "succeeded\n" );
+ break;
+ }
+ fprintf( stderr, "failed\n" );
+ }
+ if( !filter[i] )
+ {
+ fprintf( stderr, "avs [error]: unable to find source filter to open `%s'\n", psz_filename );
+ return -1;
+ }
+ }
+ if( !avs_is_clip( res ) )
+ {
+ fprintf( stderr, "avs [error]: `%s' didn't return a video clip\n", psz_filename );
+ return -1;
+ }
+ h->clip = h->func.avs_take_clip( res, h->env );
+ int avs_version = h->func.avs_get_version( h->clip );
+ const AVS_VideoInfo *vi = h->func.avs_get_video_info( h->clip );
+ if( !avs_has_video( vi ) )
+ {
+ fprintf( stderr, "avs [error]: `%s' has no video data\n", psz_filename );
+ return -1;
+ }
+ if( vi->width&1 || vi->height&1 )
+ {
+ fprintf( stderr, "avs [error]: input clip width or height not divisible by 2 (%dx%d)\n",
+ vi->width, vi->height );
+ return -1;
+ }
+ /* always call ConvertToYV12 to convert non YV12 planar colorspaces to YV12 when user's AVS supports them,
+ as all planar colorspaces are flagged as YV12. If it is already YV12 in this case, the call does nothing */
+ if( !avs_is_yv12( vi ) || avs_version >= AVS_INTERFACE_OTHER_PLANAR )
+ {
+ h->func.avs_release_clip( h->clip );
+ fprintf( stderr, "avs %s\n", !avs_is_yv12( vi ) ? "[warning]: converting input clip to YV12"
+ : "[info]: avisynth 2.6+ detected, forcing conversion to YV12" );
+ const char *arg_name[2] = { NULL, "interlaced" };
+ AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
+ AVS_Value res2 = h->func.avs_invoke( h->env, "ConvertToYV12", avs_new_value_array( arg_arr, 2 ), arg_name );
+ if( avs_is_error( res2 ) )
+ {
+ fprintf( stderr, "avs [error]: couldn't convert input clip to YV12\n" );
+ return -1;
+ }
+ h->clip = h->func.avs_take_clip( res2, h->env );
+ h->func.avs_release_value( res2 );
+ vi = h->func.avs_get_video_info( h->clip );
+ }
+ h->func.avs_release_value( res );
+
+ info->width = vi->width;
+ info->height = vi->height;
+ info->fps_num = vi->fps_numerator;
+ info->fps_den = vi->fps_denominator;
+ h->num_frames = vi->num_frames;
+ info->csp = X264_CSP_YV12;
+ info->vfr = 0;
+
+ *p_handle = h;
+ return 0;
+}
+
+static int get_frame_total( hnd_t handle )
+{
+ avs_hnd_t *h = handle;
+ return h->num_frames;
+}
+
+static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+ pic->img.i_csp = i_csp;
+ pic->img.i_plane = 3;
+ pic->param = NULL;
+ return 0;
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+ static int plane[3] = { AVS_PLANAR_Y, AVS_PLANAR_V, AVS_PLANAR_U };
+ avs_hnd_t *h = handle;
+ if( i_frame >= h->num_frames )
+ return -1;
+ AVS_VideoFrame *frm =
+ p_pic->opaque = h->func.avs_get_frame( h->clip, i_frame );
+ int i;
+ const char *err = h->func.avs_clip_get_error( h->clip );
+ if( err )
+ {
+ fprintf( stderr, "avs [error]: %s occurred while reading frame %d\n", err, i_frame );
+ return -1;
+ }
+ for( i = 0; i < 3; i++ )
+ {
+ /* explicitly cast away the const attribute to avoid a warning */
+ p_pic->img.plane[i] = (uint8_t*)avs_get_read_ptr_p( frm, plane[i] );
+ p_pic->img.i_stride[i] = avs_get_pitch_p( frm, plane[i] );
+ }
+ return 0;
+}
+
+static int release_frame( x264_picture_t *pic, hnd_t handle )
+{
+ avs_hnd_t *h = handle;
+ h->func.avs_release_video_frame( pic->opaque );
+ return 0;
+}
+
+static void picture_clean( x264_picture_t *pic )
+{
+ memset( pic, 0, sizeof(x264_picture_t) );
+}
+
+static int close_file( hnd_t handle )
+{
+ avs_hnd_t *h = handle;
+ h->func.avs_release_clip( h->clip );
+ if( h->func.avs_delete_script_environment )
+ h->func.avs_delete_script_environment( h->env );
+ FreeLibrary( h->library );
+ free( h );
+ return 0;
+}
+
+cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/input/ffms.c b/input/ffms.c
new file mode 100644
index 0000000..b680967
--- /dev/null
+++ b/input/ffms.c
@@ -0,0 +1,247 @@
+/*****************************************************************************
+ * ffms.c: x264 ffmpegsource input module
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Mike Gurlitz <mike.gurlitz at gmail.com>
+ * Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include <ffms.h>
+#undef DECLARE_ALIGNED
+#include <libavcodec/avcodec.h>
+#include <libswscale/swscale.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define SetConsoleTitle(t)
+#endif
+
+typedef struct
+{
+ FFMS_VideoSource *video_source;
+ FFMS_Track *track;
+ int total_frames;
+ struct SwsContext *scaler;
+ int pts_offset_flag;
+ int64_t pts_offset;
+ int reduce_pts;
+ int vfr_input;
+
+ int init_width;
+ int init_height;
+
+ int cur_width;
+ int cur_height;
+ int cur_pix_fmt;
+} ffms_hnd_t;
+
+static int FFMS_CC update_progress( int64_t current, int64_t total, void *private )
+{
+ if( current % 10 )
+ return 0;
+ char buf[200];
+ sprintf( buf, "ffms [info]: indexing input file [%.1f%%]", 100.0 * current / total );
+ fprintf( stderr, "%s \r", buf+5 );
+ SetConsoleTitle( buf );
+ fflush( stderr );
+ return 0;
+}
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) );
+ if( !h )
+ return -1;
+ FFMS_Init( 0 );
+ FFMS_ErrorInfo e;
+ e.BufferSize = 0;
+ int seekmode = opt->seek ? FFMS_SEEK_NORMAL : FFMS_SEEK_LINEAR_NO_RW;
+
+ FFMS_Index *index = NULL;
+ if( opt->index )
+ {
+ struct stat index_s, input_s;
+ if( !stat( opt->index, &index_s ) && !stat( psz_filename, &input_s ) &&
+ input_s.st_mtime < index_s.st_mtime )
+ index = FFMS_ReadIndex( opt->index, &e );
+ }
+ if( !index )
+ {
+ index = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, update_progress, NULL, &e );
+ fprintf( stderr, " \r" );
+ if( !index )
+ {
+ fprintf( stderr, "ffms [error]: could not create index\n" );
+ return -1;
+ }
+ if( opt->index && FFMS_WriteIndex( opt->index, index, &e ) )
+ fprintf( stderr, "ffms [warning]: could not write index file\n" );
+ }
+
+ int trackno = FFMS_GetFirstTrackOfType( index, FFMS_TYPE_VIDEO, &e );
+ if( trackno < 0 )
+ {
+ fprintf( stderr, "ffms [error]: could not find video track\n" );
+ return -1;
+ }
+
+ h->video_source = FFMS_CreateVideoSource( psz_filename, trackno, index, 1, seekmode, &e );
+ if( !h->video_source )
+ {
+ fprintf( stderr, "ffms [error]: could not create video source\n" );
+ return -1;
+ }
+
+ h->track = FFMS_GetTrackFromVideo( h->video_source );
+ const FFMS_TrackTimeBase *timebase = FFMS_GetTimeBase( h->track );
+
+ FFMS_DestroyIndex( index );
+ const FFMS_VideoProperties *videop = FFMS_GetVideoProperties( h->video_source );
+ h->total_frames = videop->NumFrames;
+ info->sar_height = videop->SARDen;
+ info->sar_width = videop->SARNum;
+ info->fps_den = videop->FPSDenominator;
+ info->fps_num = videop->FPSNumerator;
+ info->timebase_num = (int)timebase->Num;
+ h->vfr_input = info->vfr;
+
+ const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
+ if( !frame )
+ {
+ fprintf( stderr, "ffms [error]: could not read frame 0\n" );
+ return -1;
+ }
+
+ h->init_width = h->cur_width = info->width = frame->EncodedWidth;
+ h->init_height = h->cur_height = info->height = frame->EncodedHeight;
+ h->cur_pix_fmt = frame->EncodedPixelFormat;
+ info->interlaced = frame->InterlacedFrame;
+
+ if( h->cur_pix_fmt != PIX_FMT_YUV420P )
+ fprintf( stderr, "ffms [warning]: converting from %s to YV12\n",
+ avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
+
+ /* ffms timestamps are in milliseconds. Increasing timebase denominator could cause integer overflow.
+ * Conversely, reducing PTS may lose too much accuracy */
+ if( h->vfr_input )
+ {
+ int64_t timebase_den = (int64_t)timebase->Den * 1000;
+
+ if( timebase_den > INT_MAX )
+ {
+ info->timebase_den = (int)timebase->Den;
+ h->reduce_pts = 1;
+ }
+ else
+ {
+ info->timebase_den = (int)timebase->Den * 1000;
+ h->reduce_pts = 0;
+ }
+ }
+
+ *p_handle = h;
+ return 0;
+}
+
+static int get_frame_total( hnd_t handle )
+{
+ return ((ffms_hnd_t*)handle)->total_frames;
+}
+
+static int check_swscale( ffms_hnd_t *h, const FFMS_Frame *frame, int i_frame )
+{
+ if( h->scaler && h->cur_width == frame->EncodedWidth && h->cur_height == frame->EncodedHeight &&
+ h->cur_pix_fmt == frame->EncodedPixelFormat )
+ return 0;
+ if( h->scaler )
+ {
+ sws_freeContext( h->scaler );
+ fprintf( stderr, "ffms [warning]: stream properties changed to %dx%d, %s at frame %d \n", frame->EncodedWidth,
+ frame->EncodedHeight, avcodec_get_pix_fmt_name( frame->EncodedPixelFormat ), i_frame );
+ h->cur_width = frame->EncodedWidth;
+ h->cur_height = frame->EncodedHeight;
+ h->cur_pix_fmt = frame->EncodedPixelFormat;
+ }
+ h->scaler = sws_getContext( h->cur_width, h->cur_height, h->cur_pix_fmt, h->init_width, h->init_height,
+ PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL );
+ if( !h->scaler )
+ {
+ fprintf( stderr, "ffms [error]: could not open swscale context\n" );
+ return -1;
+ }
+ return 0;
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+ ffms_hnd_t *h = handle;
+ FFMS_ErrorInfo e;
+ e.BufferSize = 0;
+ const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, i_frame, &e );
+ if( !frame )
+ {
+ fprintf( stderr, "ffms [error]: could not read frame %d\n", i_frame );
+ return -1;
+ }
+
+ if( check_swscale( h, frame, i_frame ) )
+ return -1;
+ /* FFMS_VideoSource has a single FFMS_Frame buffer for all calls to GetFrame.
+ * With threaded input, copying the pointers would result in the data changing during encoding.
+ * FIXME: don't do redundant sws_scales for singlethreaded input, or fix FFMS to allow
+ * multiple FFMS_Frame buffers. */
+ sws_scale( h->scaler, (uint8_t**)frame->Data, (int*)frame->Linesize, 0,
+ frame->EncodedHeight, p_pic->img.plane, p_pic->img.i_stride );
+
+ const FFMS_FrameInfo *info = FFMS_GetFrameInfo( h->track, i_frame );
+
+ if( h->vfr_input )
+ {
+ if( info->PTS == AV_NOPTS_VALUE )
+ {
+ fprintf( stderr, "ffms [error]: invalid timestamp. "
+ "Use --force-cfr and specify a framerate with --fps\n" );
+ return -1;
+ }
+
+ if( !h->pts_offset_flag )
+ {
+ h->pts_offset = info->PTS;
+ h->pts_offset_flag = 1;
+ }
+
+ if( h->reduce_pts )
+ p_pic->i_pts = (int64_t)(((info->PTS - h->pts_offset) / 1000) + 0.5);
+ else
+ p_pic->i_pts = info->PTS - h->pts_offset;
+ }
+ return 0;
+}
+
+static int close_file( hnd_t handle )
+{
+ ffms_hnd_t *h = handle;
+ sws_freeContext( h->scaler );
+ FFMS_DestroyVideoSource( h->video_source );
+ free( h );
+ return 0;
+}
+
+cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/input/input.h b/input/input.h
new file mode 100644
index 0000000..9fb425c
--- /dev/null
+++ b/input/input.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ * input.h: x264 file input modules
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ * Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_INPUT_H
+#define X264_INPUT_H
+
+/* options that are used by only some demuxers */
+typedef struct
+{
+ char *index;
+ char *resolution; /* resolution string parsed by raw yuv input */
+ int seek;
+} cli_input_opt_t;
+
+/* properties of the source given by the demuxer */
+typedef struct
+{
+ int csp; /* X264_CSP_YV12 or X264_CSP_I420 */
+ int fps_num;
+ int fps_den;
+ int height;
+ int interlaced;
+ int sar_width;
+ int sar_height;
+ int timebase_num;
+ int timebase_den;
+ int vfr;
+ int width;
+} video_info_t;
+
+typedef struct
+{
+ int (*open_file)( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt );
+ int (*get_frame_total)( hnd_t handle );
+ int (*picture_alloc)( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+ int (*read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
+ int (*release_frame)( x264_picture_t *pic, hnd_t handle );
+ void (*picture_clean)( x264_picture_t *pic );
+ int (*close_file)( hnd_t handle );
+} cli_input_t;
+
+extern cli_input_t yuv_input;
+extern cli_input_t y4m_input;
+extern cli_input_t avs_input;
+extern cli_input_t thread_input;
+extern cli_input_t lavf_input;
+extern cli_input_t ffms_input;
+
+#endif
diff --git a/input/lavf.c b/input/lavf.c
new file mode 100644
index 0000000..180e509
--- /dev/null
+++ b/input/lavf.c
@@ -0,0 +1,272 @@
+/*****************************************************************************
+ * lavf.c: x264 libavformat input module
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Mike Gurlitz <mike.gurlitz at gmail.com>
+ * Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#undef DECLARE_ALIGNED
+#include <libavformat/avformat.h>
+#include <libswscale/swscale.h>
+
+typedef struct
+{
+ AVFormatContext *lavf;
+ int stream_id;
+ int next_frame;
+ int vfr_input;
+ int vertical_flip;
+ struct SwsContext *scaler;
+ int pts_offset_flag;
+ int64_t pts_offset;
+ x264_picture_t *first_pic;
+
+ int init_width;
+ int init_height;
+
+ int cur_width;
+ int cur_height;
+ enum PixelFormat cur_pix_fmt;
+} lavf_hnd_t;
+
+typedef struct
+{
+ AVFrame frame;
+ AVPacket packet;
+} lavf_pic_t;
+
+static int check_swscale( lavf_hnd_t *h, AVCodecContext *c, int i_frame )
+{
+ if( h->scaler && (h->cur_width == c->width) && (h->cur_height == c->height) && (h->cur_pix_fmt == c->pix_fmt) )
+ return 0;
+ if( h->scaler )
+ {
+ sws_freeContext( h->scaler );
+ fprintf( stderr, "lavf [warning]: stream properties changed to %dx%d, %s at frame %d \n",
+ c->width, c->height, avcodec_get_pix_fmt_name( c->pix_fmt ), i_frame );
+ h->cur_width = c->width;
+ h->cur_height = c->height;
+ h->cur_pix_fmt = c->pix_fmt;
+ }
+ h->scaler = sws_getContext( h->cur_width, h->cur_height, h->cur_pix_fmt, h->init_width, h->init_height,
+ PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL );
+ if( !h->scaler )
+ {
+ fprintf( stderr, "lavf [error]: could not open swscale context\n" );
+ return -1;
+ }
+ return 0;
+}
+
+static int read_frame_internal( x264_picture_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info )
+{
+ if( h->first_pic && !info )
+ {
+ /* see if the frame we are requesting is the frame we have already read and stored.
+ * if so, retrieve the pts and image data before freeing it. */
+ if( !i_frame )
+ {
+ XCHG( x264_image_t, p_pic->img, h->first_pic->img );
+ p_pic->i_pts = h->first_pic->i_pts;
+ }
+ lavf_input.picture_clean( h->first_pic );
+ free( h->first_pic );
+ h->first_pic = NULL;
+ if( !i_frame )
+ return 0;
+ }
+
+ AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
+ lavf_pic_t *pic_h = p_pic->opaque;
+ AVPacket *pkt = &pic_h->packet;
+ AVFrame *frame = &pic_h->frame;
+
+ while( i_frame >= h->next_frame )
+ {
+ int finished = 0;
+ while( !finished && av_read_frame( h->lavf, pkt ) >= 0 )
+ if( pkt->stream_index == h->stream_id )
+ {
+ c->reordered_opaque = pkt->pts;
+ if( avcodec_decode_video2( c, frame, &finished, pkt ) < 0 )
+ fprintf( stderr, "lavf [warning]: video decoding failed on frame %d\n", h->next_frame );
+ }
+ if( !finished )
+ {
+ if( avcodec_decode_video2( c, frame, &finished, pkt ) < 0 )
+ fprintf( stderr, "lavf [warning]: video decoding failed on frame %d\n", h->next_frame );
+ if( !finished )
+ return -1;
+ }
+ h->next_frame++;
+ }
+
+ if( check_swscale( h, c, i_frame ) )
+ return -1;
+ /* FIXME: avoid sws_scale where possible (no colorspace conversion). */
+ sws_scale( h->scaler, frame->data, frame->linesize, 0, c->height, p_pic->img.plane, p_pic->img.i_stride );
+
+ if( info )
+ info->interlaced = frame->interlaced_frame;
+
+ if( h->vfr_input )
+ {
+ p_pic->i_pts = 0;
+ if( frame->reordered_opaque != AV_NOPTS_VALUE )
+ p_pic->i_pts = frame->reordered_opaque;
+ else if( pkt->dts != AV_NOPTS_VALUE )
+ p_pic->i_pts = pkt->dts; // for AVI files
+ else if( info )
+ {
+ h->vfr_input = info->vfr = 0;
+ goto exit;
+ }
+ if( !h->pts_offset_flag )
+ {
+ h->pts_offset = p_pic->i_pts;
+ h->pts_offset_flag = 1;
+ }
+ p_pic->i_pts -= h->pts_offset;
+ }
+
+exit:
+ if( pkt->destruct )
+ pkt->destruct( pkt );
+ avcodec_get_frame_defaults( frame );
+ return 0;
+}
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ lavf_hnd_t *h = malloc( sizeof(lavf_hnd_t) );
+ if( !h )
+ return -1;
+ av_register_all();
+ h->scaler = NULL;
+ if( !strcmp( psz_filename, "-" ) )
+ psz_filename = "pipe:";
+
+ if( av_open_input_file( &h->lavf, psz_filename, NULL, 0, NULL ) )
+ {
+ fprintf( stderr, "lavf [error]: could not open input file\n" );
+ return -1;
+ }
+
+ if( av_find_stream_info( h->lavf ) < 0 )
+ {
+ fprintf( stderr, "lavf [error]: could not find input stream info\n" );
+ return -1;
+ }
+
+ int i = 0;
+ while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != CODEC_TYPE_VIDEO )
+ i++;
+ if( i == h->lavf->nb_streams )
+ {
+ fprintf( stderr, "lavf [error]: could not find video stream\n" );
+ return -1;
+ }
+ h->stream_id = i;
+ h->next_frame = 0;
+ h->pts_offset_flag = 0;
+ h->pts_offset = 0;
+ AVCodecContext *c = h->lavf->streams[i]->codec;
+ h->init_width = h->cur_width = info->width = c->width;
+ h->init_height = h->cur_height = info->height = c->height;
+ h->cur_pix_fmt = c->pix_fmt;
+ info->fps_num = h->lavf->streams[i]->r_frame_rate.num;
+ info->fps_den = h->lavf->streams[i]->r_frame_rate.den;
+ info->timebase_num = h->lavf->streams[i]->time_base.num;
+ info->timebase_den = h->lavf->streams[i]->time_base.den;
+ h->vfr_input = info->vfr;
+ h->vertical_flip = 0;
+
+ /* avisynth stores rgb data vertically flipped. */
+ if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) &&
+ (h->cur_pix_fmt == PIX_FMT_BGRA || h->cur_pix_fmt == PIX_FMT_BGR24) )
+ info->csp |= X264_CSP_VFLIP;
+
+ if( h->cur_pix_fmt != PIX_FMT_YUV420P )
+ fprintf( stderr, "lavf [warning]: converting from %s to YV12\n",
+ avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
+
+ if( avcodec_open( c, avcodec_find_decoder( c->codec_id ) ) )
+ {
+ fprintf( stderr, "lavf [error]: could not find decoder for video stream\n" );
+ return -1;
+ }
+
+ /* prefetch the first frame and set/confirm flags */
+ h->first_pic = malloc( sizeof(x264_picture_t) );
+ if( !h->first_pic || lavf_input.picture_alloc( h->first_pic, info->csp, info->width, info->height ) )
+ {
+ fprintf( stderr, "lavf [error]: malloc failed\n" );
+ return -1;
+ }
+ else if( read_frame_internal( h->first_pic, h, 0, info ) )
+ return -1;
+
+ info->sar_height = c->sample_aspect_ratio.den;
+ info->sar_width = c->sample_aspect_ratio.num;
+ *p_handle = h;
+
+ return 0;
+}
+
+static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+ if( x264_picture_alloc( pic, i_csp, i_width, i_height ) )
+ return -1;
+ lavf_pic_t *pic_h = pic->opaque = malloc( sizeof(lavf_pic_t) );
+ if( !pic_h )
+ return -1;
+ avcodec_get_frame_defaults( &pic_h->frame );
+ av_init_packet( &pic_h->packet );
+ return 0;
+}
+
+/* FIXME */
+static int get_frame_total( hnd_t handle )
+{
+ return 0;
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+ return read_frame_internal( p_pic, handle, i_frame, NULL );
+}
+
+static void picture_clean( x264_picture_t *pic )
+{
+ free( pic->opaque );
+ x264_picture_clean( pic );
+}
+
+static int close_file( hnd_t handle )
+{
+ lavf_hnd_t *h = handle;
+ sws_freeContext( h->scaler );
+ avcodec_close( h->lavf->streams[h->stream_id]->codec );
+ av_close_input_file( h->lavf );
+ free( h );
+ return 0;
+}
+
+cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
diff --git a/input/thread.c b/input/thread.c
new file mode 100644
index 0000000..a88cfae
--- /dev/null
+++ b/input/thread.c
@@ -0,0 +1,136 @@
+/*****************************************************************************
+ * thread.c: x264 threaded input module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+extern cli_input_t input;
+
+typedef struct
+{
+ cli_input_t input;
+ hnd_t p_handle;
+ x264_picture_t pic;
+ x264_pthread_t tid;
+ int next_frame;
+ int frame_total;
+ int in_progress;
+ struct thread_input_arg_t *next_args;
+} thread_hnd_t;
+
+typedef struct thread_input_arg_t
+{
+ thread_hnd_t *h;
+ x264_picture_t *pic;
+ int i_frame;
+ int status;
+} thread_input_arg_t;
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ thread_hnd_t *h = malloc( sizeof(thread_hnd_t) );
+ if( !h || input.picture_alloc( &h->pic, info->csp, info->width, info->height ) )
+ {
+ fprintf( stderr, "x264 [error]: malloc failed\n" );
+ return -1;
+ }
+ h->input = input;
+ h->p_handle = *p_handle;
+ h->in_progress = 0;
+ h->next_frame = -1;
+ h->next_args = malloc( sizeof(thread_input_arg_t) );
+ if( !h->next_args )
+ return -1;
+ h->next_args->h = h;
+ h->next_args->status = 0;
+ h->frame_total = input.get_frame_total( h->p_handle );
+ thread_input.picture_alloc = h->input.picture_alloc;
+ thread_input.picture_clean = h->input.picture_clean;
+
+ *p_handle = h;
+ return 0;
+}
+
+static int get_frame_total( hnd_t handle )
+{
+ thread_hnd_t *h = handle;
+ return h->frame_total;
+}
+
+static void read_frame_thread_int( thread_input_arg_t *i )
+{
+ i->status = i->h->input.read_frame( i->pic, i->h->p_handle, i->i_frame );
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+ thread_hnd_t *h = handle;
+ int ret = 0;
+
+ if( h->next_frame >= 0 )
+ {
+ x264_pthread_join( h->tid, NULL );
+ ret |= h->next_args->status;
+ h->in_progress = 0;
+ }
+
+ if( h->next_frame == i_frame )
+ XCHG( x264_picture_t, *p_pic, h->pic );
+ else
+ ret |= h->input.read_frame( p_pic, h->p_handle, i_frame );
+
+ if( !h->frame_total || i_frame+1 < h->frame_total )
+ {
+ h->next_frame =
+ h->next_args->i_frame = i_frame+1;
+ h->next_args->pic = &h->pic;
+ if( x264_pthread_create( &h->tid, NULL, (void*)read_frame_thread_int, h->next_args ) )
+ return -1;
+ h->in_progress = 1;
+ }
+ else
+ h->next_frame = -1;
+
+ return ret;
+}
+
+static int release_frame( x264_picture_t *pic, hnd_t handle )
+{
+ thread_hnd_t *h = handle;
+ if( h->input.release_frame )
+ return h->input.release_frame( pic, h->p_handle );
+ return 0;
+}
+
+static int close_file( hnd_t handle )
+{
+ thread_hnd_t *h = handle;
+ if( h->in_progress )
+ x264_pthread_join( h->tid, NULL );
+ h->input.close_file( h->p_handle );
+ h->input.picture_clean( &h->pic );
+ free( h->next_args );
+ free( h );
+ return 0;
+}
+
+cli_input_t thread_input = { open_file, get_frame_total, NULL, read_frame, release_frame, NULL, close_file };
diff --git a/input/y4m.c b/input/y4m.c
new file mode 100644
index 0000000..1619f74
--- /dev/null
+++ b/input/y4m.c
@@ -0,0 +1,245 @@
+/*****************************************************************************
+ * y4m.c: x264 y4m input module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+typedef struct
+{
+ FILE *fh;
+ int width, height;
+ int next_frame;
+ int seq_header_len, frame_header_len;
+ int frame_size;
+} y4m_hnd_t;
+
+#define Y4M_MAGIC "YUV4MPEG2"
+#define MAX_YUV4_HEADER 80
+#define Y4M_FRAME_MAGIC "FRAME"
+#define MAX_FRAME_HEADER 80
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
+ int i, n, d;
+ char header[MAX_YUV4_HEADER+10];
+ char *tokstart, *tokend, *header_end;
+ if( !h )
+ return -1;
+
+ h->next_frame = 0;
+ info->vfr = 0;
+
+ if( !strcmp( psz_filename, "-" ) )
+ h->fh = stdin;
+ else
+ h->fh = fopen(psz_filename, "rb");
+ if( h->fh == NULL )
+ return -1;
+
+ h->frame_header_len = strlen( Y4M_FRAME_MAGIC )+1;
+
+ /* Read header */
+ for( i = 0; i < MAX_YUV4_HEADER; i++ )
+ {
+ header[i] = fgetc( h->fh );
+ if( header[i] == '\n' )
+ {
+ /* Add a space after last option. Makes parsing "444" vs
+ "444alpha" easier. */
+ header[i+1] = 0x20;
+ header[i+2] = 0;
+ break;
+ }
+ }
+ if( i == MAX_YUV4_HEADER || strncmp( header, Y4M_MAGIC, strlen( Y4M_MAGIC ) ) )
+ return -1;
+
+ /* Scan properties */
+ header_end = &header[i+1]; /* Include space */
+ h->seq_header_len = i+1;
+ for( tokstart = &header[strlen( Y4M_MAGIC )+1]; tokstart < header_end; tokstart++ )
+ {
+ if( *tokstart == 0x20 )
+ continue;
+ switch( *tokstart++ )
+ {
+ case 'W': /* Width. Required. */
+ h->width = info->width = strtol( tokstart, &tokend, 10 );
+ tokstart=tokend;
+ break;
+ case 'H': /* Height. Required. */
+ h->height = info->height = strtol( tokstart, &tokend, 10 );
+ tokstart=tokend;
+ break;
+ case 'C': /* Color space */
+ if( strncmp( "420", tokstart, 3 ) )
+ {
+ fprintf( stderr, "y4m [error]: colorspace unhandled\n" );
+ return -1;
+ }
+ tokstart = strchr( tokstart, 0x20 );
+ break;
+ case 'I': /* Interlace type */
+ switch( *tokstart++ )
+ {
+ case 'p': break;
+ case '?':
+ case 't':
+ case 'b':
+ case 'm':
+ default:
+ info->interlaced = 1;
+ }
+ break;
+ case 'F': /* Frame rate - 0:0 if unknown */
+ if( sscanf( tokstart, "%d:%d", &n, &d ) == 2 && n && d )
+ {
+ x264_reduce_fraction( &n, &d );
+ info->fps_num = n;
+ info->fps_den = d;
+ }
+ tokstart = strchr( tokstart, 0x20 );
+ break;
+ case 'A': /* Pixel aspect - 0:0 if unknown */
+ /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
+ if( sscanf( tokstart, "%d:%d", &n, &d ) == 2 && n && d )
+ {
+ x264_reduce_fraction( &n, &d );
+ info->sar_width = n;
+ info->sar_height = d;
+ }
+ tokstart = strchr( tokstart, 0x20 );
+ break;
+ case 'X': /* Vendor extensions */
+ if( !strncmp( "YSCSS=", tokstart, 6 ) )
+ {
+ /* Older nonstandard pixel format representation */
+ tokstart += 6;
+ if( strncmp( "420JPEG",tokstart, 7 ) &&
+ strncmp( "420MPEG2",tokstart, 8 ) &&
+ strncmp( "420PALDV",tokstart, 8 ) )
+ {
+ fprintf( stderr, "y4m [error]: unsupported extended colorspace\n" );
+ return -1;
+ }
+ }
+ tokstart = strchr( tokstart, 0x20 );
+ break;
+ }
+ }
+
+ *p_handle = h;
+ return 0;
+}
+
+/* Most common case: frame_header = "FRAME" */
+static int get_frame_total( hnd_t handle )
+{
+ y4m_hnd_t *h = handle;
+ int i_frame_total = 0;
+
+ if( x264_is_regular_file( h->fh ) )
+ {
+ uint64_t init_pos = ftell( h->fh );
+ fseek( h->fh, 0, SEEK_END );
+ uint64_t i_size = ftell( h->fh );
+ fseek( h->fh, init_pos, SEEK_SET );
+ i_frame_total = (int)((i_size - h->seq_header_len) /
+ (3*(h->width*h->height)/2+h->frame_header_len));
+ }
+
+ return i_frame_total;
+}
+
+static int read_frame_internal( x264_picture_t *p_pic, y4m_hnd_t *h )
+{
+ int slen = strlen( Y4M_FRAME_MAGIC );
+ int i = 0;
+ char header[16];
+
+ /* Read frame header - without terminating '\n' */
+ if( fread( header, 1, slen, h->fh ) != slen )
+ return -1;
+
+ header[slen] = 0;
+ if( strncmp( header, Y4M_FRAME_MAGIC, slen ) )
+ {
+ fprintf( stderr, "y4m [error]: bad header magic (%"PRIx32" <=> %s)\n",
+ M32(header), header );
+ return -1;
+ }
+
+ /* Skip most of it */
+ while( i < MAX_FRAME_HEADER && fgetc( h->fh ) != '\n' )
+ i++;
+ if( i == MAX_FRAME_HEADER )
+ {
+ fprintf( stderr, "y4m [error]: bad frame header!\n" );
+ return -1;
+ }
+ h->frame_header_len = i+slen+1;
+
+ if( fread( p_pic->img.plane[0], h->width * h->height, 1, h->fh ) <= 0
+ || fread( p_pic->img.plane[1], h->width * h->height / 4, 1, h->fh ) <= 0
+ || fread( p_pic->img.plane[2], h->width * h->height / 4, 1, h->fh ) <= 0 )
+ return -1;
+
+ return 0;
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+ y4m_hnd_t *h = handle;
+
+ if( i_frame > h->next_frame )
+ {
+ if( x264_is_regular_file( h->fh ) )
+ fseek( h->fh, (uint64_t)i_frame*(3*(h->width*h->height)/2+h->frame_header_len)
+ + h->seq_header_len, SEEK_SET );
+ else
+ while( i_frame > h->next_frame )
+ {
+ if( read_frame_internal( p_pic, h ) )
+ return -1;
+ h->next_frame++;
+ }
+ }
+
+ if( read_frame_internal( p_pic, h ) )
+ return -1;
+
+ h->next_frame = i_frame+1;
+ return 0;
+}
+
+static int close_file( hnd_t handle )
+{
+ y4m_hnd_t *h = handle;
+ if( !h || !h->fh )
+ return 0;
+ fclose( h->fh );
+ free( h );
+ return 0;
+}
+
+cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/input/yuv.c b/input/yuv.c
new file mode 100644
index 0000000..dbd0317
--- /dev/null
+++ b/input/yuv.c
@@ -0,0 +1,128 @@
+/*****************************************************************************
+ * yuv.c: x264 yuv input module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+typedef struct
+{
+ FILE *fh;
+ int width, height;
+ int next_frame;
+} yuv_hnd_t;
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ yuv_hnd_t *h = malloc( sizeof(yuv_hnd_t) );
+ if( !h )
+ return -1;
+
+ if( !opt->resolution )
+ {
+ /* try to parse the file name */
+ char *p;
+ for( p = psz_filename; *p; p++ )
+ if( *p >= '0' && *p <= '9' && sscanf( p, "%ux%u", &info->width, &info->height ) == 2 )
+ break;
+ }
+ else
+ sscanf( opt->resolution, "%ux%u", &info->width, &info->height );
+ if( !info->width || !info->height )
+ {
+ fprintf( stderr, "yuv [error]: rawyuv input requires a resolution.\n" );
+ return -1;
+ }
+
+ h->next_frame = 0;
+ info->vfr = 0;
+ h->width = info->width;
+ h->height = info->height;
+
+ if( !strcmp( psz_filename, "-" ) )
+ h->fh = stdin;
+ else
+ h->fh = fopen( psz_filename, "rb" );
+ if( h->fh == NULL )
+ return -1;
+
+ *p_handle = h;
+ return 0;
+}
+
+static int get_frame_total( hnd_t handle )
+{
+ yuv_hnd_t *h = handle;
+ int i_frame_total = 0;
+
+ if( x264_is_regular_file( h->fh ) )
+ {
+ fseek( h->fh, 0, SEEK_END );
+ uint64_t i_size = ftell( h->fh );
+ fseek( h->fh, 0, SEEK_SET );
+ i_frame_total = (int)(i_size / ( h->width * h->height * 3 / 2 ));
+ }
+
+ return i_frame_total;
+}
+
+static int read_frame_internal( x264_picture_t *p_pic, yuv_hnd_t *h )
+{
+ return fread( p_pic->img.plane[0], h->width * h->height, 1, h->fh ) <= 0
+ || fread( p_pic->img.plane[1], h->width * h->height / 4, 1, h->fh ) <= 0
+ || fread( p_pic->img.plane[2], h->width * h->height / 4, 1, h->fh ) <= 0;
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+ yuv_hnd_t *h = handle;
+
+ if( i_frame > h->next_frame )
+ {
+ if( x264_is_regular_file( h->fh ) )
+ fseek( h->fh, (uint64_t)i_frame * h->width * h->height * 3 / 2, SEEK_SET );
+ else
+ while( i_frame > h->next_frame )
+ {
+ if( read_frame_internal( p_pic, h ) )
+ return -1;
+ h->next_frame++;
+ }
+ }
+
+ if( read_frame_internal( p_pic, h ) )
+ return -1;
+
+ h->next_frame = i_frame+1;
+ return 0;
+}
+
+static int close_file( hnd_t handle )
+{
+ yuv_hnd_t *h = handle;
+ if( !h || !h->fh )
+ return 0;
+ fclose( h->fh );
+ free( h );
+ return 0;
+}
+
+cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/matroska.c b/matroska.c
deleted file mode 100644
index 35ae4cd..0000000
--- a/matroska.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/*****************************************************************************
- * matroska.c:
- *****************************************************************************
- * Copyright (C) 2005 Mike Matsnev
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *****************************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "common/osdep.h"
-#include "matroska.h"
-
-#define CLSIZE 1048576
-#define CHECK(x) do { if ((x) < 0) return -1; } while (0)
-
-struct mk_Context {
- struct mk_Context *next, **prev, *parent;
- struct mk_Writer *owner;
- unsigned id;
-
- void *data;
- unsigned d_cur, d_max;
-};
-
-typedef struct mk_Context mk_Context;
-
-struct mk_Writer {
- FILE *fp;
-
- unsigned duration_ptr;
-
- mk_Context *root, *cluster, *frame;
- mk_Context *freelist;
- mk_Context *actlist;
-
- int64_t def_duration;
- int64_t timescale;
- int64_t cluster_tc_scaled;
- int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
-
- char wrote_header, in_frame, keyframe;
-};
-
-static mk_Context *mk_createContext(mk_Writer *w, mk_Context *parent, unsigned id) {
- mk_Context *c;
-
- if (w->freelist) {
- c = w->freelist;
- w->freelist = w->freelist->next;
- } else {
- c = malloc(sizeof(*c));
- memset(c, 0, sizeof(*c));
- }
-
- if (c == NULL)
- return NULL;
-
- c->parent = parent;
- c->owner = w;
- c->id = id;
-
- if (c->owner->actlist)
- c->owner->actlist->prev = &c->next;
- c->next = c->owner->actlist;
- c->prev = &c->owner->actlist;
- c->owner->actlist = c;
-
- return c;
-}
-
-static int mk_appendContextData(mk_Context *c, const void *data, unsigned size) {
- unsigned ns = c->d_cur + size;
-
- if (ns > c->d_max) {
- void *dp;
- unsigned dn = c->d_max ? c->d_max << 1 : 16;
- while (ns > dn)
- dn <<= 1;
-
- dp = realloc(c->data, dn);
- if (dp == NULL)
- return -1;
-
- c->data = dp;
- c->d_max = dn;
- }
-
- memcpy((char*)c->data + c->d_cur, data, size);
-
- c->d_cur = ns;
-
- return 0;
-}
-
-static int mk_writeID(mk_Context *c, unsigned id) {
- unsigned char c_id[4] = { id >> 24, id >> 16, id >> 8, id };
-
- if (c_id[0])
- return mk_appendContextData(c, c_id, 4);
- if (c_id[1])
- return mk_appendContextData(c, c_id+1, 3);
- if (c_id[2])
- return mk_appendContextData(c, c_id+2, 2);
- return mk_appendContextData(c, c_id+3, 1);
-}
-
-static int mk_writeSize(mk_Context *c, unsigned size) {
- unsigned char c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };
-
- if (size < 0x7f) {
- c_size[4] |= 0x80;
- return mk_appendContextData(c, c_size+4, 1);
- }
- if (size < 0x3fff) {
- c_size[3] |= 0x40;
- return mk_appendContextData(c, c_size+3, 2);
- }
- if (size < 0x1fffff) {
- c_size[2] |= 0x20;
- return mk_appendContextData(c, c_size+2, 3);
- }
- if (size < 0x0fffffff) {
- c_size[1] |= 0x10;
- return mk_appendContextData(c, c_size+1, 4);
- }
- return mk_appendContextData(c, c_size, 5);
-}
-
-static int mk_flushContextID(mk_Context *c) {
- unsigned char ff = 0xff;
-
- if (c->id == 0)
- return 0;
-
- CHECK(mk_writeID(c->parent, c->id));
- CHECK(mk_appendContextData(c->parent, &ff, 1));
-
- c->id = 0;
-
- return 0;
-}
-
-static int mk_flushContextData(mk_Context *c) {
- if (c->d_cur == 0)
- return 0;
-
- if (c->parent)
- CHECK(mk_appendContextData(c->parent, c->data, c->d_cur));
- else
- if (fwrite(c->data, c->d_cur, 1, c->owner->fp) != 1)
- return -1;
-
- c->d_cur = 0;
-
- return 0;
-}
-
-static int mk_closeContext(mk_Context *c, unsigned *off) {
- if (c->id) {
- CHECK(mk_writeID(c->parent, c->id));
- CHECK(mk_writeSize(c->parent, c->d_cur));
- }
-
- if (c->parent && off != NULL)
- *off += c->parent->d_cur;
-
- CHECK(mk_flushContextData(c));
-
- if (c->next)
- c->next->prev = c->prev;
- *(c->prev) = c->next;
- c->next = c->owner->freelist;
- c->owner->freelist = c;
-
- return 0;
-}
-
-static void mk_destroyContexts(mk_Writer *w) {
- mk_Context *cur, *next;
-
- for (cur = w->freelist; cur; cur = next) {
- next = cur->next;
- free(cur->data);
- free(cur);
- }
-
- for (cur = w->actlist; cur; cur = next) {
- next = cur->next;
- free(cur->data);
- free(cur);
- }
-
- w->freelist = w->actlist = w->root = NULL;
-}
-
-static int mk_writeStr(mk_Context *c, unsigned id, const char *str) {
- size_t len = strlen(str);
-
- CHECK(mk_writeID(c, id));
- CHECK(mk_writeSize(c, len));
- CHECK(mk_appendContextData(c, str, len));
- return 0;
-}
-
-static int mk_writeBin(mk_Context *c, unsigned id, const void *data, unsigned size) {
- CHECK(mk_writeID(c, id));
- CHECK(mk_writeSize(c, size));
- CHECK(mk_appendContextData(c, data, size));
- return 0;
-}
-
-static int mk_writeUInt(mk_Context *c, unsigned id, int64_t ui) {
- unsigned char c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
- unsigned i = 0;
-
- CHECK(mk_writeID(c, id));
- while (i < 7 && c_ui[i] == 0)
- ++i;
- CHECK(mk_writeSize(c, 8 - i));
- CHECK(mk_appendContextData(c, c_ui+i, 8 - i));
- return 0;
-}
-
-static int mk_writeSInt(mk_Context *c, unsigned id, int64_t si) {
- unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
- unsigned i = 0;
-
- CHECK(mk_writeID(c, id));
- if (si < 0)
- while (i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80)
- ++i;
- else
- while (i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80))
- ++i;
- CHECK(mk_writeSize(c, 8 - i));
- CHECK(mk_appendContextData(c, c_si+i, 8 - i));
- return 0;
-}
-
-static int mk_writeFloatRaw(mk_Context *c, float f) {
- union {
- float f;
- unsigned u;
- } u;
- unsigned char c_f[4];
-
- u.f = f;
- c_f[0] = u.u >> 24;
- c_f[1] = u.u >> 16;
- c_f[2] = u.u >> 8;
- c_f[3] = u.u;
-
- return mk_appendContextData(c, c_f, 4);
-}
-
-static int mk_writeFloat(mk_Context *c, unsigned id, float f) {
- CHECK(mk_writeID(c, id));
- CHECK(mk_writeSize(c, 4));
- CHECK(mk_writeFloatRaw(c, f));
- return 0;
-}
-
-static unsigned mk_ebmlSizeSize(unsigned s) {
- if (s < 0x7f)
- return 1;
- if (s < 0x3fff)
- return 2;
- if (s < 0x1fffff)
- return 3;
- if (s < 0x0fffffff)
- return 4;
- return 5;
-}
-
-static unsigned mk_ebmlSIntSize(int64_t si) {
- unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
- unsigned i = 0;
-
- if (si < 0)
- while (i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80)
- ++i;
- else
- while (i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80))
- ++i;
-
- return 8 - i;
-}
-
-mk_Writer *mk_createWriter(const char *filename) {
- mk_Writer *w = malloc(sizeof(*w));
- if (w == NULL)
- return NULL;
-
- memset(w, 0, sizeof(*w));
-
- w->root = mk_createContext(w, NULL, 0);
- if (w->root == NULL) {
- free(w);
- return NULL;
- }
-
- w->fp = fopen(filename, "wb");
- if (w->fp == NULL) {
- mk_destroyContexts(w);
- free(w);
- return NULL;
- }
-
- w->timescale = 1000000;
-
- return w;
-}
-
-int mk_writeHeader(mk_Writer *w, const char *writingApp,
- const char *codecID,
- const void *codecPrivate, unsigned codecPrivateSize,
- int64_t default_frame_duration,
- int64_t timescale,
- unsigned width, unsigned height,
- unsigned d_width, unsigned d_height)
-{
- mk_Context *c, *ti, *v;
-
- if (w->wrote_header)
- return -1;
-
- w->timescale = timescale;
- w->def_duration = default_frame_duration;
-
- if ((c = mk_createContext(w, w->root, 0x1a45dfa3)) == NULL) // EBML
- return -1;
- CHECK(mk_writeUInt(c, 0x4286, 1)); // EBMLVersion
- CHECK(mk_writeUInt(c, 0x42f7, 1)); // EBMLReadVersion
- CHECK(mk_writeUInt(c, 0x42f2, 4)); // EBMLMaxIDLength
- CHECK(mk_writeUInt(c, 0x42f3, 8)); // EBMLMaxSizeLength
- CHECK(mk_writeStr(c, 0x4282, "matroska")); // DocType
- CHECK(mk_writeUInt(c, 0x4287, 1)); // DocTypeVersion
- CHECK(mk_writeUInt(c, 0x4285, 1)); // DocTypeReadversion
- CHECK(mk_closeContext(c, 0));
-
- if ((c = mk_createContext(w, w->root, 0x18538067)) == NULL) // Segment
- return -1;
- CHECK(mk_flushContextID(c));
- CHECK(mk_closeContext(c, 0));
-
- if ((c = mk_createContext(w, w->root, 0x1549a966)) == NULL) // SegmentInfo
- return -1;
- CHECK(mk_writeStr(c, 0x4d80, "Haali Matroska Writer b0"));
- CHECK(mk_writeStr(c, 0x5741, writingApp));
- CHECK(mk_writeUInt(c, 0x2ad7b1, w->timescale));
- CHECK(mk_writeFloat(c, 0x4489, 0));
- w->duration_ptr = c->d_cur - 4;
- CHECK(mk_closeContext(c, &w->duration_ptr));
-
- if ((c = mk_createContext(w, w->root, 0x1654ae6b)) == NULL) // tracks
- return -1;
- if ((ti = mk_createContext(w, c, 0xae)) == NULL) // TrackEntry
- return -1;
- CHECK(mk_writeUInt(ti, 0xd7, 1)); // TrackNumber
- CHECK(mk_writeUInt(ti, 0x73c5, 1)); // TrackUID
- CHECK(mk_writeUInt(ti, 0x83, 1)); // TrackType
- CHECK(mk_writeUInt(ti, 0x9c, 0)); // FlagLacing
- CHECK(mk_writeStr(ti, 0x86, codecID)); // CodecID
- if (codecPrivateSize)
- CHECK(mk_writeBin(ti, 0x63a2, codecPrivate, codecPrivateSize)); // CodecPrivate
- if (default_frame_duration)
- CHECK(mk_writeUInt(ti, 0x23e383, default_frame_duration)); // DefaultDuration
-
- if ((v = mk_createContext(w, ti, 0xe0)) == NULL) // Video
- return -1;
- CHECK(mk_writeUInt(v, 0xb0, width));
- CHECK(mk_writeUInt(v, 0xba, height));
- CHECK(mk_writeUInt(v, 0x54b0, d_width));
- CHECK(mk_writeUInt(v, 0x54ba, d_height));
- CHECK(mk_closeContext(v, 0));
-
- CHECK(mk_closeContext(ti, 0));
-
- CHECK(mk_closeContext(c, 0));
-
- CHECK(mk_flushContextData(w->root));
-
- w->wrote_header = 1;
-
- return 0;
-}
-
-static int mk_closeCluster(mk_Writer *w) {
- if (w->cluster == NULL)
- return 0;
- CHECK(mk_closeContext(w->cluster, 0));
- w->cluster = NULL;
- CHECK(mk_flushContextData(w->root));
- return 0;
-}
-
-static int mk_flushFrame(mk_Writer *w) {
- int64_t delta, ref = 0;
- unsigned fsize, bgsize;
- unsigned char c_delta_flags[3];
-
- if (!w->in_frame)
- return 0;
-
- delta = w->frame_tc/w->timescale - w->cluster_tc_scaled;
- if (delta > 32767ll || delta < -32768ll)
- CHECK(mk_closeCluster(w));
-
- if (w->cluster == NULL) {
- w->cluster_tc_scaled = w->frame_tc / w->timescale;
- w->cluster = mk_createContext(w, w->root, 0x1f43b675); // Cluster
- if (w->cluster == NULL)
- return -1;
-
- CHECK(mk_writeUInt(w->cluster, 0xe7, w->cluster_tc_scaled)); // Timecode
-
- delta = 0;
- }
-
- fsize = w->frame ? w->frame->d_cur : 0;
- bgsize = fsize + 4 + mk_ebmlSizeSize(fsize + 4) + 1;
- if (!w->keyframe) {
- ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
- bgsize += 1 + 1 + mk_ebmlSIntSize(ref);
- }
-
- CHECK(mk_writeID(w->cluster, 0xa0)); // BlockGroup
- CHECK(mk_writeSize(w->cluster, bgsize));
- CHECK(mk_writeID(w->cluster, 0xa1)); // Block
- CHECK(mk_writeSize(w->cluster, fsize + 4));
- CHECK(mk_writeSize(w->cluster, 1)); // track number
-
- c_delta_flags[0] = delta >> 8;
- c_delta_flags[1] = delta;
- c_delta_flags[2] = 0;
- CHECK(mk_appendContextData(w->cluster, c_delta_flags, 3));
- if (w->frame) {
- CHECK(mk_appendContextData(w->cluster, w->frame->data, w->frame->d_cur));
- w->frame->d_cur = 0;
- }
- if (!w->keyframe)
- CHECK(mk_writeSInt(w->cluster, 0xfb, ref)); // ReferenceBlock
-
- w->in_frame = 0;
- w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
-
- if (w->cluster->d_cur > CLSIZE)
- CHECK(mk_closeCluster(w));
-
- return 0;
-}
-
-int mk_startFrame(mk_Writer *w) {
- if (mk_flushFrame(w) < 0)
- return -1;
-
- w->in_frame = 1;
- w->keyframe = 0;
-
- return 0;
-}
-
-int mk_setFrameFlags(mk_Writer *w,int64_t timestamp, int keyframe) {
- if (!w->in_frame)
- return -1;
-
- w->frame_tc = timestamp;
- w->keyframe = keyframe != 0;
-
- if (w->max_frame_tc < timestamp)
- w->max_frame_tc = timestamp;
-
- return 0;
-}
-
-int mk_addFrameData(mk_Writer *w, const void *data, unsigned size) {
- if (!w->in_frame)
- return -1;
-
- if (w->frame == NULL)
- if ((w->frame = mk_createContext(w, NULL, 0)) == NULL)
- return -1;
-
- return mk_appendContextData(w->frame, data, size);
-}
-
-int mk_close(mk_Writer *w) {
- int ret = 0;
- if (mk_flushFrame(w) < 0 || mk_closeCluster(w) < 0)
- ret = -1;
- if (w->wrote_header) {
- fseek(w->fp, w->duration_ptr, SEEK_SET);
- if (mk_writeFloatRaw(w->root, (float)((double)(w->max_frame_tc+w->def_duration) / w->timescale)) < 0 ||
- mk_flushContextData(w->root) < 0)
- ret = -1;
- }
- mk_destroyContexts(w);
- fclose(w->fp);
- free(w);
- return ret;
-}
-
diff --git a/muxers.c b/muxers.c
deleted file mode 100644
index d62be5c..0000000
--- a/muxers.c
+++ /dev/null
@@ -1,1019 +0,0 @@
-/*****************************************************************************
- * muxers.c: h264 file i/o plugins
- *****************************************************************************
- * Copyright (C) 2003-2008 x264 project
- *
- * Authors: Laurent Aimar <fenrir at via.ecp.fr>
- * Loren Merritt <lorenm at u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "x264.h"
-#include "matroska.h"
-#include "muxers.h"
-
-#ifndef _MSC_VER
-#include "config.h"
-#endif
-
-#include <sys/types.h>
-
-#ifdef AVIS_INPUT
-#include <windows.h>
-#include <vfw.h>
-#endif
-
-#ifdef MP4_OUTPUT
-#include <gpac/isomedia.h>
-#endif
-
-static int64_t gcd( int64_t a, int64_t b )
-{
- while (1)
- {
- int64_t c = a % b;
- if( !c )
- return b;
- a = b;
- b = c;
- }
-}
-
-typedef struct {
- FILE *fh;
- int width, height;
- int next_frame;
-} yuv_input_t;
-
-/* raw 420 yuv file operation */
-int open_file_yuv( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
- yuv_input_t *h = malloc(sizeof(yuv_input_t));
- h->width = p_param->i_width;
- h->height = p_param->i_height;
- h->next_frame = 0;
-
- if( !strcmp(psz_filename, "-") )
- h->fh = stdin;
- else
- h->fh = fopen(psz_filename, "rb");
- if( h->fh == NULL )
- return -1;
-
- *p_handle = (hnd_t)h;
- return 0;
-}
-
-int get_frame_total_yuv( hnd_t handle )
-{
- yuv_input_t *h = handle;
- int i_frame_total = 0;
-
- if( !fseek( h->fh, 0, SEEK_END ) )
- {
- uint64_t i_size = ftell( h->fh );
- fseek( h->fh, 0, SEEK_SET );
- i_frame_total = (int)(i_size / ( h->width * h->height * 3 / 2 ));
- }
-
- return i_frame_total;
-}
-
-int read_frame_yuv( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
- yuv_input_t *h = handle;
-
- if( i_frame != h->next_frame )
- if( fseek( h->fh, (uint64_t)i_frame * h->width * h->height * 3 / 2, SEEK_SET ) )
- return -1;
-
- if( fread( p_pic->img.plane[0], 1, h->width * h->height, h->fh ) <= 0
- || fread( p_pic->img.plane[1], 1, h->width * h->height / 4, h->fh ) <= 0
- || fread( p_pic->img.plane[2], 1, h->width * h->height / 4, h->fh ) <= 0 )
- return -1;
-
- h->next_frame = i_frame+1;
-
- return 0;
-}
-
-int close_file_yuv(hnd_t handle)
-{
- yuv_input_t *h = handle;
- if( !h || !h->fh )
- return 0;
- fclose( h->fh );
- free( h );
- return 0;
-}
-
-/* YUV4MPEG2 raw 420 yuv file operation */
-typedef struct {
- FILE *fh;
- int width, height;
- int next_frame;
- int seq_header_len, frame_header_len;
- int frame_size;
-} y4m_input_t;
-
-#define Y4M_MAGIC "YUV4MPEG2"
-#define MAX_YUV4_HEADER 80
-#define Y4M_FRAME_MAGIC "FRAME"
-#define MAX_FRAME_HEADER 80
-
-int open_file_y4m( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
- int i, n, d;
- char header[MAX_YUV4_HEADER+10];
- char *tokstart, *tokend, *header_end;
- y4m_input_t *h = malloc(sizeof(y4m_input_t));
-
- h->next_frame = 0;
-
- if( !strcmp(psz_filename, "-") )
- h->fh = stdin;
- else
- h->fh = fopen(psz_filename, "rb");
- if( h->fh == NULL )
- return -1;
-
- h->frame_header_len = strlen(Y4M_FRAME_MAGIC)+1;
-
- /* Read header */
- for( i=0; i<MAX_YUV4_HEADER; i++ )
- {
- header[i] = fgetc(h->fh);
- if( header[i] == '\n' )
- {
- /* Add a space after last option. Makes parsing "444" vs
- "444alpha" easier. */
- header[i+1] = 0x20;
- header[i+2] = 0;
- break;
- }
- }
- if( i == MAX_YUV4_HEADER || strncmp(header, Y4M_MAGIC, strlen(Y4M_MAGIC)) )
- return -1;
-
- /* Scan properties */
- header_end = &header[i+1]; /* Include space */
- h->seq_header_len = i+1;
- for( tokstart = &header[strlen(Y4M_MAGIC)+1]; tokstart < header_end; tokstart++ )
- {
- if(*tokstart==0x20) continue;
- switch(*tokstart++)
- {
- case 'W': /* Width. Required. */
- h->width = p_param->i_width = strtol(tokstart, &tokend, 10);
- tokstart=tokend;
- break;
- case 'H': /* Height. Required. */
- h->height = p_param->i_height = strtol(tokstart, &tokend, 10);
- tokstart=tokend;
- break;
- case 'C': /* Color space */
- if( strncmp("420", tokstart, 3) )
- {
- fprintf(stderr, "Colorspace unhandled\n");
- return -1;
- }
- tokstart = strchr(tokstart, 0x20);
- break;
- case 'I': /* Interlace type */
- switch(*tokstart++)
- {
- case 'p': break;
- case '?':
- case 't':
- case 'b':
- case 'm':
- default:
- fprintf(stderr, "Warning, this sequence might be interlaced\n");
- }
- break;
- case 'F': /* Frame rate - 0:0 if unknown */
- if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d )
- {
- x264_reduce_fraction( &n, &d );
- p_param->i_fps_num = n;
- p_param->i_fps_den = d;
- }
- tokstart = strchr(tokstart, 0x20);
- break;
- case 'A': /* Pixel aspect - 0:0 if unknown */
- /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
- if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d && !p_param->vui.i_sar_width && !p_param->vui.i_sar_height )
- {
- x264_reduce_fraction( &n, &d );
- p_param->vui.i_sar_width = n;
- p_param->vui.i_sar_height = d;
- }
- tokstart = strchr(tokstart, 0x20);
- break;
- case 'X': /* Vendor extensions */
- if( !strncmp("YSCSS=",tokstart,6) )
- {
- /* Older nonstandard pixel format representation */
- tokstart += 6;
- if( strncmp("420JPEG",tokstart,7) &&
- strncmp("420MPEG2",tokstart,8) &&
- strncmp("420PALDV",tokstart,8) )
- {
- fprintf(stderr, "Unsupported extended colorspace\n");
- return -1;
- }
- }
- tokstart = strchr(tokstart, 0x20);
- break;
- }
- }
-
- fprintf(stderr, "yuv4mpeg: %ix%i@%i/%ifps, %i:%i\n",
- h->width, h->height, p_param->i_fps_num, p_param->i_fps_den,
- p_param->vui.i_sar_width, p_param->vui.i_sar_height);
-
- *p_handle = (hnd_t)h;
- return 0;
-}
-
-/* Most common case: frame_header = "FRAME" */
-int get_frame_total_y4m( hnd_t handle )
-{
- y4m_input_t *h = handle;
- int i_frame_total = 0;
- uint64_t init_pos = ftell(h->fh);
-
- if( !fseek( h->fh, 0, SEEK_END ) )
- {
- uint64_t i_size = ftell( h->fh );
- fseek( h->fh, init_pos, SEEK_SET );
- i_frame_total = (int)((i_size - h->seq_header_len) /
- (3*(h->width*h->height)/2+h->frame_header_len));
- }
-
- return i_frame_total;
-}
-
-int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
- int slen = strlen(Y4M_FRAME_MAGIC);
- int i = 0;
- char header[16];
- y4m_input_t *h = handle;
-
- if( i_frame != h->next_frame )
- {
- if (fseek(h->fh, (uint64_t)i_frame*(3*(h->width*h->height)/2+h->frame_header_len)
- + h->seq_header_len, SEEK_SET))
- return -1;
- }
-
- /* Read frame header - without terminating '\n' */
- if (fread(header, 1, slen, h->fh) != slen)
- return -1;
-
- header[slen] = 0;
- if (strncmp(header, Y4M_FRAME_MAGIC, slen))
- {
- fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
- *((uint32_t*)header), header);
- return -1;
- }
-
- /* Skip most of it */
- while (i<MAX_FRAME_HEADER && fgetc(h->fh) != '\n')
- i++;
- if (i == MAX_FRAME_HEADER)
- {
- fprintf(stderr, "Bad frame header!\n");
- return -1;
- }
- h->frame_header_len = i+slen+1;
-
- if( fread(p_pic->img.plane[0], 1, h->width*h->height, h->fh) <= 0
- || fread(p_pic->img.plane[1], 1, h->width * h->height / 4, h->fh) <= 0
- || fread(p_pic->img.plane[2], 1, h->width * h->height / 4, h->fh) <= 0)
- return -1;
-
- h->next_frame = i_frame+1;
-
- return 0;
-}
-
-int close_file_y4m(hnd_t handle)
-{
- y4m_input_t *h = handle;
- if( !h || !h->fh )
- return 0;
- fclose( h->fh );
- free( h );
- return 0;
-}
-
-/* avs/avi input file support under cygwin */
-
-#ifdef AVIS_INPUT
-typedef struct {
- PAVISTREAM p_avi;
- int width, height;
-} avis_input_t;
-
-int open_file_avis( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
- avis_input_t *h = malloc(sizeof(avis_input_t));
- AVISTREAMINFO info;
- int i;
-
- *p_handle = (hnd_t)h;
-
- AVIFileInit();
- if( AVIStreamOpenFromFile( &h->p_avi, psz_filename, streamtypeVIDEO, 0, OF_READ, NULL ) )
- {
- AVIFileExit();
- return -1;
- }
-
- if( AVIStreamInfo(h->p_avi, &info, sizeof(AVISTREAMINFO)) )
- {
- AVIStreamRelease(h->p_avi);
- AVIFileExit();
- return -1;
- }
-
- // check input format
- if (info.fccHandler != MAKEFOURCC('Y', 'V', '1', '2'))
- {
- fprintf( stderr, "avis [error]: unsupported input format (%c%c%c%c)\n",
- (char)(info.fccHandler & 0xff), (char)((info.fccHandler >> 8) & 0xff),
- (char)((info.fccHandler >> 16) & 0xff), (char)((info.fccHandler >> 24)) );
-
- AVIStreamRelease(h->p_avi);
- AVIFileExit();
-
- return -1;
- }
-
- h->width =
- p_param->i_width = info.rcFrame.right - info.rcFrame.left;
- h->height =
- p_param->i_height = info.rcFrame.bottom - info.rcFrame.top;
- i = gcd(info.dwRate, info.dwScale);
- p_param->i_fps_den = info.dwScale / i;
- p_param->i_fps_num = info.dwRate / i;
-
- fprintf( stderr, "avis [info]: %dx%d @ %.2f fps (%d frames)\n",
- p_param->i_width, p_param->i_height,
- (double)p_param->i_fps_num / (double)p_param->i_fps_den,
- (int)info.dwLength );
-
- return 0;
-}
-
-int get_frame_total_avis( hnd_t handle )
-{
- avis_input_t *h = handle;
- AVISTREAMINFO info;
-
- if( AVIStreamInfo(h->p_avi, &info, sizeof(AVISTREAMINFO)) )
- return -1;
-
- return info.dwLength;
-}
-
-int read_frame_avis( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
- avis_input_t *h = handle;
-
- p_pic->img.i_csp = X264_CSP_YV12;
-
- if( AVIStreamRead(h->p_avi, i_frame, 1, p_pic->img.plane[0], h->width * h->height * 3 / 2, NULL, NULL ) )
- return -1;
-
- return 0;
-}
-
-int close_file_avis( hnd_t handle )
-{
- avis_input_t *h = handle;
- AVIStreamRelease(h->p_avi);
- AVIFileExit();
- free(h);
- return 0;
-}
-#endif
-
-
-#ifdef HAVE_PTHREAD
-typedef struct {
- int (*p_read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
- int (*p_close_infile)( hnd_t handle );
- hnd_t p_handle;
- x264_picture_t pic;
- x264_pthread_t tid;
- int next_frame;
- int frame_total;
- int in_progress;
- struct thread_input_arg_t *next_args;
-} thread_input_t;
-
-typedef struct thread_input_arg_t {
- thread_input_t *h;
- x264_picture_t *pic;
- int i_frame;
- int status;
-} thread_input_arg_t;
-
-int open_file_thread( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
- thread_input_t *h = malloc(sizeof(thread_input_t));
- x264_picture_alloc( &h->pic, X264_CSP_I420, p_param->i_width, p_param->i_height );
- h->p_read_frame = p_read_frame;
- h->p_close_infile = p_close_infile;
- h->p_handle = *p_handle;
- h->in_progress = 0;
- h->next_frame = -1;
- h->next_args = malloc(sizeof(thread_input_arg_t));
- h->next_args->h = h;
- h->next_args->status = 0;
- h->frame_total = p_get_frame_total( h->p_handle );
-
- *p_handle = (hnd_t)h;
- return 0;
-}
-
-int get_frame_total_thread( hnd_t handle )
-{
- thread_input_t *h = handle;
- return h->frame_total;
-}
-
-static void read_frame_thread_int( thread_input_arg_t *i )
-{
- i->status = i->h->p_read_frame( i->pic, i->h->p_handle, i->i_frame );
-}
-
-int read_frame_thread( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
- thread_input_t *h = handle;
- UNUSED void *stuff;
- int ret = 0;
-
- if( h->next_frame >= 0 )
- {
- x264_pthread_join( h->tid, &stuff );
- ret |= h->next_args->status;
- h->in_progress = 0;
- }
-
- if( h->next_frame == i_frame )
- {
- XCHG( x264_picture_t, *p_pic, h->pic );
- }
- else
- {
- ret |= h->p_read_frame( p_pic, h->p_handle, i_frame );
- }
-
- if( !h->frame_total || i_frame+1 < h->frame_total )
- {
- h->next_frame =
- h->next_args->i_frame = i_frame+1;
- h->next_args->pic = &h->pic;
- x264_pthread_create( &h->tid, NULL, (void*)read_frame_thread_int, h->next_args );
- h->in_progress = 1;
- }
- else
- h->next_frame = -1;
-
- return ret;
-}
-
-int close_file_thread( hnd_t handle )
-{
- thread_input_t *h = handle;
- if( h->in_progress )
- x264_pthread_join( h->tid, NULL );
- h->p_close_infile( h->p_handle );
- x264_picture_clean( &h->pic );
- free( h->next_args );
- free( h );
- return 0;
-}
-#endif
-
-
-int open_file_bsf( char *psz_filename, hnd_t *p_handle )
-{
- if ((*p_handle = fopen(psz_filename, "w+b")) == NULL)
- return -1;
-
- return 0;
-}
-
-int set_param_bsf( hnd_t handle, x264_param_t *p_param )
-{
- return 0;
-}
-
-int write_nalu_bsf( hnd_t handle, uint8_t *p_nalu, int i_size )
-{
- if (fwrite(p_nalu, i_size, 1, (FILE *)handle) > 0)
- return i_size;
- return -1;
-}
-
-int set_eop_bsf( hnd_t handle, x264_picture_t *p_picture )
-{
- return 0;
-}
-
-int close_file_bsf( hnd_t handle )
-{
- if ((handle == NULL) || (handle == stdout))
- return 0;
-
- return fclose((FILE *)handle);
-}
-
-/* -- mp4 muxing support ------------------------------------------------- */
-#ifdef MP4_OUTPUT
-
-typedef struct
-{
- GF_ISOFile *p_file;
- GF_AVCConfig *p_config;
- GF_ISOSample *p_sample;
- int i_track;
- uint32_t i_descidx;
- int i_time_inc;
- int i_time_res;
- int i_numframe;
- int i_init_delay;
- uint8_t b_sps;
- uint8_t b_pps;
-} mp4_t;
-
-
-static void recompute_bitrate_mp4(GF_ISOFile *p_file, int i_track)
-{
- u32 i, count, di, timescale, time_wnd, rate;
- u64 offset;
- Double br;
- GF_ESD *esd;
-
- esd = gf_isom_get_esd(p_file, i_track, 1);
- if (!esd) return;
-
- esd->decoderConfig->avgBitrate = 0;
- esd->decoderConfig->maxBitrate = 0;
- rate = time_wnd = 0;
-
- timescale = gf_isom_get_media_timescale(p_file, i_track);
- count = gf_isom_get_sample_count(p_file, i_track);
- for (i=0; i<count; i++) {
- GF_ISOSample *samp = gf_isom_get_sample_info(p_file, i_track, i+1, &di, &offset);
-
- if (samp->dataLength>esd->decoderConfig->bufferSizeDB) esd->decoderConfig->bufferSizeDB = samp->dataLength;
-
- if (esd->decoderConfig->bufferSizeDB < samp->dataLength) esd->decoderConfig->bufferSizeDB = samp->dataLength;
- esd->decoderConfig->avgBitrate += samp->dataLength;
- rate += samp->dataLength;
- if (samp->DTS > time_wnd + timescale) {
- if (rate > esd->decoderConfig->maxBitrate) esd->decoderConfig->maxBitrate = rate;
- time_wnd = samp->DTS;
- rate = 0;
- }
-
- gf_isom_sample_del(&samp);
- }
-
- br = (Double) (s64) gf_isom_get_media_duration(p_file, i_track);
- br /= timescale;
- esd->decoderConfig->avgBitrate = (u32) (esd->decoderConfig->avgBitrate / br);
- /*move to bps*/
- esd->decoderConfig->avgBitrate *= 8;
- esd->decoderConfig->maxBitrate *= 8;
-
- gf_isom_change_mpeg4_description(p_file, i_track, 1, esd);
- gf_odf_desc_del((GF_Descriptor *) esd);
-}
-
-
-int close_file_mp4( hnd_t handle )
-{
- mp4_t *p_mp4 = (mp4_t *)handle;
-
- if (p_mp4 == NULL)
- return 0;
-
- if (p_mp4->p_config)
- gf_odf_avc_cfg_del(p_mp4->p_config);
-
- if (p_mp4->p_sample)
- {
- if (p_mp4->p_sample->data)
- free(p_mp4->p_sample->data);
-
- gf_isom_sample_del(&p_mp4->p_sample);
- }
-
- if (p_mp4->p_file)
- {
- recompute_bitrate_mp4(p_mp4->p_file, p_mp4->i_track);
- gf_isom_set_pl_indication(p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15);
- gf_isom_set_storage_mode(p_mp4->p_file, GF_ISOM_STORE_FLAT);
- gf_isom_close(p_mp4->p_file);
- }
-
- free(p_mp4);
-
- return 0;
-}
-
-int open_file_mp4( char *psz_filename, hnd_t *p_handle )
-{
- mp4_t *p_mp4;
-
- *p_handle = NULL;
-
- if ((p_mp4 = (mp4_t *)malloc(sizeof(mp4_t))) == NULL)
- return -1;
-
- memset(p_mp4, 0, sizeof(mp4_t));
- p_mp4->p_file = gf_isom_open(psz_filename, GF_ISOM_OPEN_WRITE, NULL);
-
- if ((p_mp4->p_sample = gf_isom_sample_new()) == NULL)
- {
- close_file_mp4( p_mp4 );
- return -1;
- }
-
- gf_isom_set_brand_info(p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0);
-
- *p_handle = p_mp4;
-
- return 0;
-}
-
-
-int set_param_mp4( hnd_t handle, x264_param_t *p_param )
-{
- mp4_t *p_mp4 = (mp4_t *)handle;
-
- p_mp4->i_track = gf_isom_new_track(p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL,
- p_param->i_fps_num);
-
- p_mp4->p_config = gf_odf_avc_cfg_new();
- gf_isom_avc_config_new(p_mp4->p_file, p_mp4->i_track, p_mp4->p_config,
- NULL, NULL, &p_mp4->i_descidx);
-
- gf_isom_set_track_enabled(p_mp4->p_file, p_mp4->i_track, 1);
-
- gf_isom_set_visual_info(p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx,
- p_param->i_width, p_param->i_height);
-
- if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
- {
- uint64_t dw = p_param->i_width << 16;
- uint64_t dh = p_param->i_height << 16;
- double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
- if( sar > 1.0 )
- dw *= sar ;
- else
- dh /= sar;
- gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
- }
-
- p_mp4->p_sample->data = (char *)malloc(p_param->i_width * p_param->i_height * 3 / 2);
- if (p_mp4->p_sample->data == NULL)
- return -1;
-
- p_mp4->i_time_res = p_param->i_fps_num;
- p_mp4->i_time_inc = p_param->i_fps_den;
- p_mp4->i_init_delay = p_param->i_bframe ? (p_param->b_bframe_pyramid ? 2 : 1) : 0;
- p_mp4->i_init_delay *= p_mp4->i_time_inc;
- fprintf(stderr, "mp4 [info]: initial delay %d (scale %d)\n",
- p_mp4->i_init_delay, p_mp4->i_time_res);
-
- return 0;
-}
-
-
-int write_nalu_mp4( hnd_t handle, uint8_t *p_nalu, int i_size )
-{
- mp4_t *p_mp4 = (mp4_t *)handle;
- GF_AVCConfigSlot *p_slot;
- uint8_t type = p_nalu[4] & 0x1f;
- int psize;
-
- switch(type)
- {
- // sps
- case 0x07:
- if (!p_mp4->b_sps)
- {
- p_mp4->p_config->configurationVersion = 1;
- p_mp4->p_config->AVCProfileIndication = p_nalu[5];
- p_mp4->p_config->profile_compatibility = p_nalu[6];
- p_mp4->p_config->AVCLevelIndication = p_nalu[7];
- p_slot = (GF_AVCConfigSlot *)malloc(sizeof(GF_AVCConfigSlot));
- p_slot->size = i_size - 4;
- p_slot->data = (char *)malloc(p_slot->size);
- memcpy(p_slot->data, p_nalu + 4, i_size - 4);
- gf_list_add(p_mp4->p_config->sequenceParameterSets, p_slot);
- p_slot = NULL;
- p_mp4->b_sps = 1;
- }
- break;
-
- // pps
- case 0x08:
- if (!p_mp4->b_pps)
- {
- p_slot = (GF_AVCConfigSlot *)malloc(sizeof(GF_AVCConfigSlot));
- p_slot->size = i_size - 4;
- p_slot->data = (char *)malloc(p_slot->size);
- memcpy(p_slot->data, p_nalu + 4, i_size - 4);
- gf_list_add(p_mp4->p_config->pictureParameterSets, p_slot);
- p_slot = NULL;
- p_mp4->b_pps = 1;
- if (p_mp4->b_sps)
- gf_isom_avc_config_update(p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config);
- }
- break;
-
- // slice, sei
- case 0x1:
- case 0x5:
- case 0x6:
- psize = i_size - 4 ;
- memcpy(p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size);
- p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 0] = (psize >> 24) & 0xff;
- p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 1] = (psize >> 16) & 0xff;
- p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 2] = (psize >> 8) & 0xff;
- p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 3] = (psize >> 0) & 0xff;
- p_mp4->p_sample->dataLength += i_size;
- break;
- }
-
- return i_size;
-}
-
-int set_eop_mp4( hnd_t handle, x264_picture_t *p_picture )
-{
- mp4_t *p_mp4 = (mp4_t *)handle;
- uint64_t dts = (uint64_t)p_mp4->i_numframe * p_mp4->i_time_inc;
- uint64_t pts = (uint64_t)p_picture->i_pts;
- int32_t offset = p_mp4->i_init_delay + pts - dts;
-
- p_mp4->p_sample->IsRAP = p_picture->i_type == X264_TYPE_IDR ? 1 : 0;
- p_mp4->p_sample->DTS = dts;
- p_mp4->p_sample->CTS_Offset = offset;
- gf_isom_add_sample(p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_mp4->p_sample);
-
- p_mp4->p_sample->dataLength = 0;
- p_mp4->i_numframe++;
-
- return 0;
-}
-
-#endif
-
-
-/* -- mkv muxing support ------------------------------------------------- */
-typedef struct
-{
- mk_Writer *w;
-
- uint8_t *sps, *pps;
- int sps_len, pps_len;
-
- int width, height, d_width, d_height;
-
- int64_t frame_duration;
- int fps_num;
-
- int b_header_written;
- char b_writing_frame;
-} mkv_t;
-
-static int write_header_mkv( mkv_t *p_mkv )
-{
- int ret;
- uint8_t *avcC;
- int avcC_len;
-
- if( p_mkv->sps == NULL || p_mkv->pps == NULL ||
- p_mkv->width == 0 || p_mkv->height == 0 ||
- p_mkv->d_width == 0 || p_mkv->d_height == 0)
- return -1;
-
- avcC_len = 5 + 1 + 2 + p_mkv->sps_len + 1 + 2 + p_mkv->pps_len;
- avcC = malloc(avcC_len);
- if (avcC == NULL)
- return -1;
-
- avcC[0] = 1;
- avcC[1] = p_mkv->sps[1];
- avcC[2] = p_mkv->sps[2];
- avcC[3] = p_mkv->sps[3];
- avcC[4] = 0xff; // nalu size length is four bytes
- avcC[5] = 0xe1; // one sps
-
- avcC[6] = p_mkv->sps_len >> 8;
- avcC[7] = p_mkv->sps_len;
-
- memcpy(avcC+8, p_mkv->sps, p_mkv->sps_len);
-
- avcC[8+p_mkv->sps_len] = 1; // one pps
- avcC[9+p_mkv->sps_len] = p_mkv->pps_len >> 8;
- avcC[10+p_mkv->sps_len] = p_mkv->pps_len;
-
- memcpy( avcC+11+p_mkv->sps_len, p_mkv->pps, p_mkv->pps_len );
-
- ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
- avcC, avcC_len, p_mkv->frame_duration, 50000,
- p_mkv->width, p_mkv->height,
- p_mkv->d_width, p_mkv->d_height );
-
- free( avcC );
-
- p_mkv->b_header_written = 1;
-
- return ret;
-}
-
-int open_file_mkv( char *psz_filename, hnd_t *p_handle )
-{
- mkv_t *p_mkv;
-
- *p_handle = NULL;
-
- p_mkv = malloc(sizeof(*p_mkv));
- if (p_mkv == NULL)
- return -1;
-
- memset(p_mkv, 0, sizeof(*p_mkv));
-
- p_mkv->w = mk_createWriter(psz_filename);
- if (p_mkv->w == NULL)
- {
- free(p_mkv);
- return -1;
- }
-
- *p_handle = p_mkv;
-
- return 0;
-}
-
-int set_param_mkv( hnd_t handle, x264_param_t *p_param )
-{
- mkv_t *p_mkv = handle;
- int64_t dw, dh;
-
- if( p_param->i_fps_num > 0 )
- {
- p_mkv->frame_duration = (int64_t)p_param->i_fps_den *
- (int64_t)1000000000 / p_param->i_fps_num;
- p_mkv->fps_num = p_param->i_fps_num;
- }
- else
- {
- p_mkv->frame_duration = 0;
- p_mkv->fps_num = 1;
- }
-
- p_mkv->width = p_param->i_width;
- p_mkv->height = p_param->i_height;
-
- if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
- {
- dw = (int64_t)p_param->i_width * p_param->vui.i_sar_width;
- dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height;
- }
- else
- {
- dw = p_param->i_width;
- dh = p_param->i_height;
- }
-
- if( dw > 0 && dh > 0 )
- {
- int64_t x = gcd( dw, dh );
- dw /= x;
- dh /= x;
- }
-
- p_mkv->d_width = (int)dw;
- p_mkv->d_height = (int)dh;
-
- return 0;
-}
-
-int write_nalu_mkv( hnd_t handle, uint8_t *p_nalu, int i_size )
-{
- mkv_t *p_mkv = handle;
- uint8_t type = p_nalu[4] & 0x1f;
- uint8_t dsize[4];
- int psize;
-
- switch( type )
- {
- // sps
- case 0x07:
- if( !p_mkv->sps )
- {
- p_mkv->sps = malloc(i_size - 4);
- if (p_mkv->sps == NULL)
- return -1;
- p_mkv->sps_len = i_size - 4;
- memcpy(p_mkv->sps, p_nalu + 4, i_size - 4);
- }
- break;
-
- // pps
- case 0x08:
- if( !p_mkv->pps )
- {
- p_mkv->pps = malloc(i_size - 4);
- if (p_mkv->pps == NULL)
- return -1;
- p_mkv->pps_len = i_size - 4;
- memcpy(p_mkv->pps, p_nalu + 4, i_size - 4);
- }
- break;
-
- // slice, sei
- case 0x1:
- case 0x5:
- case 0x6:
- if( !p_mkv->b_writing_frame )
- {
- if( mk_startFrame(p_mkv->w) < 0 )
- return -1;
- p_mkv->b_writing_frame = 1;
- }
- psize = i_size - 4 ;
- dsize[0] = psize >> 24;
- dsize[1] = psize >> 16;
- dsize[2] = psize >> 8;
- dsize[3] = psize;
- if( mk_addFrameData(p_mkv->w, dsize, 4) < 0 ||
- mk_addFrameData(p_mkv->w, p_nalu + 4, i_size - 4) < 0 )
- return -1;
- break;
-
- default:
- break;
- }
-
- if( !p_mkv->b_header_written && p_mkv->pps && p_mkv->sps &&
- write_header_mkv(p_mkv) < 0 )
- return -1;
-
- return i_size;
-}
-
-int set_eop_mkv( hnd_t handle, x264_picture_t *p_picture )
-{
- mkv_t *p_mkv = handle;
- int64_t i_stamp = (int64_t)(p_picture->i_pts * 1e9 / p_mkv->fps_num);
-
- p_mkv->b_writing_frame = 0;
-
- return mk_setFrameFlags( p_mkv->w, i_stamp,
- p_picture->i_type == X264_TYPE_IDR );
-}
-
-int close_file_mkv( hnd_t handle )
-{
- mkv_t *p_mkv = handle;
- int ret;
-
- if( p_mkv->sps )
- free( p_mkv->sps );
- if( p_mkv->pps )
- free( p_mkv->pps );
-
- ret = mk_close(p_mkv->w);
-
- free( p_mkv );
-
- return ret;
-}
-
diff --git a/muxers.h b/muxers.h
index aaede1c..041dbbc 100644
--- a/muxers.h
+++ b/muxers.h
@@ -1,7 +1,7 @@
/*****************************************************************************
- * muxers.c: h264 file i/o plugins
+ * muxers.h: h264 file i/o modules
*****************************************************************************
- * Copyright (C) 2003-2008 x264 project
+ * Copyright (C) 2003-2009 x264 project
*
* Authors: Laurent Aimar <fenrir at via.ecp.fr>
* Loren Merritt <lorenm at u.washington.edu>
@@ -24,49 +24,33 @@
#ifndef X264_MUXERS_H
#define X264_MUXERS_H
-typedef void *hnd_t;
-
-int open_file_yuv( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_yuv( hnd_t handle );
-int read_frame_yuv( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_yuv( hnd_t handle );
-
-int open_file_y4m( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_y4m( hnd_t handle );
-int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_y4m( hnd_t handle );
-
-int open_file_avis( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_avis( hnd_t handle );
-int read_frame_avis( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_avis( hnd_t handle );
+#include "common/common.h"
+#include "x264.h"
-int open_file_thread( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_thread( hnd_t handle );
-int read_frame_thread( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_thread( hnd_t handle );
-
-int open_file_bsf( char *psz_filename, hnd_t *p_handle );
-int set_param_bsf( hnd_t handle, x264_param_t *p_param );
-int write_nalu_bsf( hnd_t handle, uint8_t *p_nal, int i_size );
-int set_eop_bsf( hnd_t handle, x264_picture_t *p_picture );
-int close_file_bsf( hnd_t handle );
-
-int open_file_mp4( char *psz_filename, hnd_t *p_handle );
-int set_param_mp4( hnd_t handle, x264_param_t *p_param );
-int write_nalu_mp4( hnd_t handle, uint8_t *p_nal, int i_size );
-int set_eop_mp4( hnd_t handle, x264_picture_t *p_picture );
-int close_file_mp4( hnd_t handle );
-
-int open_file_mkv( char *psz_filename, hnd_t *p_handle );
-int set_param_mkv( hnd_t handle, x264_param_t *p_param );
-int write_nalu_mkv( hnd_t handle, uint8_t *p_nal, int i_size );
-int set_eop_mkv( hnd_t handle, x264_picture_t *p_picture );
-int close_file_mkv( hnd_t handle );
+typedef void *hnd_t;
-extern int (*p_open_infile)( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-extern int (*p_get_frame_total)( hnd_t handle );
-extern int (*p_read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-extern int (*p_close_infile)( hnd_t handle );
+static inline int64_t gcd( int64_t a, int64_t b )
+{
+ while( 1 )
+ {
+ int64_t c = a % b;
+ if( !c )
+ return b;
+ a = b;
+ b = c;
+ }
+}
+
+static inline char *get_filename_extension( char *filename )
+{
+ char *ext = filename + strlen( filename );
+ while( *ext != '.' && ext > filename )
+ ext--;
+ ext += *ext == '.';
+ return ext;
+}
+
+#include "input/input.h"
+#include "output/output.h"
#endif
diff --git a/output/flv.c b/output/flv.c
new file mode 100644
index 0000000..b3e5d16
--- /dev/null
+++ b/output/flv.c
@@ -0,0 +1,308 @@
+/*****************************************************************************
+ * flv.c:
+ *****************************************************************************
+ * Copyright (C) 2009 Kieran Kunhya <kieran at kunhya.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "flv_bytestream.h"
+
+#define CHECK(x)\
+do {\
+ if( (x) < 0 )\
+ return -1;\
+} while( 0 )
+
+typedef struct
+{
+ flv_buffer *c;
+
+ uint8_t *sei;
+ int sei_len;
+
+ int64_t i_fps_num;
+ int64_t i_fps_den;
+ int64_t i_framenum;
+
+ uint64_t i_framerate_pos;
+ uint64_t i_duration_pos;
+ uint64_t i_filesize_pos;
+ uint64_t i_bitrate_pos;
+
+ uint8_t b_write_length;
+ int64_t i_prev_dts;
+ int64_t i_prev_pts;
+
+ int i_timebase_num;
+ int i_timebase_den;
+ int b_vfr_input;
+
+ unsigned start;
+} flv_hnd_t;
+
+static int write_header( flv_buffer *c )
+{
+ x264_put_tag( c, "FLV" ); // Signature
+ x264_put_byte( c, 1 ); // Version
+ x264_put_byte( c, 1 ); // Video Only
+ x264_put_be32( c, 9 ); // DataOffset
+ x264_put_be32( c, 0 ); // PreviousTagSize0
+
+ return flv_flush_data( c );
+}
+
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+ flv_hnd_t *p_flv = malloc( sizeof(*p_flv) );
+ *p_handle = NULL;
+ if( !p_flv )
+ return -1;
+ memset( p_flv, 0, sizeof(*p_flv) );
+
+ p_flv->c = flv_create_writer( psz_filename );
+ if( !p_flv->c )
+ return -1;
+
+ CHECK( write_header( p_flv->c ) );
+ *p_handle = p_flv;
+
+ return 0;
+}
+
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+ flv_hnd_t *p_flv = handle;
+ flv_buffer *c = p_flv->c;
+
+ x264_put_byte( c, FLV_TAG_TYPE_META ); // Tag Type "script data"
+
+ int start = c->d_cur;
+ x264_put_be24( c, 0 ); // data length
+ x264_put_be24( c, 0 ); // timestamp
+ x264_put_be32( c, 0 ); // reserved
+
+ x264_put_byte( c, AMF_DATA_TYPE_STRING );
+ x264_put_amf_string( c, "onMetaData" );
+
+ x264_put_byte( c, AMF_DATA_TYPE_MIXEDARRAY );
+ x264_put_be32( c, 7 );
+
+ x264_put_amf_string( c, "width" );
+ x264_put_amf_double( c, p_param->i_width );
+
+ x264_put_amf_string( c, "height" );
+ x264_put_amf_double( c, p_param->i_height );
+
+ x264_put_amf_string( c, "framerate" );
+
+ if( !p_param->b_vfr_input )
+ x264_put_amf_double( c, (double)p_param->i_fps_num / p_param->i_fps_den );
+ else
+ {
+ p_flv->i_framerate_pos = c->d_cur + c->d_total + 1;
+ x264_put_amf_double( c, 0 ); // written at end of encoding
+ }
+
+ x264_put_amf_string( c, "videocodecid" );
+ x264_put_amf_double( c, FLV_CODECID_H264 );
+
+ x264_put_amf_string( c, "duration" );
+ p_flv->i_duration_pos = c->d_cur + c->d_total + 1;
+ x264_put_amf_double( c, 0 ); // written at end of encoding
+
+ x264_put_amf_string( c, "filesize" );
+ p_flv->i_filesize_pos = c->d_cur + c->d_total + 1;
+ x264_put_amf_double( c, 0 ); // written at end of encoding
+
+ x264_put_amf_string( c, "videodatarate" );
+ p_flv->i_bitrate_pos = c->d_cur + c->d_total + 1;
+ x264_put_amf_double( c, 0 ); // written at end of encoding
+
+ x264_put_amf_string( c, "" );
+ x264_put_byte( c, AMF_END_OF_OBJECT );
+
+ unsigned length = c->d_cur - start;
+ rewrite_amf_be24( c, length - 10, start );
+
+ x264_put_be32( c, length + 1 ); // tag length
+
+ p_flv->i_fps_num = p_param->i_fps_num;
+ p_flv->i_fps_den = p_param->i_fps_den;
+ p_flv->i_timebase_num = p_param->i_timebase_num;
+ p_flv->i_timebase_den = p_param->i_timebase_den;
+ p_flv->b_vfr_input = p_param->b_vfr_input;
+
+ return 0;
+}
+
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+ flv_hnd_t *p_flv = handle;
+ flv_buffer *c = p_flv->c;
+
+ int sei_size = p_nal[0].i_payload;
+ int sps_size = p_nal[1].i_payload;
+ int pps_size = p_nal[2].i_payload;
+
+ // SEI
+ /* It is within the spec to write this as-is but for
+ * mplayer/ffmpeg playback this is deferred until before the first frame */
+
+ p_flv->sei = malloc( sei_size );
+ if( !p_flv->sei )
+ return -1;
+ p_flv->sei_len = sei_size;
+
+ memcpy( p_flv->sei, p_nal[0].p_payload, sei_size );
+
+ // SPS
+ uint8_t *sps = p_nal[1].p_payload + 4;
+
+ x264_put_byte( c, FLV_TAG_TYPE_VIDEO );
+ x264_put_be24( c, 0 ); // rewrite later
+ x264_put_be24( c, 0 ); // timestamp
+ x264_put_byte( c, 0 ); // timestamp extended
+ x264_put_be24( c, 0 ); // StreamID - Always 0
+ p_flv->start = c->d_cur; // needed for overwriting length
+
+ x264_put_byte( c, 7 | FLV_FRAME_KEY ); // Frametype and CodecID
+ x264_put_byte( c, 0 ); // AVC sequence header
+ x264_put_be24( c, 0 ); // composition time
+
+ x264_put_byte( c, 1 ); // version
+ x264_put_byte( c, sps[1] ); // profile
+ x264_put_byte( c, sps[2] ); // profile
+ x264_put_byte( c, sps[3] ); // level
+ x264_put_byte( c, 0xff ); // 6 bits reserved (111111) + 2 bits nal size length - 1 (11)
+ x264_put_byte( c, 0xe1 ); // 3 bits reserved (111) + 5 bits number of sps (00001)
+
+ x264_put_be16( c, sps_size - 4 );
+ flv_append_data( c, sps, sps_size - 4 );
+
+ // PPS
+ x264_put_byte( c, 1 ); // number of pps
+ x264_put_be16( c, pps_size - 4 );
+ flv_append_data( c, p_nal[2].p_payload + 4, pps_size - 4 );
+
+ // rewrite data length info
+ unsigned length = c->d_cur - p_flv->start;
+ rewrite_amf_be24( c, length, p_flv->start - 10 );
+ x264_put_be32( c, length + 11 ); // Last tag size
+ CHECK( flv_flush_data( c ) );
+
+ return sei_size + sps_size + pps_size;
+}
+
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+ flv_hnd_t *p_flv = handle;
+ flv_buffer *c = p_flv->c;
+
+ int64_t dts = (int64_t)( (p_picture->i_dts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+ int64_t cts = (int64_t)( (p_picture->i_pts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+ int64_t offset = cts - dts;
+
+ if( p_flv->i_framenum )
+ {
+ int64_t prev_dts = (int64_t)( (p_flv->i_prev_dts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+ int64_t prev_cts = (int64_t)( (p_flv->i_prev_pts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+ if( prev_dts == dts )
+ {
+ double fps = ((double)p_flv->i_timebase_den / p_flv->i_timebase_num) / (p_picture->i_dts - p_flv->i_prev_dts);
+ fprintf( stderr, "flv [warning]: duplicate DTS %"PRId64" generated by rounding\n"
+ " current internal decoding framerate: %.6f fps\n", dts, fps );
+ }
+ if( prev_cts == cts )
+ {
+ double fps = ((double)p_flv->i_timebase_den / p_flv->i_timebase_num) / (p_picture->i_pts - p_flv->i_prev_pts);
+ fprintf( stderr, "flv [warning]: duplicate CTS %"PRId64" generated by rounding\n"
+ " current internal composition framerate: %.6f fps\n", cts, fps );
+ }
+ }
+ p_flv->i_prev_dts = p_picture->i_dts;
+ p_flv->i_prev_pts = p_picture->i_pts;
+
+ // A new frame - write packet header
+ x264_put_byte( c, FLV_TAG_TYPE_VIDEO );
+ x264_put_be24( c, 0 ); // calculated later
+ x264_put_be24( c, dts );
+ x264_put_byte( c, dts >> 24 );
+ x264_put_be24( c, 0 );
+
+ p_flv->start = c->d_cur;
+ x264_put_byte( c, p_picture->b_keyframe ? FLV_FRAME_KEY : FLV_FRAME_INTER );
+ x264_put_byte( c, 1 ); // AVC NALU
+ x264_put_be24( c, offset );
+
+ if( p_flv->sei )
+ {
+ flv_append_data( c, p_flv->sei, p_flv->sei_len );
+ free( p_flv->sei );
+ p_flv->sei = NULL;
+ }
+ flv_append_data( c, p_nalu, i_size );
+
+ unsigned length = c->d_cur - p_flv->start;
+ rewrite_amf_be24( c, length, p_flv->start - 10 );
+ x264_put_be32( c, 11 + length ); // Last tag size
+ CHECK( flv_flush_data( c ) );
+
+ p_flv->i_framenum++;
+
+ return i_size;
+}
+
+static void rewrite_amf_double( FILE *fp, uint64_t position, double value )
+{
+ uint64_t x = endian_fix64( dbl2int( value ) );
+ fseek( fp, position, SEEK_SET );
+ fwrite( &x, 8, 1, fp );
+}
+
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+ flv_hnd_t *p_flv = handle;
+ flv_buffer *c = p_flv->c;
+
+ CHECK( flv_flush_data( c ) );
+
+ double total_duration = (double)(2 * largest_pts - second_largest_pts) * p_flv->i_timebase_num / p_flv->i_timebase_den;
+
+ if( x264_is_regular_file( c->fp ) )
+ {
+ double framerate;
+ uint64_t filesize = ftell( c->fp );
+
+ if( p_flv->i_framerate_pos )
+ {
+ framerate = (double)p_flv->i_framenum / total_duration;
+ rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate );
+ }
+
+ rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration );
+ rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize );
+ rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) );
+ }
+
+ fclose( c->fp );
+ free( p_flv );
+ free( c );
+
+ return 0;
+}
+
+cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/flv_bytestream.c b/output/flv_bytestream.c
new file mode 100644
index 0000000..316114c
--- /dev/null
+++ b/output/flv_bytestream.c
@@ -0,0 +1,150 @@
+/*****************************************************************************
+ * flv_bytestream.c:
+ *****************************************************************************
+ * Copyright (C) 2009 Kieran Kunhya
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "flv_bytestream.h"
+
+uint64_t dbl2int( double value )
+{
+ return (union {double f; uint64_t i;}){value}.i;
+}
+
+/* Put functions */
+
+void x264_put_byte( flv_buffer *c, uint8_t b )
+{
+ flv_append_data( c, &b, 1 );
+}
+
+void x264_put_be32( flv_buffer *c, uint32_t val )
+{
+ x264_put_byte( c, val >> 24 );
+ x264_put_byte( c, val >> 16 );
+ x264_put_byte( c, val >> 8 );
+ x264_put_byte( c, val );
+}
+
+void x264_put_be64( flv_buffer *c, uint64_t val )
+{
+ x264_put_be32( c, val >> 32 );
+ x264_put_be32( c, val );
+}
+
+void x264_put_be16( flv_buffer *c, uint16_t val )
+{
+ x264_put_byte( c, val >> 8 );
+ x264_put_byte( c, val );
+}
+
+void x264_put_be24( flv_buffer *c, uint32_t val )
+{
+ x264_put_be16( c, val >> 8 );
+ x264_put_byte( c, val );
+}
+
+void x264_put_tag( flv_buffer *c, const char *tag )
+{
+ while( *tag )
+ x264_put_byte( c, *tag++ );
+}
+
+void x264_put_amf_string( flv_buffer *c, const char *str )
+{
+ uint16_t len = strlen( str );
+ x264_put_be16( c, len );
+ flv_append_data( c, (uint8_t*)str, len );
+}
+
+void x264_put_amf_double( flv_buffer *c, double d )
+{
+ x264_put_byte( c, AMF_DATA_TYPE_NUMBER );
+ x264_put_be64( c, dbl2int( d ) );
+}
+
+/* flv writing functions */
+
+flv_buffer *flv_create_writer( const char *filename )
+{
+ flv_buffer *c = malloc( sizeof(*c) );
+
+ if( !c )
+ return NULL;
+ memset( c, 0, sizeof(*c) );
+
+ if( !strcmp( filename, "-" ) )
+ c->fp = stdout;
+ else
+ c->fp = fopen( filename, "wb" );
+ if( !c->fp )
+ {
+ free( c );
+ return NULL;
+ }
+
+ return c;
+}
+
+int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size )
+{
+ unsigned ns = c->d_cur + size;
+
+ if( ns > c->d_max )
+ {
+ void *dp;
+ unsigned dn = 16;
+ while( ns > dn )
+ dn <<= 1;
+
+ dp = realloc( c->data, dn );
+ if( !dp )
+ return -1;
+
+ c->data = dp;
+ c->d_max = dn;
+ }
+
+ memcpy( c->data + c->d_cur, data, size );
+
+ c->d_cur = ns;
+
+ return 0;
+}
+
+void rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start )
+{
+ *(c->data + start + 0) = length >> 16;
+ *(c->data + start + 1) = length >> 8;
+ *(c->data + start + 2) = length >> 0;
+}
+
+int flv_flush_data( flv_buffer *c )
+{
+ if( !c->d_cur )
+ return 0;
+
+ if( fwrite( c->data, c->d_cur, 1, c->fp ) != 1 )
+ return -1;
+
+ c->d_total += c->d_cur;
+
+ c->d_cur = 0;
+
+ return 0;
+}
diff --git a/output/flv_bytestream.h b/output/flv_bytestream.h
new file mode 100644
index 0000000..00f37fe
--- /dev/null
+++ b/output/flv_bytestream.h
@@ -0,0 +1,135 @@
+/*****************************************************************************
+ * flv_bytestream.h:
+ *****************************************************************************
+ * Copyright (C) 2009 Kieran Kunhya
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_FLV_BYTESTREAM_H
+#define X264_FLV_BYTESTREAM_H
+
+/* offsets for packed values */
+#define FLV_AUDIO_SAMPLESSIZE_OFFSET 1
+#define FLV_AUDIO_SAMPLERATE_OFFSET 2
+#define FLV_AUDIO_CODECID_OFFSET 4
+
+#define FLV_VIDEO_FRAMETYPE_OFFSET 4
+
+/* bitmasks to isolate specific values */
+#define FLV_AUDIO_CHANNEL_MASK 0x01
+#define FLV_AUDIO_SAMPLESIZE_MASK 0x02
+#define FLV_AUDIO_SAMPLERATE_MASK 0x0c
+#define FLV_AUDIO_CODECID_MASK 0xf0
+
+#define FLV_VIDEO_CODECID_MASK 0x0f
+#define FLV_VIDEO_FRAMETYPE_MASK 0xf0
+
+#define AMF_END_OF_OBJECT 0x09
+
+enum
+{
+ FLV_HEADER_FLAG_HASVIDEO = 1,
+ FLV_HEADER_FLAG_HASAUDIO = 4,
+};
+
+enum
+{
+ FLV_TAG_TYPE_AUDIO = 0x08,
+ FLV_TAG_TYPE_VIDEO = 0x09,
+ FLV_TAG_TYPE_META = 0x12,
+};
+
+enum
+{
+ FLV_MONO = 0,
+ FLV_STEREO = 1,
+};
+
+enum
+{
+ FLV_SAMPLESSIZE_8BIT = 0,
+ FLV_SAMPLESSIZE_16BIT = 1 << FLV_AUDIO_SAMPLESSIZE_OFFSET,
+};
+
+enum
+{
+ FLV_SAMPLERATE_SPECIAL = 0, /**< signifies 5512Hz and 8000Hz in the case of NELLYMOSER */
+ FLV_SAMPLERATE_11025HZ = 1 << FLV_AUDIO_SAMPLERATE_OFFSET,
+ FLV_SAMPLERATE_22050HZ = 2 << FLV_AUDIO_SAMPLERATE_OFFSET,
+ FLV_SAMPLERATE_44100HZ = 3 << FLV_AUDIO_SAMPLERATE_OFFSET,
+};
+
+enum
+{
+ FLV_CODECID_MP3 = 2 << FLV_AUDIO_CODECID_OFFSET,
+ FLV_CODECID_AAC = 10<< FLV_AUDIO_CODECID_OFFSET,
+};
+
+enum
+{
+ FLV_CODECID_H264 = 7,
+};
+
+enum
+{
+ FLV_FRAME_KEY = 1 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
+ FLV_FRAME_INTER = 2 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
+};
+
+typedef enum
+{
+ AMF_DATA_TYPE_NUMBER = 0x00,
+ AMF_DATA_TYPE_BOOL = 0x01,
+ AMF_DATA_TYPE_STRING = 0x02,
+ AMF_DATA_TYPE_OBJECT = 0x03,
+ AMF_DATA_TYPE_NULL = 0x05,
+ AMF_DATA_TYPE_UNDEFINED = 0x06,
+ AMF_DATA_TYPE_REFERENCE = 0x07,
+ AMF_DATA_TYPE_MIXEDARRAY = 0x08,
+ AMF_DATA_TYPE_OBJECT_END = 0x09,
+ AMF_DATA_TYPE_ARRAY = 0x0a,
+ AMF_DATA_TYPE_DATE = 0x0b,
+ AMF_DATA_TYPE_LONG_STRING = 0x0c,
+ AMF_DATA_TYPE_UNSUPPORTED = 0x0d,
+} AMFDataType;
+
+typedef struct flv_buffer
+{
+ uint8_t *data;
+ unsigned d_cur;
+ unsigned d_max;
+ FILE *fp;
+ uint64_t d_total;
+} flv_buffer;
+
+flv_buffer *flv_create_writer( const char *filename );
+int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size );
+int flv_write_byte( flv_buffer *c, uint8_t *byte );
+int flv_flush_data( flv_buffer *c );
+void rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start );
+
+uint64_t dbl2int( double value );
+uint64_t get_amf_double( double value );
+void x264_put_byte( flv_buffer *c, uint8_t b );
+void x264_put_be32( flv_buffer *c, uint32_t val );
+void x264_put_be64( flv_buffer *c, uint64_t val );
+void x264_put_be16( flv_buffer *c, uint16_t val );
+void x264_put_be24( flv_buffer *c, uint32_t val );
+void x264_put_tag( flv_buffer *c, const char *tag );
+void x264_put_amf_string( flv_buffer *c, const char *str );
+void x264_put_amf_double( flv_buffer *c, double d );
+
+#endif
diff --git a/output/matroska.c b/output/matroska.c
new file mode 100644
index 0000000..8e84f52
--- /dev/null
+++ b/output/matroska.c
@@ -0,0 +1,209 @@
+/*****************************************************************************
+ * matroska.c: x264 matroska output module
+ *****************************************************************************
+ * Copyright (C) 2005 Mike Matsnev
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "matroska_ebml.h"
+
+typedef struct
+{
+ mk_writer *w;
+
+ int width, height, d_width, d_height;
+
+ int64_t frame_duration;
+
+ char b_writing_frame;
+ int i_timebase_num;
+ int i_timebase_den;
+
+} mkv_hnd_t;
+
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+ mkv_hnd_t *p_mkv;
+
+ *p_handle = NULL;
+
+ p_mkv = malloc( sizeof(*p_mkv) );
+ if( !p_mkv )
+ return -1;
+
+ memset( p_mkv, 0, sizeof(*p_mkv) );
+
+ p_mkv->w = mk_create_writer( psz_filename );
+ if( !p_mkv->w )
+ {
+ free( p_mkv );
+ return -1;
+ }
+
+ *p_handle = p_mkv;
+
+ return 0;
+}
+
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+ mkv_hnd_t *p_mkv = handle;
+ int64_t dw, dh;
+
+ if( p_param->i_fps_num > 0 && !p_param->b_vfr_input )
+ {
+ p_mkv->frame_duration = (int64_t)p_param->i_fps_den *
+ (int64_t)1000000000 / p_param->i_fps_num;
+ }
+ else
+ {
+ p_mkv->frame_duration = 0;
+ }
+
+ p_mkv->width = p_param->i_width;
+ p_mkv->height = p_param->i_height;
+
+ if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
+ {
+ dw = (int64_t)p_param->i_width * p_param->vui.i_sar_width;
+ dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height;
+ }
+ else
+ {
+ dw = p_param->i_width;
+ dh = p_param->i_height;
+ }
+
+ if( dw > 0 && dh > 0 )
+ {
+ int64_t x = gcd( dw, dh );
+ dw /= x;
+ dh /= x;
+ }
+
+ p_mkv->d_width = (int)dw;
+ p_mkv->d_height = (int)dh;
+ p_mkv->i_timebase_num = p_param->i_timebase_num;
+ p_mkv->i_timebase_den = p_param->i_timebase_den;
+
+ return 0;
+}
+
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+ mkv_hnd_t *p_mkv = handle;
+
+ int sei_size = p_nal[0].i_payload;
+ int sps_size = p_nal[1].i_payload - 4;
+ int pps_size = p_nal[2].i_payload - 4;
+
+ uint8_t *sei = p_nal[0].p_payload;
+ uint8_t *sps = p_nal[1].p_payload + 4;
+ uint8_t *pps = p_nal[2].p_payload + 4;
+
+ int ret;
+ uint8_t *avcC;
+ int avcC_len;
+
+ if( !p_mkv->width || !p_mkv->height ||
+ !p_mkv->d_width || !p_mkv->d_height )
+ return -1;
+
+ avcC_len = 5 + 1 + 2 + sps_size + 1 + 2 + pps_size;
+ avcC = malloc( avcC_len );
+ if( !avcC )
+ return -1;
+
+ avcC[0] = 1;
+ avcC[1] = sps[1];
+ avcC[2] = sps[2];
+ avcC[3] = sps[3];
+ avcC[4] = 0xff; // nalu size length is four bytes
+ avcC[5] = 0xe1; // one sps
+
+ avcC[6] = sps_size >> 8;
+ avcC[7] = sps_size;
+
+ memcpy( avcC+8, sps, sps_size );
+
+ avcC[8+sps_size] = 1; // one pps
+ avcC[9+sps_size] = pps_size >> 8;
+ avcC[10+sps_size] = pps_size;
+
+ memcpy( avcC+11+sps_size, pps, pps_size );
+
+ ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
+ avcC, avcC_len, p_mkv->frame_duration, 50000,
+ p_mkv->width, p_mkv->height,
+ p_mkv->d_width, p_mkv->d_height );
+
+ free( avcC );
+
+ // SEI
+
+ if( !p_mkv->b_writing_frame )
+ {
+ if( mk_start_frame( p_mkv->w ) < 0 )
+ return -1;
+ p_mkv->b_writing_frame = 1;
+ }
+ if( mk_add_frame_data( p_mkv->w, sei, sei_size ) < 0 )
+ return -1;
+
+ return sei_size + sps_size + pps_size;
+}
+
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+ mkv_hnd_t *p_mkv = handle;
+
+ if( !p_mkv->b_writing_frame )
+ {
+ if( mk_start_frame( p_mkv->w ) < 0 )
+ return -1;
+ p_mkv->b_writing_frame = 1;
+ }
+
+ if( mk_add_frame_data( p_mkv->w, p_nalu, i_size ) < 0 )
+ return -1;
+
+ int64_t i_stamp = (int64_t)((p_picture->i_pts * 1e9 * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5);
+
+ p_mkv->b_writing_frame = 0;
+
+ if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe ) < 0 )
+ return -1;
+
+ return i_size;
+}
+
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+ mkv_hnd_t *p_mkv = handle;
+ int ret;
+ int64_t i_last_delta;
+
+ i_last_delta = (int64_t)(((largest_pts - second_largest_pts) * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5);
+
+ ret = mk_close( p_mkv->w, i_last_delta );
+
+ free( p_mkv );
+
+ return ret;
+}
+
+cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
new file mode 100644
index 0000000..d1c6e13
--- /dev/null
+++ b/output/matroska_ebml.c
@@ -0,0 +1,562 @@
+/*****************************************************************************
+ * matroska_ebml.c:
+ *****************************************************************************
+ * Copyright (C) 2005 Mike Matsnev
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "matroska_ebml.h"
+
+#define CLSIZE 1048576
+#define CHECK(x)\
+do {\
+ if( (x) < 0 )\
+ return -1;\
+} while( 0 )
+
+struct mk_context
+{
+ struct mk_context *next, **prev, *parent;
+ mk_writer *owner;
+ unsigned id;
+
+ void *data;
+ unsigned d_cur, d_max;
+};
+
+typedef struct mk_context mk_context;
+
+struct mk_writer
+{
+ FILE *fp;
+
+ unsigned duration_ptr;
+
+ mk_context *root, *cluster, *frame;
+ mk_context *freelist;
+ mk_context *actlist;
+
+ int64_t def_duration;
+ int64_t timescale;
+ int64_t cluster_tc_scaled;
+ int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
+
+ char wrote_header, in_frame, keyframe;
+};
+
+static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
+{
+ mk_context *c;
+
+ if( w->freelist )
+ {
+ c = w->freelist;
+ w->freelist = w->freelist->next;
+ }
+ else
+ {
+ c = malloc( sizeof(*c) );
+ if( !c )
+ return NULL;
+ memset( c, 0, sizeof(*c) );
+ }
+
+ c->parent = parent;
+ c->owner = w;
+ c->id = id;
+
+ if( c->owner->actlist )
+ c->owner->actlist->prev = &c->next;
+ c->next = c->owner->actlist;
+ c->prev = &c->owner->actlist;
+ c->owner->actlist = c;
+
+ return c;
+}
+
+static int mk_append_context_data( mk_context *c, const void *data, unsigned size )
+{
+ unsigned ns = c->d_cur + size;
+
+ if( ns > c->d_max )
+ {
+ void *dp;
+ unsigned dn = c->d_max ? c->d_max << 1 : 16;
+ while( ns > dn )
+ dn <<= 1;
+
+ dp = realloc( c->data, dn );
+ if( !dp )
+ return -1;
+
+ c->data = dp;
+ c->d_max = dn;
+ }
+
+ memcpy( (char*)c->data + c->d_cur, data, size );
+
+ c->d_cur = ns;
+
+ return 0;
+}
+
+static int mk_write_id( mk_context *c, unsigned id )
+{
+ unsigned char c_id[4] = { id >> 24, id >> 16, id >> 8, id };
+
+ if( c_id[0] )
+ return mk_append_context_data( c, c_id, 4 );
+ if( c_id[1] )
+ return mk_append_context_data( c, c_id+1, 3 );
+ if( c_id[2] )
+ return mk_append_context_data( c, c_id+2, 2 );
+ return mk_append_context_data( c, c_id+3, 1 );
+}
+
+static int mk_write_size( mk_context *c, unsigned size )
+{
+ unsigned char c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };
+
+ if( size < 0x7f )
+ {
+ c_size[4] |= 0x80;
+ return mk_append_context_data( c, c_size+4, 1 );
+ }
+ if( size < 0x3fff )
+ {
+ c_size[3] |= 0x40;
+ return mk_append_context_data( c, c_size+3, 2 );
+ }
+ if( size < 0x1fffff )
+ {
+ c_size[2] |= 0x20;
+ return mk_append_context_data( c, c_size+2, 3 );
+ }
+ if( size < 0x0fffffff )
+ {
+ c_size[1] |= 0x10;
+ return mk_append_context_data( c, c_size+1, 4 );
+ }
+ return mk_append_context_data( c, c_size, 5 );
+}
+
+static int mk_flush_context_id( mk_context *c )
+{
+ unsigned char ff = 0xff;
+
+ if( !c->id )
+ return 0;
+
+ CHECK( mk_write_id( c->parent, c->id ) );
+ CHECK( mk_append_context_data( c->parent, &ff, 1 ) );
+
+ c->id = 0;
+
+ return 0;
+}
+
+static int mk_flush_context_data( mk_context *c )
+{
+ if( !c->d_cur )
+ return 0;
+
+ if( c->parent )
+ CHECK( mk_append_context_data( c->parent, c->data, c->d_cur ) );
+ else if( fwrite( c->data, c->d_cur, 1, c->owner->fp ) != 1 )
+ return -1;
+
+ c->d_cur = 0;
+
+ return 0;
+}
+
+static int mk_close_context( mk_context *c, unsigned *off )
+{
+ if( c->id )
+ {
+ CHECK( mk_write_id( c->parent, c->id ) );
+ CHECK( mk_write_size( c->parent, c->d_cur ) );
+ }
+
+ if( c->parent && off )
+ *off += c->parent->d_cur;
+
+ CHECK( mk_flush_context_data( c ) );
+
+ if( c->next )
+ c->next->prev = c->prev;
+ *(c->prev) = c->next;
+ c->next = c->owner->freelist;
+ c->owner->freelist = c;
+
+ return 0;
+}
+
+static void mk_destroy_contexts( mk_writer *w )
+{
+ mk_context *cur, *next;
+
+ for( cur = w->freelist; cur; cur = next )
+ {
+ next = cur->next;
+ free( cur->data );
+ free( cur );
+ }
+
+ for( cur = w->actlist; cur; cur = next )
+ {
+ next = cur->next;
+ free( cur->data );
+ free( cur );
+ }
+
+ w->freelist = w->actlist = w->root = NULL;
+}
+
+static int mk_write_string( mk_context *c, unsigned id, const char *str )
+{
+ size_t len = strlen( str );
+
+ CHECK( mk_write_id( c, id ) );
+ CHECK( mk_write_size( c, len ) );
+ CHECK( mk_append_context_data( c, str, len ) );
+ return 0;
+}
+
+static int mk_write_bin( mk_context *c, unsigned id, const void *data, unsigned size )
+{
+ CHECK( mk_write_id( c, id ) );
+ CHECK( mk_write_size( c, size ) );
+ CHECK( mk_append_context_data( c, data, size ) ) ;
+ return 0;
+}
+
+static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
+{
+ unsigned char c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
+ unsigned i = 0;
+
+ CHECK( mk_write_id( c, id ) );
+ while( i < 7 && !c_ui[i] )
+ ++i;
+ CHECK( mk_write_size( c, 8 - i ) );
+ CHECK( mk_append_context_data( c, c_ui+i, 8 - i ) );
+ return 0;
+}
+
+static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
+{
+ unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
+ unsigned i = 0;
+
+ CHECK( mk_write_id( c, id ) );
+ if( si < 0 )
+ while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
+ ++i;
+ else
+ while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80 ) )
+ ++i;
+ CHECK( mk_write_size( c, 8 - i ) );
+ CHECK( mk_append_context_data( c, c_si+i, 8 - i ) );
+ return 0;
+}
+
+static int mk_write_float_raw( mk_context *c, float f )
+{
+ union
+ {
+ float f;
+ unsigned u;
+ } u;
+ unsigned char c_f[4];
+
+ u.f = f;
+ c_f[0] = u.u >> 24;
+ c_f[1] = u.u >> 16;
+ c_f[2] = u.u >> 8;
+ c_f[3] = u.u;
+
+ return mk_append_context_data( c, c_f, 4 );
+}
+
+static int mk_write_float( mk_context *c, unsigned id, float f )
+{
+ CHECK( mk_write_id( c, id ) );
+ CHECK( mk_write_size( c, 4 ) );
+ CHECK( mk_write_float_raw( c, f ) );
+ return 0;
+}
+
+static unsigned mk_ebml_size_size( unsigned s )
+{
+ if( s < 0x7f )
+ return 1;
+ if( s < 0x3fff )
+ return 2;
+ if( s < 0x1fffff )
+ return 3;
+ if( s < 0x0fffffff )
+ return 4;
+ return 5;
+}
+
+static unsigned mk_ebml_sint_size( int64_t si )
+{
+ unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
+ unsigned i = 0;
+
+ if( si < 0 )
+ while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
+ ++i;
+ else
+ while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80) )
+ ++i;
+
+ return 8 - i;
+}
+
+mk_writer *mk_create_writer( const char *filename )
+{
+ mk_writer *w = malloc( sizeof(*w) );
+ if( !w )
+ return NULL;
+
+ memset( w, 0, sizeof(*w) );
+
+ w->root = mk_create_context( w, NULL, 0 );
+ if( !w->root )
+ {
+ free( w );
+ return NULL;
+ }
+
+ if( !strcmp( filename, "-" ) )
+ w->fp = stdout;
+ else
+ w->fp = fopen( filename, "wb" );
+ if( !w->fp )
+ {
+ mk_destroy_contexts( w );
+ free( w );
+ return NULL;
+ }
+
+ w->timescale = 1000000;
+
+ return w;
+}
+
+int mk_writeHeader( mk_writer *w, const char *writing_app,
+ const char *codec_id,
+ const void *codec_private, unsigned codec_private_size,
+ int64_t default_frame_duration,
+ int64_t timescale,
+ unsigned width, unsigned height,
+ unsigned d_width, unsigned d_height )
+{
+ mk_context *c, *ti, *v;
+
+ if( w->wrote_header )
+ return -1;
+
+ w->timescale = timescale;
+ w->def_duration = default_frame_duration;
+
+ if( !(c = mk_create_context( w, w->root, 0x1a45dfa3 )) ) // EBML
+ return -1;
+ CHECK( mk_write_uint( c, 0x4286, 1 ) ); // EBMLVersion
+ CHECK( mk_write_uint( c, 0x42f7, 1 ) ); // EBMLReadVersion
+ CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
+ CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
+ CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
+ CHECK( mk_write_uint( c, 0x4287, 1 ) ); // DocTypeVersion
+ CHECK( mk_write_uint( c, 0x4285, 1 ) ); // DocTypeReadversion
+ CHECK( mk_close_context( c, 0 ) );
+
+ if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment
+ return -1;
+ CHECK( mk_flush_context_id( c ) );
+ CHECK( mk_close_context( c, 0 ) );
+
+ if( !(c = mk_create_context( w, w->root, 0x1549a966 )) ) // SegmentInfo
+ return -1;
+ CHECK( mk_write_string( c, 0x4d80, "Haali Matroska Writer b0" ) );
+ CHECK( mk_write_string( c, 0x5741, writing_app ) );
+ CHECK( mk_write_uint( c, 0x2ad7b1, w->timescale ) );
+ CHECK( mk_write_float( c, 0x4489, 0) );
+ w->duration_ptr = c->d_cur - 4;
+ CHECK( mk_close_context( c, &w->duration_ptr ) );
+
+ if( !(c = mk_create_context( w, w->root, 0x1654ae6b )) ) // tracks
+ return -1;
+ if( !(ti = mk_create_context( w, c, 0xae )) ) // TrackEntry
+ return -1;
+ CHECK( mk_write_uint( ti, 0xd7, 1 ) ); // TrackNumber
+ CHECK( mk_write_uint( ti, 0x73c5, 1 ) ); // TrackUID
+ CHECK( mk_write_uint( ti, 0x83, 1 ) ); // TrackType
+ CHECK( mk_write_uint( ti, 0x9c, 0 ) ); // FlagLacing
+ CHECK( mk_write_string( ti, 0x86, codec_id ) ); // codec_id
+ if( codec_private_size )
+ CHECK( mk_write_bin( ti, 0x63a2, codec_private, codec_private_size ) ); // codec_private
+ if( default_frame_duration )
+ CHECK( mk_write_uint( ti, 0x23e383, default_frame_duration ) ); // DefaultDuration
+
+ if( !(v = mk_create_context( w, ti, 0xe0 ) ) ) // Video
+ return -1;
+ CHECK( mk_write_uint( v, 0xb0, width ) );
+ CHECK( mk_write_uint( v, 0xba, height ) );
+ CHECK( mk_write_uint( v, 0x54b0, d_width ) );
+ CHECK( mk_write_uint( v, 0x54ba, d_height ) );
+ CHECK( mk_close_context( v, 0 ) );
+
+ CHECK( mk_close_context( ti, 0 ) );
+
+ CHECK( mk_close_context( c, 0 ) );
+
+ CHECK( mk_flush_context_data( w->root ) );
+
+ w->wrote_header = 1;
+
+ return 0;
+}
+
+static int mk_close_cluster( mk_writer *w )
+{
+ if( w->cluster == NULL )
+ return 0;
+ CHECK( mk_close_context( w->cluster, 0 ) );
+ w->cluster = NULL;
+ CHECK( mk_flush_context_data( w->root ) );
+ return 0;
+}
+
+static int mk_flush_frame( mk_writer *w )
+{
+ int64_t delta, ref = 0;
+ unsigned fsize, bgsize;
+ unsigned char c_delta_flags[3];
+
+ if( !w->in_frame )
+ return 0;
+
+ delta = w->frame_tc/w->timescale - w->cluster_tc_scaled;
+ if( delta > 32767ll || delta < -32768ll )
+ CHECK( mk_close_cluster( w ) );
+
+ if( !w->cluster )
+ {
+ w->cluster_tc_scaled = w->frame_tc / w->timescale;
+ w->cluster = mk_create_context( w, w->root, 0x1f43b675 ); // Cluster
+ if( !w->cluster )
+ return -1;
+
+ CHECK( mk_write_uint( w->cluster, 0xe7, w->cluster_tc_scaled ) ); // Timecode
+
+ delta = 0;
+ }
+
+ fsize = w->frame ? w->frame->d_cur : 0;
+ bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1;
+ if( !w->keyframe )
+ {
+ ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
+ bgsize += 1 + 1 + mk_ebml_sint_size( ref );
+ }
+
+ CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
+ CHECK( mk_write_size( w->cluster, bgsize ) );
+ CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
+ CHECK( mk_write_size( w->cluster, fsize + 4 ) );
+ CHECK( mk_write_size( w->cluster, 1 ) ); // track number
+
+ c_delta_flags[0] = delta >> 8;
+ c_delta_flags[1] = delta;
+ c_delta_flags[2] = 0;
+ CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
+ if( w->frame )
+ {
+ CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
+ w->frame->d_cur = 0;
+ }
+ if( !w->keyframe )
+ CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
+
+ w->in_frame = 0;
+ w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
+
+ if( w->cluster->d_cur > CLSIZE )
+ CHECK( mk_close_cluster( w ) );
+
+ return 0;
+}
+
+int mk_start_frame( mk_writer *w )
+{
+ if( mk_flush_frame( w ) < 0 )
+ return -1;
+
+ w->in_frame = 1;
+ w->keyframe = 0;
+
+ return 0;
+}
+
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
+{
+ if( !w->in_frame )
+ return -1;
+
+ w->frame_tc = timestamp;
+ w->keyframe = keyframe != 0;
+
+ if( w->max_frame_tc < timestamp )
+ w->max_frame_tc = timestamp;
+
+ return 0;
+}
+
+int mk_add_frame_data( mk_writer *w, const void *data, unsigned size )
+{
+ if( !w->in_frame )
+ return -1;
+
+ if( !w->frame )
+ if( !(w->frame = mk_create_context( w, NULL, 0 )) )
+ return -1;
+
+ return mk_append_context_data( w->frame, data, size );
+}
+
+int mk_close( mk_writer *w, int64_t last_delta )
+{
+ int ret = 0;
+ if( mk_flush_frame( w ) < 0 || mk_close_cluster( w ) < 0 )
+ ret = -1;
+ if( w->wrote_header && x264_is_regular_file( w->fp ) )
+ {
+ fseek( w->fp, w->duration_ptr, SEEK_SET );
+ int64_t last_frametime = w->def_duration ? w->def_duration : last_delta;
+ int64_t total_duration = w->max_frame_tc+last_frametime;
+ if( mk_write_float_raw( w->root, (float)((double)total_duration / w->timescale) ) < 0 ||
+ mk_flush_context_data( w->root ) < 0 )
+ ret = -1;
+ }
+ mk_destroy_contexts( w );
+ fclose( w->fp );
+ free( w );
+ return ret;
+}
diff --git a/matroska.h b/output/matroska_ebml.h
similarity index 56%
rename from matroska.h
rename to output/matroska_ebml.h
index be6f530..252e781 100644
--- a/matroska.h
+++ b/output/matroska_ebml.h
@@ -1,5 +1,5 @@
/*****************************************************************************
- * matroska.h:
+ * matroska_ebml.h:
*****************************************************************************
* Copyright (C) 2005 Mike Matsnev
*
@@ -18,24 +18,24 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifndef X264_MATROSKA_H
-#define X264_MATROSKA_H
+#ifndef X264_MATROSKA_EBML_H
+#define X264_MATROSKA_EBML_H
-typedef struct mk_Writer mk_Writer;
+typedef struct mk_writer mk_writer;
-mk_Writer *mk_createWriter( const char *filename );
+mk_writer *mk_create_writer( const char *filename );
-int mk_writeHeader( mk_Writer *w, const char *writingApp,
- const char *codecID,
- const void *codecPrivate, unsigned codecPrivateSize,
- int64_t default_frame_duration,
- int64_t timescale,
- unsigned width, unsigned height,
- unsigned d_width, unsigned d_height );
+int mk_writeHeader( mk_writer *w, const char *writing_app,
+ const char *codec_id,
+ const void *codec_private, unsigned codec_private_size,
+ int64_t default_frame_duration,
+ int64_t timescale,
+ unsigned width, unsigned height,
+ unsigned d_width, unsigned d_height );
-int mk_startFrame( mk_Writer *w );
-int mk_addFrameData( mk_Writer *w, const void *data, unsigned size );
-int mk_setFrameFlags( mk_Writer *w, int64_t timestamp, int keyframe );
-int mk_close( mk_Writer *w );
+int mk_start_frame( mk_writer *w );
+int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
+int mk_close( mk_writer *w, int64_t last_delta );
#endif
diff --git a/output/mp4.c b/output/mp4.c
new file mode 100644
index 0000000..e3ad9c6
--- /dev/null
+++ b/output/mp4.c
@@ -0,0 +1,300 @@
+/*****************************************************************************
+ * mp4.c: x264 mp4 output module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include <gpac/isomedia.h>
+
+typedef struct
+{
+ GF_ISOFile *p_file;
+ GF_AVCConfig *p_config;
+ GF_ISOSample *p_sample;
+ int i_track;
+ uint32_t i_descidx;
+ int i_time_res;
+ int64_t i_time_inc;
+ int i_numframe;
+ int i_delay_time;
+} mp4_hnd_t;
+
+static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track )
+{
+ u32 i, count, di, timescale, time_wnd, rate;
+ u64 offset;
+ Double br;
+ GF_ESD *esd;
+
+ esd = gf_isom_get_esd( p_file, i_track, 1 );
+ if( !esd )
+ return;
+
+ esd->decoderConfig->avgBitrate = 0;
+ esd->decoderConfig->maxBitrate = 0;
+ rate = time_wnd = 0;
+
+ timescale = gf_isom_get_media_timescale( p_file, i_track );
+ count = gf_isom_get_sample_count( p_file, i_track );
+ for( i = 0; i < count; i++ )
+ {
+ GF_ISOSample *samp = gf_isom_get_sample_info( p_file, i_track, i+1, &di, &offset );
+ if( !samp )
+ {
+ fprintf( stderr, "mp4 [error]: failure reading back frame %u\n", i );
+ break;
+ }
+
+ if( esd->decoderConfig->bufferSizeDB < samp->dataLength )
+ esd->decoderConfig->bufferSizeDB = samp->dataLength;
+
+ esd->decoderConfig->avgBitrate += samp->dataLength;
+ rate += samp->dataLength;
+ if( samp->DTS > time_wnd + timescale )
+ {
+ if( rate > esd->decoderConfig->maxBitrate )
+ esd->decoderConfig->maxBitrate = rate;
+ time_wnd = samp->DTS;
+ rate = 0;
+ }
+
+ gf_isom_sample_del( &samp );
+ }
+
+ br = (Double)(s64)gf_isom_get_media_duration( p_file, i_track );
+ br /= timescale;
+ esd->decoderConfig->avgBitrate = (u32)(esd->decoderConfig->avgBitrate / br);
+ /*move to bps*/
+ esd->decoderConfig->avgBitrate *= 8;
+ esd->decoderConfig->maxBitrate *= 8;
+
+ gf_isom_change_mpeg4_description( p_file, i_track, 1, esd );
+ gf_odf_desc_del( (GF_Descriptor*)esd );
+}
+
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+ mp4_hnd_t *p_mp4 = handle;
+ uint64_t total_duration = 0;
+
+ if( !p_mp4 )
+ return 0;
+
+ if( p_mp4->p_config )
+ gf_odf_avc_cfg_del( p_mp4->p_config );
+
+ if( p_mp4->p_sample )
+ {
+ if( p_mp4->p_sample->data )
+ free( p_mp4->p_sample->data );
+
+ gf_isom_sample_del( &p_mp4->p_sample );
+ }
+
+ if( p_mp4->p_file )
+ {
+ /* The mdhd duration is defined as CTS[final] - CTS[0] + duration of last frame.
+ * The mdhd duration (in seconds) should be able to be longer than the tkhd duration since the track is managed by edts.
+ * So, if mdhd duration is equal to the last DTS or less, we give the last composition time delta to the last sample duration.
+ * And then, the mdhd duration is updated, but it time-wise doesn't give the actual duration.
+ * The tkhd duration is the actual track duration. */
+ uint64_t mdhd_duration = (2 * largest_pts - second_largest_pts - p_mp4->i_delay_time) * p_mp4->i_time_inc;
+ total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
+ if( mdhd_duration != total_duration )
+ {
+ uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
+ uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
+ gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
+ total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
+ }
+
+ /* Write an Edit Box if the first CTS offset is positive.
+ * A media_time is given by not the mvhd timescale but rather the mdhd timescale.
+ * The reason is that an Edit Box maps the presentation time-line to the media time-line.
+ * Any demuxers should follow the Edit Box if it exists. */
+ GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
+ if( sample->CTS_Offset > 0 )
+ {
+ uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
+ uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
+ gf_isom_append_edit_segment( p_mp4->p_file, p_mp4->i_track, tkhd_duration, sample->CTS_Offset, GF_ISOM_EDIT_NORMAL );
+ }
+ gf_isom_sample_del( &sample );
+
+ recompute_bitrate_mp4( p_mp4->p_file, p_mp4->i_track );
+ gf_isom_set_pl_indication( p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15 );
+ gf_isom_set_storage_mode( p_mp4->p_file, GF_ISOM_STORE_FLAT );
+ gf_isom_close( p_mp4->p_file );
+ }
+
+ free( p_mp4 );
+
+ return 0;
+}
+
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+ mp4_hnd_t *p_mp4;
+
+ *p_handle = NULL;
+ FILE *fh = fopen( psz_filename, "w" );
+ if( !fh )
+ return -1;
+ else if( !x264_is_regular_file( fh ) )
+ {
+ fprintf( stderr, "mp4 [error]: MP4 output is incompatible with non-regular file `%s'\n", psz_filename );
+ return -1;
+ }
+ fclose( fh );
+
+ if( !(p_mp4 = malloc( sizeof(mp4_hnd_t) )) )
+ return -1;
+
+ memset( p_mp4, 0, sizeof(mp4_hnd_t) );
+ p_mp4->p_file = gf_isom_open( psz_filename, GF_ISOM_OPEN_WRITE, NULL );
+
+ if( !(p_mp4->p_sample = gf_isom_sample_new()) )
+ {
+ close_file( p_mp4, 0, 0 );
+ return -1;
+ }
+
+ gf_isom_set_brand_info( p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0 );
+
+ *p_handle = p_mp4;
+
+ return 0;
+}
+
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+ mp4_hnd_t *p_mp4 = handle;
+
+ p_mp4->i_time_res = p_param->i_timebase_den;
+ p_mp4->i_time_inc = p_param->i_timebase_num;
+
+ p_mp4->i_track = gf_isom_new_track( p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL,
+ p_mp4->i_time_res );
+
+ p_mp4->p_config = gf_odf_avc_cfg_new();
+ gf_isom_avc_config_new( p_mp4->p_file, p_mp4->i_track, p_mp4->p_config,
+ NULL, NULL, &p_mp4->i_descidx );
+
+ gf_isom_set_track_enabled( p_mp4->p_file, p_mp4->i_track, 1 );
+
+ gf_isom_set_visual_info( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx,
+ p_param->i_width, p_param->i_height );
+
+ if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
+ {
+ uint64_t dw = p_param->i_width << 16;
+ uint64_t dh = p_param->i_height << 16;
+ double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
+ if( sar > 1.0 )
+ dw *= sar ;
+ else
+ dh /= sar;
+ gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
+ }
+
+ p_mp4->p_sample->data = malloc( p_param->i_width * p_param->i_height * 3 / 2 );
+ if( !p_mp4->p_sample->data )
+ return -1;
+
+ return 0;
+}
+
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+ mp4_hnd_t *p_mp4 = handle;
+ GF_AVCConfigSlot *p_slot;
+
+ int sei_size = p_nal[0].i_payload;
+ int sps_size = p_nal[1].i_payload - 4;
+ int pps_size = p_nal[2].i_payload - 4;
+
+ uint8_t *sei = p_nal[0].p_payload;
+ uint8_t *sps = p_nal[1].p_payload + 4;
+ uint8_t *pps = p_nal[2].p_payload + 4;
+
+ // SPS
+
+ p_mp4->p_config->configurationVersion = 1;
+ p_mp4->p_config->AVCProfileIndication = sps[1];
+ p_mp4->p_config->profile_compatibility = sps[2];
+ p_mp4->p_config->AVCLevelIndication = sps[3];
+ p_slot = malloc( sizeof(GF_AVCConfigSlot) );
+ if( !p_slot )
+ return -1;
+ p_slot->size = sps_size;
+ p_slot->data = malloc( p_slot->size );
+ if( !p_slot->data )
+ return -1;
+ memcpy( p_slot->data, sps, sps_size );
+ gf_list_add( p_mp4->p_config->sequenceParameterSets, p_slot );
+
+ // PPS
+
+ p_slot = malloc( sizeof(GF_AVCConfigSlot) );
+ if( !p_slot )
+ return -1;
+ p_slot->size = pps_size;
+ p_slot->data = malloc( p_slot->size );
+ if( !p_slot->data )
+ return -1;
+ memcpy( p_slot->data, pps, pps_size );
+ gf_list_add( p_mp4->p_config->pictureParameterSets, p_slot );
+ gf_isom_avc_config_update( p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config );
+
+ // SEI
+
+ memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, sei, sei_size );
+ p_mp4->p_sample->dataLength += sei_size;
+
+ return sei_size + sps_size + pps_size;
+}
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+ mp4_hnd_t *p_mp4 = handle;
+ int64_t dts;
+ int64_t cts;
+
+ memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size );
+ p_mp4->p_sample->dataLength += i_size;
+
+ if( !p_mp4->i_numframe )
+ p_mp4->i_delay_time = p_picture->i_dts * -1;
+
+ dts = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
+ cts = (p_picture->i_pts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
+
+ p_mp4->p_sample->IsRAP = p_picture->b_keyframe;
+ p_mp4->p_sample->DTS = dts;
+ p_mp4->p_sample->CTS_Offset = (uint32_t)(cts - dts);
+ gf_isom_add_sample( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_mp4->p_sample );
+
+ p_mp4->p_sample->dataLength = 0;
+ p_mp4->i_numframe++;
+
+ return i_size;
+}
+
+cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/encoder/analyse.h b/output/output.h
similarity index 62%
copy from encoder/analyse.h
copy to output/output.h
index b8c828f..851b819 100644
--- a/encoder/analyse.h
+++ b/output/output.h
@@ -1,7 +1,7 @@
/*****************************************************************************
- * analyse.h: h264 encoder library
+ * output.h: x264 file output modules
*****************************************************************************
- * Copyright (C) 2003-2008 x264 project
+ * Copyright (C) 2003-2009 x264 project
*
* Authors: Laurent Aimar <fenrir at via.ecp.fr>
* Loren Merritt <lorenm at u.washington.edu>
@@ -21,10 +21,21 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifndef X264_ANALYSE_H
-#define X264_ANALYSE_H
+#ifndef X264_OUTPUT_H
+#define X264_OUTPUT_H
-void x264_macroblock_analyse( x264_t *h );
-void x264_slicetype_decide( x264_t *h );
+typedef struct
+{
+ int (*open_file)( char *psz_filename, hnd_t *p_handle );
+ int (*set_param)( hnd_t handle, x264_param_t *p_param );
+ int (*write_headers)( hnd_t handle, x264_nal_t *p_nal );
+ int (*write_frame)( hnd_t handle, uint8_t *p_nal, int i_size, x264_picture_t *p_picture );
+ int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
+} cli_output_t;
+
+extern cli_output_t raw_output;
+extern cli_output_t mkv_output;
+extern cli_output_t mp4_output;
+extern cli_output_t flv_output;
#endif
diff --git a/output/raw.c b/output/raw.c
new file mode 100644
index 0000000..a4d1175
--- /dev/null
+++ b/output/raw.c
@@ -0,0 +1,66 @@
+/*****************************************************************************
+ * raw.c: x264 raw bitstream output module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+ if( !strcmp( psz_filename, "-" ) )
+ *p_handle = stdout;
+ else if( !(*p_handle = fopen( psz_filename, "w+b" )) )
+ return -1;
+
+ return 0;
+}
+
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+ return 0;
+}
+
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+ int size = p_nal[0].i_payload + p_nal[1].i_payload + p_nal[2].i_payload;
+
+ if( fwrite( p_nal[0].p_payload, size, 1, (FILE*)handle ) )
+ return size;
+ return -1;
+}
+
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+ if( fwrite( p_nalu, i_size, 1, (FILE*)handle ) )
+ return i_size;
+ return -1;
+}
+
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+ if( !handle || handle == stdout )
+ return 0;
+
+ return fclose( (FILE*)handle );
+}
+
+cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
+
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4f34309..0bedc5b 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -30,6 +30,12 @@
#include "common/common.h"
#include "common/cpu.h"
+// GCC doesn't align stack variables on ARM, so use .bss
+#ifdef ARCH_ARM
+#undef ALIGNED_16
+#define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#endif
+
/* buf1, buf2: initialised to random data and shouldn't write into them */
uint8_t * buf1, * buf2;
/* buf3, buf4: used to store output */
@@ -76,17 +82,15 @@ static const char **intra_predict_8x8_names = intra_predict_4x4_names;
static inline uint32_t read_time(void)
{
+ uint32_t a = 0;
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
- uint32_t a;
asm volatile( "rdtsc" :"=a"(a) ::"edx" );
- return a;
#elif defined(ARCH_PPC)
- uint32_t a;
asm volatile( "mftb %0" : "=r" (a) );
- return a;
-#else
- return 0;
+#elif defined(ARCH_ARM) // ARMv7 only
+ asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
#endif
+ return a;
}
static bench_t* get_bench( const char *name, int cpu )
@@ -158,11 +162,14 @@ static void print_bench(void)
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
- b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
+ b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+ b->cpu&X264_CPU_NEON ? "neon" :
+ b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
- b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
+ b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+ b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -229,7 +236,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_8x8_filter_t predict_8x8_filter;
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_16( uint8_t edge[33] );
uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
int i, j;
@@ -337,16 +344,20 @@ static int check_pixel( int cpu_ref, int cpu_new )
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
- int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var[i], buf1, 16 ); \
- res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+ /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+ call_c1( pixel_c.var[i], buf1, 16 ); \
+ call_a1( pixel_asm.var[i], buf1, 16 ); \
+ uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
+ uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
- fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+ fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
} \
+ call_c2( pixel_c.var[i], buf1, 16 ); \
+ call_a2( pixel_asm.var[i], buf1, 16 ); \
}
ok = 1; used_asm = 0;
@@ -354,6 +365,23 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL_VAR( PIXEL_8x8 );
report( "pixel var :" );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
+ {
+ int res_c, res_asm, ssd_c, ssd_asm;
+ set_func_name( "var2_8x8" );
+ used_asm = 1;
+ res_c = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c );
+ res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm );
+ if( res_c != res_asm || ssd_c != ssd_asm )
+ {
+ ok = 0;
+ fprintf( stderr, "var[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm );
+ }
+ }
+
+ report( "pixel var2 :" );
+
for( i=0, ok=1, used_asm=0; i<4; i++ )
if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
{
@@ -362,6 +390,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
for( j=0; j<32; j++ )
{
uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256;
+ call_c1( pixel_c.hadamard_ac[i], buf1, 16 );
+ call_a1( pixel_asm.hadamard_ac[i], buf1, 16 );
uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
if( rc != ra )
@@ -414,7 +444,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
{
float res_c, res_a;
- DECLARE_ALIGNED_16( int sums[5][4] ) = {{0}};
+ ALIGNED_16( int sums[5][4] ) = {{0}};
used_asm = ok = 1;
x264_emms();
res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
@@ -439,8 +469,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
for( i=0; i<100 && ok; i++ )
if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
- DECLARE_ALIGNED_16( uint16_t sums[72] );
- DECLARE_ALIGNED_16( int dc[4] );
+ ALIGNED_16( uint16_t sums[72] );
+ ALIGNED_16( int dc[4] );
int16_t mvs_a[32], mvs_c[32];
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
@@ -476,10 +506,11 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, i, j, interlace;
- DECLARE_ALIGNED_16( int16_t dct1[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
+ ALIGNED_16( int16_t dct1[16][16] );
+ ALIGNED_16( int16_t dct2[16][16] );
+ ALIGNED_16( int16_t dct4[16][16] );
+ ALIGNED_16( int16_t dct8[4][64] );
+ ALIGNED_8( int16_t dctdc[2][4] );
x264_t h_buf;
x264_t *h = &h_buf;
@@ -490,6 +521,7 @@ static int check_dct( int cpu_ref, int cpu_new )
memset( h, 0, sizeof(*h) );
h->pps = h->pps_array;
x264_param_default( &h->param );
+ h->chroma_qp_table = i_chroma_qp_table + 12;
h->param.analyse.i_luma_deadzone[0] = 0;
h->param.analyse.i_luma_deadzone[1] = 0;
h->param.analyse.b_transform_8x8 = 1;
@@ -514,6 +546,7 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+ TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
report( "sub_dct4 :" );
@@ -581,9 +614,9 @@ static int check_dct( int cpu_ref, int cpu_new )
for( i=0; i<16 && ok; i++ )\
{\
for( j=0; j<16; j++ )\
- dct1[0][0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
- : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
- : ((*p++)&0x1fff)-0x1000; /* general case */\
+ dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
+ : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
+ : ((*p++)&0x1fff)-0x1000; /* general case */\
memcpy( dct2, dct1, 32 );\
call_c1( dct_c.name, dct1[0] );\
call_a1( dct_asm.name, dct2[0] );\
@@ -603,8 +636,8 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_function_t zigzag_ref;
x264_zigzag_function_t zigzag_asm;
- DECLARE_ALIGNED_16( int16_t level1[64] );
- DECLARE_ALIGNED_16( int16_t level2[64] );
+ ALIGNED_16( int16_t level1[64] );
+ ALIGNED_16( int16_t level2[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
@@ -624,13 +657,14 @@ static int check_dct( int cpu_ref, int cpu_new )
#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
+ int nz_a, nz_c; \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
- call_c1( zigzag_c.name, t1, buf2, buf3 ); \
- call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
- if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
+ nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
+ nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
+ if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
@@ -639,6 +673,35 @@ static int check_dct( int cpu_ref, int cpu_new )
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
}
+#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
+ if( zigzag_asm.name != zigzag_ref.name ) \
+ { \
+ int nz_a, nz_c; \
+ int16_t dc_a, dc_c; \
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ used_asm = 1; \
+ for( i = 0; i < 2; i++ ) \
+ { \
+ memcpy( buf3, buf2, 16*FDEC_STRIDE ); \
+ memcpy( buf4, buf2, 16*FDEC_STRIDE ); \
+ for( j = 0; j < 4; j++ ) \
+ { \
+ memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
+ memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
+ } \
+ nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
+ nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
+ if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name " [FAILED]\n" ); \
+ break; \
+ } \
+ } \
+ call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
+ call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
+ }
+
#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
@@ -668,6 +731,7 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+ TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
report( "zigzag_frame :" );
interlace = 1;
@@ -679,10 +743,11 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+ TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
report( "zigzag_field :" );
ok = 1; used_asm = 0;
- TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
+ TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 );
report( "zigzag_interleave :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB
@@ -697,7 +762,7 @@ static int check_mc( int cpu_ref, int cpu_new )
x264_mc_functions_t mc_a;
x264_pixel_function_t pixel;
- uint8_t *src = &buf1[2*32+2];
+ uint8_t *src = &buf1[2*64+2];
uint8_t *src2[4] = { &buf1[3*64+2], &buf1[5*64+2],
&buf1[7*64+2], &buf1[9*64+2] };
uint8_t *dst1 = buf3;
@@ -714,12 +779,13 @@ static int check_mc( int cpu_ref, int cpu_new )
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
{ \
+ const x264_weight_t *weight = weight_none; \
set_func_name( "mc_luma_%dx%d", w, h );\
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
- call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h ); \
+ call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
+ call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
@@ -730,12 +796,13 @@ static int check_mc( int cpu_ref, int cpu_new )
{ \
uint8_t *ref = dst2; \
int ref_stride = 32; \
+ const x264_weight_t *weight = weight_none; \
set_func_name( "get_ref_%dx%d", w, h );\
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
- ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h ); \
+ call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
+ ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
for( i=0; i<h; i++ ) \
if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
{ \
@@ -752,8 +819,8 @@ static int check_mc( int cpu_ref, int cpu_new )
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- call_c( mc_c.mc_chroma, dst1, 16, src, 32, dx, dy, w, h ); \
- call_a( mc_a.mc_chroma, dst2, 16, src, 32, dx, dy, w, h ); \
+ call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
+ call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
for( j=0; j<h; j++ ) \
for( i=w; i<4; i++ ) \
@@ -783,8 +850,9 @@ static int check_mc( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
for( dy = -1; dy < 9; dy++ )
- for( dx = -1; dx < 9; dx++ )
+ for( dx = -128; dx < 128; dx++ )
{
+ if( rand()&15 ) continue;
MC_TEST_CHROMA( 8, 8 );
MC_TEST_CHROMA( 8, 4 );
MC_TEST_CHROMA( 4, 8 );
@@ -822,6 +890,79 @@ static int check_mc( int cpu_ref, int cpu_new )
MC_TEST_AVG( avg, w );
report( "mc wpredb :" );
+#define MC_TEST_WEIGHT( name, weight, aligned ) \
+ int align_off = (aligned ? 0 : rand()%16); \
+ for( i = 1, ok = 1, used_asm = 0; i <= 5; i++ ) \
+ { \
+ ALIGNED_16( uint8_t buffC[640] ); \
+ ALIGNED_16( uint8_t buffA[640] ); \
+ j = X264_MAX( i*4, 2 ); \
+ memset( buffC, 0, 640 ); \
+ memset( buffA, 0, 640 ); \
+ x264_t ha; \
+ ha.mc = mc_a; \
+ /* w12 is the same as w16 in some cases */ \
+ if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
+ continue; \
+ if( mc_a.name[i] != mc_ref.name[i] ) \
+ { \
+ int k; \
+ set_func_name( "%s_w%d", #name, j ); \
+ used_asm = 1; \
+ call_c1( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+ mc_a.weight_cache(&ha, &weight); \
+ call_a1( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+ for( k = 0; k < 16; k++ ) \
+ if( memcmp( &buffC[k*32], &buffA[k*32], j ) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
+ break; \
+ } \
+ call_c2( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+ call_a2( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+
+ int s,o,d;
+ int align_cnt = 0;
+ for( s = 0; s <= 127 && ok; s++ )
+ {
+ for( o = -128; o <= 127 && ok; o++ )
+ {
+ if( rand() & 2047 ) continue;
+ for( d = 0; d <= 7 && ok; d++ )
+ {
+ if( s == 1<<d )
+ continue;
+ x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
+ MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
+ }
+ }
+
+ }
+ report( "mc weight :" );
+
+ ok = 1; used_asm = 0;
+ s = 1; d = 0;
+ for( o = 0; o <= 127 && ok; o++ )
+ {
+ if( rand() & 15 ) continue;
+ x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
+ MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
+ }
+ report( "mc offsetadd :" );
+ ok = 1; used_asm = 0;
+ for( o = -128; o < 0 && ok; o++ )
+ {
+ if( rand() & 15 ) continue;
+ x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
+ MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
+ }
+ report( "mc offsetsub :" );
+
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
uint8_t *src = buf1+8+2*64;
@@ -855,7 +996,7 @@ static int check_mc( int cpu_ref, int cpu_new )
if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
{
uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
- uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
+ uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 };
set_func_name( "lowres_init" );
ok = 1; used_asm = 1;
for( w=40; w<=48; w+=8 )
@@ -908,6 +1049,34 @@ static int check_mc( int cpu_ref, int cpu_new )
INTEGRAL_INIT( integral_init8v, 9, sum, stride );
report( "integral init :" );
+ if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
+ {
+ ok = 1; used_asm = 1;
+ set_func_name( "mbtree_propagate" );
+ int *dsta = (int*)buf3;
+ int *dstc = dsta+400;
+ uint16_t *prop = (uint16_t*)buf1;
+ uint16_t *intra = (uint16_t*)buf4;
+ uint16_t *inter = intra+400;
+ uint16_t *qscale = inter+400;
+ uint16_t *rand = (uint16_t*)buf2;
+ x264_emms();
+ for( i=0; i<400; i++ )
+ {
+ intra[i] = *rand++ & 0x7fff;
+ intra[i] += !intra[i];
+ inter[i] = *rand++ & 0x7fff;
+ qscale[i] = *rand++ & 0x7fff;
+ }
+ call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
+ call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
+ // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+ x264_emms();
+ for( i=0; i<400; i++ )
+ ok &= abs(dstc[i]-dsta[i]) <= (abs(dstc[i])>512) || fabs((double)dstc[i]/dsta[i]-1) < 1e-6;
+ report( "mbtree propagate :" );
+ }
+
return ret;
}
@@ -981,9 +1150,9 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- DECLARE_ALIGNED_16( int16_t dct1[64] );
- DECLARE_ALIGNED_16( int16_t dct2[64] );
- DECLARE_ALIGNED_16( uint8_t cqm_buf[64] );
+ ALIGNED_16( int16_t dct1[64] );
+ ALIGNED_16( int16_t dct2[64] );
+ ALIGNED_16( uint8_t cqm_buf[64] );
int ret = 0, ok, used_asm;
int oks[2] = {1,1}, used_asms[2] = {0,0};
int i, j, i_cqm, qp;
@@ -992,6 +1161,7 @@ static int check_quant( int cpu_ref, int cpu_new )
memset( h, 0, sizeof(*h) );
h->pps = h->pps_array;
x264_param_default( &h->param );
+ h->chroma_qp_table = i_chroma_qp_table + 12;
h->param.rc.i_qp_min = 26;
h->param.analyse.b_transform_8x8 = 1;
@@ -1030,25 +1200,21 @@ static int check_quant( int cpu_ref, int cpu_new )
#define INIT_QUANT8() \
{ \
static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
- int x, y; \
- for( y = 0; y < 8; y++ ) \
- for( x = 0; x < 8; x++ ) \
- { \
- unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
- dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \
- } \
+ for( i = 0; i < 64; i++ ) \
+ { \
+ unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \
+ dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+ } \
}
#define INIT_QUANT4() \
{ \
static const int scale1d[4] = {4,6,4,6}; \
- int x, y; \
- for( y = 0; y < 4; y++ ) \
- for( x = 0; x < 4; x++ ) \
- { \
- unsigned int scale = 255*scale1d[y]*scale1d[x]; \
- dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \
- } \
+ for( i = 0; i < 16; i++ ) \
+ { \
+ unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \
+ dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+ } \
}
#define TEST_QUANT_DC( name, cqm ) \
@@ -1063,16 +1229,16 @@ static int check_quant( int cpu_ref, int cpu_new )
int result_c, result_a; \
for( i = 0; i < 16; i++ ) \
dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
- result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
- result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
{ \
oks[0] = 0; \
fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
break; \
} \
- call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
- call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
} \
} \
}
@@ -1088,16 +1254,16 @@ static int check_quant( int cpu_ref, int cpu_new )
{ \
int result_c, result_a; \
INIT_QUANT##w() \
- result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
- result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
{ \
oks[0] = 0; \
fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
break; \
} \
- call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
- call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
} \
} \
}
@@ -1118,18 +1284,18 @@ static int check_quant( int cpu_ref, int cpu_new )
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT##w() \
- call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
memcpy( dct2, dct1, w*w*2 ); \
- call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
- call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
if( memcmp( dct1, dct2, w*w*2 ) ) \
{ \
oks[1] = 0; \
fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
break; \
} \
- call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
- call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+ call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
} \
}
@@ -1147,17 +1313,17 @@ static int check_quant( int cpu_ref, int cpu_new )
{ \
for( i = 0; i < 16; i++ ) \
dct1[i] = rand(); \
- call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
+ call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*2 ); \
- call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
- call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
if( memcmp( dct1, dct2, w*w*2 ) ) \
{ \
oks[1] = 0; \
fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
} \
- call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
- call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+ call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
} \
}
@@ -1205,8 +1371,8 @@ static int check_quant( int cpu_ref, int cpu_new )
dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
if( ac ) \
dct1[0] = 0; \
- result_c = call_c( qf_c.decname, (void*)dct1 ); \
- result_a = call_a( qf_a.decname, (void*)dct1 ); \
+ result_c = call_c( qf_c.decname, dct1 ); \
+ result_a = call_a( qf_a.decname, dct1 ); \
if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
{ \
ok = 0; \
@@ -1236,8 +1402,8 @@ static int check_quant( int cpu_ref, int cpu_new )
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
- result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \
- result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \
+ result_c = call_c( qf_c.last, dct1+ac ); \
+ result_a = call_a( qf_a.last, dct1+ac ); \
if( result_c != result_a ) \
{ \
ok = 0; \
@@ -1271,8 +1437,8 @@ static int check_quant( int cpu_ref, int cpu_new )
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
- result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \
- result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \
+ result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
+ result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
@@ -1297,8 +1463,8 @@ static int check_intra( int cpu_ref, int cpu_new )
{
int ret = 0, ok = 1, used_asm = 0;
int i;
- DECLARE_ALIGNED_16( uint8_t edge[33] );
- DECLARE_ALIGNED_16( uint8_t edge2[33] );
+ ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_16( uint8_t edge2[33] );
struct
{
x264_predict_t predict_16x16[4+3];
@@ -1502,6 +1668,13 @@ static int check_all_flags( void )
fprintf( stderr, "x264: ALTIVEC against C\n" );
ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
}
+#elif ARCH_ARM
+ if( x264_cpu_detect() & X264_CPU_ARMV6 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
+ if( x264_cpu_detect() & X264_CPU_NEON )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+ if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#endif
return ret;
}
@@ -1513,7 +1686,7 @@ int main(int argc, char *argv[])
if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
{
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
return 1;
#endif
@@ -1532,6 +1705,11 @@ int main(int argc, char *argv[])
srand( i );
buf1 = x264_malloc( 0x3e00 + 16*BENCH_ALIGNS );
+ if( !buf1 )
+ {
+ fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
+ return -1;
+ }
buf2 = buf1 + 0xf00;
buf3 = buf2 + 0xf00;
buf4 = buf3 + 0x1000;
diff --git a/x264.c b/x264.c
index 9aafc35..58bc1f4 100644
--- a/x264.c
+++ b/x264.c
@@ -5,6 +5,8 @@
*
* Authors: Loren Merritt <lorenm at u.washington.edu>
* Laurent Aimar <fenrir at via.ecp.fr>
+ * Steven Walters <kemuri9 at gmail.com>
+ * Kieran Kunhya <kieran at kunhya.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,19 +35,12 @@
#include "x264.h"
#include "muxers.h"
-#ifndef _MSC_VER
-#include "config.h"
-#endif
-
#ifdef _WIN32
#include <windows.h>
#else
#define SetConsoleTitle(t)
#endif
-uint8_t *mux_buffer = NULL;
-int mux_buffer_size = 0;
-
/* Ctrl-C handler */
static int b_ctrl_c = 0;
static int b_exit_on_ctrl_c = 0;
@@ -64,24 +59,43 @@ typedef struct {
FILE *qpfile;
} cli_opt_t;
-/* input file operation function pointers */
-int (*p_open_infile)( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int (*p_get_frame_total)( hnd_t handle );
-int (*p_read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int (*p_close_infile)( hnd_t handle );
+/* i/o file operation function pointer structs */
+cli_input_t input;
+static cli_output_t output;
-/* output file operation function pointers */
-static int (*p_open_outfile)( char *psz_filename, hnd_t *p_handle );
-static int (*p_set_outfile_param)( hnd_t handle, x264_param_t *p_param );
-static int (*p_write_nalu)( hnd_t handle, uint8_t *p_nal, int i_size );
-static int (*p_set_eop)( hnd_t handle, x264_picture_t *p_picture );
-static int (*p_close_outfile)( hnd_t handle );
+static const char * const demuxer_names[] =
+{
+ "auto",
+ "yuv",
+ "y4m",
+#ifdef AVS_INPUT
+ "avs",
+#endif
+#ifdef LAVF_INPUT
+ "lavf",
+#endif
+#ifdef FFMS_INPUT
+ "ffms",
+#endif
+ 0
+};
-static void Help( x264_param_t *defaults, int b_longhelp );
+static const char * const muxer_names[] =
+{
+ "auto",
+ "raw",
+ "mkv",
+ "flv",
+#ifdef MP4_OUTPUT
+ "mp4",
+#endif
+ 0
+};
+
+static void Help( x264_param_t *defaults, int longhelp );
static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt );
static int Encode( x264_param_t *param, cli_opt_t *opt );
-
/****************************************************************************
* main:
****************************************************************************/
@@ -126,31 +140,59 @@ static char const *strtable_lookup( const char * const table[], int index )
return ( ( index >= 0 && index < i ) ? table[ index ] : "???" );
}
+static char *stringify_names( char *buf, const char * const names[] )
+{
+ int i = 0;
+ char *p = buf;
+ for( p[0] = 0; names[i]; i++ )
+ {
+ p += sprintf( p, "%s", names[i] );
+ if( names[i+1] )
+ p += sprintf( p, ", " );
+ }
+ return buf;
+}
+
/*****************************************************************************
* Help:
*****************************************************************************/
-static void Help( x264_param_t *defaults, int b_longhelp )
+static void Help( x264_param_t *defaults, int longhelp )
{
+ char buf[50];
#define H0 printf
-#define H1 if(b_longhelp) printf
+#define H1 if(longhelp>=1) printf
+#define H2 if(longhelp==2) printf
H0( "x264 core:%d%s\n"
"Syntax: x264 [options] -o outfile infile [widthxheight]\n"
"\n"
"Infile can be raw YUV 4:2:0 (in which case resolution is required),\n"
" or YUV4MPEG 4:2:0 (*.y4m),\n"
- " or AVI or Avisynth if compiled with AVIS support (%s).\n"
+ " or Avisynth if compiled with support (%s).\n"
+ " or libav* formats if compiled with lavf support (%s) or ffms support (%s).\n"
"Outfile type is selected by filename:\n"
" .264 -> Raw bytestream\n"
" .mkv -> Matroska\n"
+ " .flv -> Flash Video\n"
" .mp4 -> MP4 if compiled with GPAC support (%s)\n"
"\n"
"Options:\n"
"\n"
- " -h, --help List the more commonly used options\n"
- " --longhelp List all options\n"
+ " -h, --help List basic options\n"
+ " --longhelp List more options\n"
+ " --fullhelp List all options\n"
"\n",
X264_BUILD, X264_VERSION,
-#ifdef AVIS_INPUT
+#ifdef AVS_INPUT
+ "yes",
+#else
+ "no",
+#endif
+#ifdef LAVF_INPUT
+ "yes",
+#else
+ "no",
+#endif
+#ifdef FFMS_INPUT
"yes",
#else
"no",
@@ -161,333 +203,844 @@ static void Help( x264_param_t *defaults, int b_longhelp )
"no"
#endif
);
+ H0( "Example usage:\n" );
+ H0( "\n" );
+ H0( " Constant quality mode:\n" );
+ H0( " x264 --crf 24 -o <output> <input>\n" );
+ H0( "\n" );
+ H0( " Two-pass with a bitrate of 1000kbps:\n" );
+ H0( " x264 --pass 1 --bitrate 1000 -o <output> <input>\n" );
+ H0( " x264 --pass 2 --bitrate 1000 -o <output> <input>\n" );
+ H0( "\n" );
+ H0( " Lossless:\n" );
+ H0( " x264 --crf 0 -o <output> <input>\n" );
+ H0( "\n" );
+ H0( " Maximum PSNR at the cost of speed and visual quality:\n" );
+ H0( " x264 --preset placebo --tune psnr -o <output> <input>\n" );
+ H0( "\n" );
+ H0( " Constant bitrate at 1000kbps with a 2 second-buffer:\n");
+ H0( " x264 --vbv-bufsize 2000 --bitrate 1000 -o <output> <input>\n" );
+ H0( "\n" );
+ H0( "Presets:\n" );
+ H0( "\n" );
+ H0( " --profile Force the limits of an H.264 profile [high]\n"
+ " Overrides all settings.\n" );
+ H2( " - baseline:\n"
+ " --no-8x8dct --bframes 0 --no-cabac\n"
+ " --cqm flat --weightp 0 No interlaced\n"
+ " No lossless\n"
+ " - main:\n"
+ " --no-8x8dct --cqm flat No lossless\n"
+ " - high:\n"
+ " No lossless\n" );
+ else H0( " - baseline,main,high\n" );
+ H0( " --preset Use a preset to select encoding settings [medium]\n"
+ " Overridden by user settings.\n" );
+ H2( " - ultrafast:\n"
+ " --no-8x8dct --aq-mode 0 --b-adapt 0\n"
+ " --bframes 0 --no-cabac --no-deblock\n"
+ " --no-mbtree --me dia --no-mixed-refs\n"
+ " --partitions none --ref 1 --scenecut 0\n"
+ " --subme 0 --trellis 0 --no-weightb\n"
+ " --weightp 0\n"
+ " - veryfast:\n"
+ " --no-mbtree --me dia --no-mixed-refs\n"
+ " --partitions i8x8,i4x4 --ref 1\n"
+ " --subme 1 --trellis 0 --weightp 0\n"
+ " - faster:\n"
+ " --no-mbtree --no-mixed-refs --ref 2\n"
+ " --subme 4 --weightp 1\n"
+ " - fast\n"
+ " --rc-lookahead 30 --ref 2 --subme 6\n"
+ " - medium\n"
+ " Default settings apply.\n"
+ " - slow\n"
+ " --b-adapt 2 --direct auto --me umh\n"
+ " --rc-lookahead 50 --ref 5 --subme 8\n"
+ " - slower\n"
+ " --b-adapt 2 --direct auto --me umh\n"
+ " --partitions all --rc-lookahead 60\n"
+ " --ref 8 --subme 9 --trellis 2\n"
+ " - veryslow\n"
+ " --b-adapt 2 --bframes 8 --direct auto\n"
+ " --me umh --merange 24 --partitions all\n"
+ " --ref 16 --subme 10 --trellis 2\n"
+ " --rc-lookahead 60\n"
+ " - placebo\n"
+ " --bframes 16 --b-adapt 2 --direct auto\n"
+ " --slow-firstpass --no-fast-pskip\n"
+ " --me tesa --merange 24 --partitions all\n"
+ " --rc-lookahead 60 --ref 16 --subme 10\n"
+ " --trellis 2\n" );
+ else H0( " - ultrafast,veryfast,faster,fast,medium\n"
+ " - slow,slower,veryslow,placebo\n" );
+ H0( " --tune Tune the settings for a particular type of source\n"
+ " or situation\n"
+ " Overridden by user settings.\n"
+ " Multiple tunings are separated by commas.\n"
+ " Only one psy tuning can be used at a time.\n" );
+ H2( " - film (psy tuning):\n"
+ " --deblock -1:-1 --psy-rd <unset>:0.15\n"
+ " - animation(psy tuning):\n"
+ " --bframes {+2} --deblock 1:1\n"
+ " --psy-rd 0.4:<unset> --aq-strength 0.6\n"
+ " --ref {Double if >1 else 1}\n"
+ " - grain(psy tuning):\n"
+ " --aq-strength 0.5 --no-dct-decimate\n"
+ " --deadzone-inter 6 --deadzone-intra 6\n"
+ " --deblock -2:-2 --ipratio 1.1 \n"
+ " --pbratio 1.1 --psy-rd <unset>:0.25\n"
+ " --qcomp 0.8\n"
+ " - psnr(psy tuning):\n"
+ " --aq-mode 0 --no-psy\n"
+ " - ssim(psy tuning):\n"
+ " --aq-mode 2 --no-psy\n"
+ " - fastdecode:\n"
+ " --no-cabac --no-deblock --no-weightb\n"
+ " --weightp 0\n"
+ " - zerolatency:\n"
+ " --bframes 0 --rc-lookahead 0\n"
+ " --sync-lookahead 0 --sliced-threads\n"
+ " - touhou(psy tuning):\n"
+ " --aq-strength 1.3 --deblock -1:-1\n"
+ " --partitions {p4x4 if p8x8 set}\n"
+ " --psy-rd <unset>:0.2\n"
+ " --ref {Double if >1 else 1}\n" );
+ else H0( " - psy tunings: film,animation,grain,psnr,ssim\n"
+ " - other tunings: fastdecode,zerolatency\n" );
+ H1( " --slow-firstpass Don't use faster settings with --pass 1\n" );
+ H0( "\n" );
H0( "Frame-type options:\n" );
H0( "\n" );
H0( " -I, --keyint <integer> Maximum GOP size [%d]\n", defaults->i_keyint_max );
- H1( " -i, --min-keyint <integer> Minimum GOP size [%d]\n", defaults->i_keyint_min );
- H1( " --no-scenecut Disable adaptive I-frame decision\n" );
- H1( " --scenecut <integer> How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
- H0( " -b, --bframes <integer> Number of B-frames between I and P [%d]\n", defaults->i_bframe );
- H1( " --b-adapt Adaptive B-frame decision method [%d]\n"
+ H2( " -i, --min-keyint <integer> Minimum GOP size [%d]\n", defaults->i_keyint_min );
+ H2( " --no-scenecut Disable adaptive I-frame decision\n" );
+ H2( " --scenecut <integer> How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
+ H2( " --intra-refresh Use Periodic Intra Refresh instead of IDR frames\n" );
+ H1( " -b, --bframes <integer> Number of B-frames between I and P [%d]\n", defaults->i_bframe );
+ H1( " --b-adapt <integer> Adaptive B-frame decision method [%d]\n"
" Higher values may lower threading efficiency.\n"
" - 0: Disabled\n"
" - 1: Fast\n"
" - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive );
- H1( " --b-bias <integer> Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
- H0( " --b-pyramid Keep some B-frames as references\n" );
- H0( " --no-cabac Disable CABAC\n" );
- H0( " -r, --ref <integer> Number of reference frames [%d]\n", defaults->i_frame_reference );
+ H2( " --b-bias <integer> Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
+ H1( " --b-pyramid <string> Keep some B-frames as references [%s]\n"
+ " - none: Disabled\n"
+ " - strict: Strictly hierarchical pyramid\n"
+ " - normal: Non-strict (not Blu-ray compatible)\n",
+ strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) );
+ H1( " --no-cabac Disable CABAC\n" );
+ H1( " -r, --ref <integer> Number of reference frames [%d]\n", defaults->i_frame_reference );
H1( " --no-deblock Disable loop filter\n" );
- H0( " -f, --deblock <alpha:beta> Loop filter AlphaC0 and Beta parameters [%d:%d]\n",
+ H1( " -f, --deblock <alpha:beta> Loop filter parameters [%d:%d]\n",
defaults->i_deblocking_filter_alphac0, defaults->i_deblocking_filter_beta );
+ H2( " --slices <integer> Number of slices per frame; forces rectangular\n"
+ " slices and is overridden by other slicing options\n" );
+ else H1( " --slices <integer> Number of slices per frame\n" );
+ H2( " --slice-max-size <integer> Limit the size of each slice in bytes\n");
+ H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n");
H0( " --interlaced Enable pure-interlaced mode\n" );
+ H2( " --constrained-intra Enable constrained intra prediction.\n" );
H0( "\n" );
H0( "Ratecontrol:\n" );
H0( "\n" );
- H0( " -q, --qp <integer> Set QP (0=lossless) [%d]\n", defaults->rc.i_qp_constant );
+ H1( " -q, --qp <integer> Force constant QP (0-51, 0=lossless)\n" );
H0( " -B, --bitrate <integer> Set bitrate (kbit/s)\n" );
- H0( " --crf <float> Quality-based VBR (nominal QP)\n" );
- H1( " --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
- H0( " --vbv-bufsize <integer> Enable CBR and set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
- H1( " --vbv-init <float> Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init );
- H1( " --qpmin <integer> Set min QP [%d]\n", defaults->rc.i_qp_min );
- H1( " --qpmax <integer> Set max QP [%d]\n", defaults->rc.i_qp_max );
- H1( " --qpstep <integer> Set max QP step [%d]\n", defaults->rc.i_qp_step );
- H0( " --ratetol <float> Allowed variance of average bitrate [%.1f]\n", defaults->rc.f_rate_tolerance );
- H0( " --ipratio <float> QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
- H0( " --pbratio <float> QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
- H1( " --chroma-qp-offset <integer> QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
- H1( " --aq-mode <integer> AQ method [%d]\n"
+ H0( " --crf <float> Quality-based VBR (0-51, 0=lossless) [%.1f]\n", defaults->rc.f_rf_constant );
+ H1( " --rc-lookahead <integer> Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead );
+ H0( " --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
+ H0( " --vbv-bufsize <integer> Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
+ H2( " --vbv-init <float> Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init );
+ H2( " --qpmin <integer> Set min QP [%d]\n", defaults->rc.i_qp_min );
+ H2( " --qpmax <integer> Set max QP [%d]\n", defaults->rc.i_qp_max );
+ H2( " --qpstep <integer> Set max QP step [%d]\n", defaults->rc.i_qp_step );
+ H2( " --ratetol <float> Tolerance of ABR ratecontrol and VBV [%.1f]\n", defaults->rc.f_rate_tolerance );
+ H2( " --ipratio <float> QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
+ H2( " --pbratio <float> QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
+ H2( " --chroma-qp-offset <integer> QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
+ H2( " --aq-mode <integer> AQ method [%d]\n"
" - 0: Disabled\n"
- " - 1: Variance AQ (complexity mask)\n", defaults->rc.i_aq_mode );
- H0( " --aq-strength <float> Reduces blocking and blurring in flat and\n"
- " textured areas. [%.1f]\n"
- " - 0.5: weak AQ\n"
- " - 1.5: strong AQ\n", defaults->rc.f_aq_strength );
- H0( "\n" );
- H0( " -p, --pass <1|2|3> Enable multipass ratecontrol\n"
+ " - 1: Variance AQ (complexity mask)\n"
+ " - 2: Auto-variance AQ (experimental)\n", defaults->rc.i_aq_mode );
+ H1( " --aq-strength <float> Reduces blocking and blurring in flat and\n"
+ " textured areas. [%.1f]\n", defaults->rc.f_aq_strength );
+ H1( "\n" );
+ H0( " -p, --pass <integer> Enable multipass ratecontrol\n"
" - 1: First pass, creates stats file\n"
- " - 2: Last pass, does not overwrite stats file\n"
- " - 3: Nth pass, overwrites stats file\n" );
- H0( " --stats <string> Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
- H0( " --qcomp <float> QP curve compression: 0.0 => CBR, 1.0 => CQP [%.2f]\n", defaults->rc.f_qcompress );
- H1( " --cplxblur <float> Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
- H1( " --qblur <float> Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
- H0( " --zones <zone0>/<zone1>/... Tweak the bitrate of some regions of the video\n" );
- H1( " Each zone is of the form\n"
+ " - 2: Last pass, does not overwrite stats file\n" );
+ H2( " - 3: Nth pass, overwrites stats file\n" );
+ H1( " --stats <string> Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
+ H2( " --no-mbtree Disable mb-tree ratecontrol.\n");
+ H2( " --qcomp <float> QP curve compression [%.2f]\n", defaults->rc.f_qcompress );
+ H2( " --cplxblur <float> Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
+ H2( " --qblur <float> Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
+ H2( " --zones <zone0>/<zone1>/... Tweak the bitrate of regions of the video\n" );
+ H2( " Each zone is of the form\n"
" <start frame>,<end frame>,<option>\n"
" where <option> is either\n"
" q=<integer> (force QP)\n"
" or b=<float> (bitrate multiplier)\n" );
- H1( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
+ H2( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
" Format of each line: framenumber frametype QP\n"
- " QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
- H0( "\n" );
- H0( "Analysis:\n" );
- H0( "\n" );
- H0( " -A, --partitions <string> Partitions to consider [\"p8x8,b8x8,i8x8,i4x4\"]\n"
+ " QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
+ " QPs are restricted by qpmin/qpmax.\n" );
+ H1( "\n" );
+ H1( "Analysis:\n" );
+ H1( "\n" );
+ H1( " -A, --partitions <string> Partitions to consider [\"p8x8,b8x8,i8x8,i4x4\"]\n"
" - p8x8, p4x4, b8x8, i8x8, i4x4\n"
" - none, all\n"
" (p4x4 requires p8x8. i8x8 requires --8x8dct.)\n" );
- H0( " --direct <string> Direct MV prediction mode [\"%s\"]\n"
+ H1( " --direct <string> Direct MV prediction mode [\"%s\"]\n"
" - none, spatial, temporal, auto\n",
strtable_lookup( x264_direct_pred_names, defaults->analyse.i_direct_mv_pred ) );
- H0( " -w, --weightb Weighted prediction for B-frames\n" );
- H0( " --me <string> Integer pixel motion estimation method [\"%s\"]\n",
+ H2( " --no-weightb Disable weighted prediction for B-frames\n" );
+ H1( " --weightp <integer> Weighted prediction for P-frames [%d]\n"
+ " - 0: Disabled\n"
+ " - 1: Blind offset\n"
+ " - 2: Smart analysis\n", defaults->analyse.i_weighted_pred );
+ H1( " --me <string> Integer pixel motion estimation method [\"%s\"]\n",
strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
- H1( " - dia: diamond search, radius 1 (fast)\n"
+ H2( " - dia: diamond search, radius 1 (fast)\n"
" - hex: hexagonal search, radius 2\n"
" - umh: uneven multi-hexagon search\n"
" - esa: exhaustive search\n"
" - tesa: hadamard exhaustive search (slow)\n" );
- else H0( " - dia, hex, umh\n" );
- H0( " --merange <integer> Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
- H1( " --mvrange <integer> Maximum motion vector length [-1 (auto)]\n" );
- H1( " --mvrange-thread <int> Minimum buffer between threads [-1 (auto)]\n" );
- H0( " -m, --subme <integer> Subpixel motion estimation and mode decision [%d]\n", defaults->analyse.i_subpel_refine );
- H1( " - 0: fullpel only (not recommended)\n"
+ else H1( " - dia, hex, umh\n" );
+ H2( " --merange <integer> Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
+ H2( " --mvrange <integer> Maximum motion vector length [-1 (auto)]\n" );
+ H2( " --mvrange-thread <int> Minimum buffer between threads [-1 (auto)]\n" );
+ H1( " -m, --subme <integer> Subpixel motion estimation and mode decision [%d]\n", defaults->analyse.i_subpel_refine );
+ H2( " - 0: fullpel only (not recommended)\n"
" - 1: SAD mode decision, one qpel iteration\n"
" - 2: SATD mode decision\n"
" - 3-5: Progressively more qpel\n"
" - 6: RD mode decision for I/P-frames\n"
" - 7: RD mode decision for all frames\n"
" - 8: RD refinement for I/P-frames\n"
- " - 9: RD refinement for all frames\n" );
- else H0( " decision quality: 1=fast, 9=best.\n" );
- H0( " --psy-rd Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
+ " - 9: RD refinement for all frames\n"
+ " - 10: QP-RD - requires trellis=2, aq-mode>0\n" );
+ else H1( " decision quality: 1=fast, 10=best.\n" );
+ H1( " --psy-rd Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
" #1: RD (requires subme>=6)\n"
" #2: Trellis (requires trellis, experimental)\n",
defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
- H0( " --mixed-refs Decide references on a per partition basis\n" );
- H1( " --no-chroma-me Ignore chroma in motion estimation\n" );
- H0( " -8, --8x8dct Adaptive spatial transform size\n" );
- H0( " -t, --trellis <integer> Trellis RD quantization. Requires CABAC. [%d]\n"
+ H2( " --no-psy Disable all visual optimizations that worsen\n"
+ " both PSNR and SSIM.\n" );
+ H2( " --no-mixed-refs Don't decide references on a per partition basis\n" );
+ H2( " --no-chroma-me Ignore chroma in motion estimation\n" );
+ H1( " --no-8x8dct Disable adaptive spatial transform size\n" );
+ H1( " -t, --trellis <integer> Trellis RD quantization. Requires CABAC. [%d]\n"
" - 0: disabled\n"
" - 1: enabled only on the final encode of a MB\n"
" - 2: enabled on all mode decisions\n", defaults->analyse.i_trellis );
- H0( " --no-fast-pskip Disables early SKIP detection on P-frames\n" );
- H0( " --no-dct-decimate Disables coefficient thresholding on P-frames\n" );
- H0( " --nr <integer> Noise reduction [%d]\n", defaults->analyse.i_noise_reduction );
- H1( "\n" );
- H1( " --deadzone-inter <int> Set the size of the inter luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[0] );
- H1( " --deadzone-intra <int> Set the size of the intra luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[1] );
- H1( " Deadzones should be in the range 0 - 32.\n" );
- H1( " --cqm <string> Preset quant matrices [\"flat\"]\n"
+ H2( " --no-fast-pskip Disables early SKIP detection on P-frames\n" );
+ H2( " --no-dct-decimate Disables coefficient thresholding on P-frames\n" );
+ H1( " --nr <integer> Noise reduction [%d]\n", defaults->analyse.i_noise_reduction );
+ H2( "\n" );
+ H2( " --deadzone-inter <int> Set the size of the inter luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[0] );
+ H2( " --deadzone-intra <int> Set the size of the intra luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[1] );
+ H2( " Deadzones should be in the range 0 - 32.\n" );
+ H2( " --cqm <string> Preset quant matrices [\"flat\"]\n"
" - jvt, flat\n" );
- H0( " --cqmfile <string> Read custom quant matrices from a JM-compatible file\n" );
- H1( " Overrides any other --cqm* options.\n" );
- H1( " --cqm4 <list> Set all 4x4 quant matrices\n"
+ H1( " --cqmfile <string> Read custom quant matrices from a JM-compatible file\n" );
+ H2( " Overrides any other --cqm* options.\n" );
+ H2( " --cqm4 <list> Set all 4x4 quant matrices\n"
" Takes a comma-separated list of 16 integers.\n" );
- H1( " --cqm8 <list> Set all 8x8 quant matrices\n"
+ H2( " --cqm8 <list> Set all 8x8 quant matrices\n"
" Takes a comma-separated list of 64 integers.\n" );
- H1( " --cqm4i, --cqm4p, --cqm8i, --cqm8p\n"
+ H2( " --cqm4i, --cqm4p, --cqm8i, --cqm8p\n"
" Set both luma and chroma quant matrices\n" );
- H1( " --cqm4iy, --cqm4ic, --cqm4py, --cqm4pc\n"
+ H2( " --cqm4iy, --cqm4ic, --cqm4py, --cqm4pc\n"
" Set individual quant matrices\n" );
- H1( "\n" );
- H1( "Video Usability Info (Annex E):\n" );
- H1( "The VUI settings are not used by the encoder but are merely suggestions to\n" );
- H1( "the playback equipment. See doc/vui.txt for details. Use at your own risk.\n" );
- H1( "\n" );
- H1( " --overscan <string> Specify crop overscan setting [\"%s\"]\n"
+ H2( "\n" );
+ H2( "Video Usability Info (Annex E):\n" );
+ H2( "The VUI settings are not used by the encoder but are merely suggestions to\n" );
+ H2( "the playback equipment. See doc/vui.txt for details. Use at your own risk.\n" );
+ H2( "\n" );
+ H2( " --overscan <string> Specify crop overscan setting [\"%s\"]\n"
" - undef, show, crop\n",
strtable_lookup( x264_overscan_names, defaults->vui.i_overscan ) );
- H1( " --videoformat <string> Specify video format [\"%s\"]\n"
+ H2( " --videoformat <string> Specify video format [\"%s\"]\n"
" - component, pal, ntsc, secam, mac, undef\n",
strtable_lookup( x264_vidformat_names, defaults->vui.i_vidformat ) );
- H1( " --fullrange <string> Specify full range samples setting [\"%s\"]\n"
+ H2( " --fullrange <string> Specify full range samples setting [\"%s\"]\n"
" - off, on\n",
strtable_lookup( x264_fullrange_names, defaults->vui.b_fullrange ) );
- H1( " --colorprim <string> Specify color primaries [\"%s\"]\n"
+ H2( " --colorprim <string> Specify color primaries [\"%s\"]\n"
" - undef, bt709, bt470m, bt470bg\n"
" smpte170m, smpte240m, film\n",
strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) );
- H1( " --transfer <string> Specify transfer characteristics [\"%s\"]\n"
+ H2( " --transfer <string> Specify transfer characteristics [\"%s\"]\n"
" - undef, bt709, bt470m, bt470bg, linear,\n"
" log100, log316, smpte170m, smpte240m\n",
strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) );
- H1( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n"
+ H2( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n"
" - undef, bt709, fcc, bt470bg\n"
" smpte170m, smpte240m, GBR, YCgCo\n",
strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) );
- H1( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n",
+ H2( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n",
defaults->vui.i_chroma_loc );
H0( "\n" );
H0( "Input/Output:\n" );
H0( "\n" );
H0( " -o, --output Specify output file\n" );
+ H1( " --muxer <string> Specify output container format [\"%s\"]\n"
+ " - %s\n", muxer_names[0], stringify_names( buf, muxer_names ) );
+ H1( " --demuxer <string> Specify input container format [\"%s\"]\n"
+ " - %s\n", demuxer_names[0], stringify_names( buf, demuxer_names ) );
+ H1( " --index <string> Filename for input index file\n" );
H0( " --sar width:height Specify Sample Aspect Ratio\n" );
H0( " --fps <float|rational> Specify framerate\n" );
H0( " --seek <integer> First frame to encode\n" );
H0( " --frames <integer> Maximum number of frames to encode\n" );
H0( " --level <string> Specify level (as defined by Annex A)\n" );
- H0( "\n" );
- H0( " -v, --verbose Print stats for each frame\n" );
- H0( " --progress Show a progress indicator while encoding\n" );
+ H1( "\n" );
+ H1( " -v, --verbose Print stats for each frame\n" );
+ H1( " --no-progress Don't show the progress indicator while encoding\n" );
H0( " --quiet Quiet Mode\n" );
- H0( " --no-psnr Disable PSNR computation\n" );
- H0( " --no-ssim Disable SSIM computation\n" );
- H0( " --threads <integer> Parallel encoding\n" );
- H0( " --thread-input Run Avisynth in its own thread\n" );
- H1( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" );
- H1( " --asm <integer> Override CPU detection\n" );
- H1( " --no-asm Disable all CPU optimizations\n" );
- H1( " --visualize Show MB types overlayed on the encoded video\n" );
- H1( " --dump-yuv <string> Save reconstructed frames\n" );
- H1( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
- H1( " --aud Use access unit delimiters\n" );
+ H1( " --psnr Enable PSNR computation\n" );
+ H1( " --ssim Enable SSIM computation\n" );
+ H1( " --threads <integer> Force a specific number of threads\n" );
+ H2( " --sliced-threads Low-latency but lower-efficiency threading\n" );
+ H2( " --thread-input Run Avisynth in its own thread\n" );
+ H2( " --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
+ H2( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" );
+ H2( " --asm <integer> Override CPU detection\n" );
+ H2( " --no-asm Disable all CPU optimizations\n" );
+ H2( " --visualize Show MB types overlayed on the encoded video\n" );
+ H2( " --dump-yuv <string> Save reconstructed frames\n" );
+ H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
+ H2( " --aud Use access unit delimiters\n" );
+ H2( " --force-cfr Force constant framerate timestamp generation\n" );
H0( "\n" );
}
+#define OPT_FRAMES 256
+#define OPT_SEEK 257
+#define OPT_QPFILE 258
+#define OPT_THREAD_INPUT 259
+#define OPT_QUIET 260
+#define OPT_NOPROGRESS 261
+#define OPT_VISUALIZE 262
+#define OPT_LONGHELP 263
+#define OPT_PROFILE 264
+#define OPT_PRESET 265
+#define OPT_TUNE 266
+#define OPT_SLOWFIRSTPASS 267
+#define OPT_FULLHELP 268
+#define OPT_FPS 269
+#define OPT_MUXER 270
+#define OPT_DEMUXER 271
+#define OPT_INDEX 272
+#define OPT_INTERLACED 273
+
+static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw";
+static struct option long_options[] =
+{
+ { "help", no_argument, NULL, 'h' },
+ { "longhelp", no_argument, NULL, OPT_LONGHELP },
+ { "fullhelp", no_argument, NULL, OPT_FULLHELP },
+ { "version", no_argument, NULL, 'V' },
+ { "profile", required_argument, NULL, OPT_PROFILE },
+ { "preset", required_argument, NULL, OPT_PRESET },
+ { "tune", required_argument, NULL, OPT_TUNE },
+ { "slow-firstpass", no_argument, NULL, OPT_SLOWFIRSTPASS },
+ { "bitrate", required_argument, NULL, 'B' },
+ { "bframes", required_argument, NULL, 'b' },
+ { "b-adapt", required_argument, NULL, 0 },
+ { "no-b-adapt", no_argument, NULL, 0 },
+ { "b-bias", required_argument, NULL, 0 },
+ { "b-pyramid", required_argument, NULL, 0 },
+ { "min-keyint", required_argument, NULL, 'i' },
+ { "keyint", required_argument, NULL, 'I' },
+ { "intra-refresh", no_argument, NULL, 0 },
+ { "scenecut", required_argument, NULL, 0 },
+ { "no-scenecut", no_argument, NULL, 0 },
+ { "nf", no_argument, NULL, 0 },
+ { "no-deblock", no_argument, NULL, 0 },
+ { "filter", required_argument, NULL, 0 },
+ { "deblock", required_argument, NULL, 'f' },
+ { "interlaced", no_argument, NULL, OPT_INTERLACED },
+ { "no-interlaced", no_argument, NULL, OPT_INTERLACED },
+ { "constrained-intra", no_argument, NULL, 0 },
+ { "cabac", no_argument, NULL, 0 },
+ { "no-cabac", no_argument, NULL, 0 },
+ { "qp", required_argument, NULL, 'q' },
+ { "qpmin", required_argument, NULL, 0 },
+ { "qpmax", required_argument, NULL, 0 },
+ { "qpstep", required_argument, NULL, 0 },
+ { "crf", required_argument, NULL, 0 },
+ { "rc-lookahead",required_argument, NULL, 0 },
+ { "ref", required_argument, NULL, 'r' },
+ { "asm", required_argument, NULL, 0 },
+ { "no-asm", no_argument, NULL, 0 },
+ { "sar", required_argument, NULL, 0 },
+ { "fps", required_argument, NULL, OPT_FPS },
+ { "frames", required_argument, NULL, OPT_FRAMES },
+ { "seek", required_argument, NULL, OPT_SEEK },
+ { "output", required_argument, NULL, 'o' },
+ { "muxer", required_argument, NULL, OPT_MUXER },
+ { "demuxer", required_argument, NULL, OPT_DEMUXER },
+ { "stdout", required_argument, NULL, OPT_MUXER },
+ { "stdin", required_argument, NULL, OPT_DEMUXER },
+ { "index", required_argument, NULL, OPT_INDEX },
+ { "analyse", required_argument, NULL, 0 },
+ { "partitions", required_argument, NULL, 'A' },
+ { "direct", required_argument, NULL, 0 },
+ { "weightb", no_argument, NULL, 'w' },
+ { "no-weightb", no_argument, NULL, 0 },
+ { "weightp", required_argument, NULL, 0 },
+ { "me", required_argument, NULL, 0 },
+ { "merange", required_argument, NULL, 0 },
+ { "mvrange", required_argument, NULL, 0 },
+ { "mvrange-thread", required_argument, NULL, 0 },
+ { "subme", required_argument, NULL, 'm' },
+ { "psy-rd", required_argument, NULL, 0 },
+ { "no-psy", no_argument, NULL, 0 },
+ { "psy", no_argument, NULL, 0 },
+ { "mixed-refs", no_argument, NULL, 0 },
+ { "no-mixed-refs", no_argument, NULL, 0 },
+ { "no-chroma-me", no_argument, NULL, 0 },
+ { "8x8dct", no_argument, NULL, 0 },
+ { "no-8x8dct", no_argument, NULL, 0 },
+ { "trellis", required_argument, NULL, 't' },
+ { "fast-pskip", no_argument, NULL, 0 },
+ { "no-fast-pskip", no_argument, NULL, 0 },
+ { "no-dct-decimate", no_argument, NULL, 0 },
+ { "aq-strength", required_argument, NULL, 0 },
+ { "aq-mode", required_argument, NULL, 0 },
+ { "deadzone-inter", required_argument, NULL, '0' },
+ { "deadzone-intra", required_argument, NULL, '0' },
+ { "level", required_argument, NULL, 0 },
+ { "ratetol", required_argument, NULL, 0 },
+ { "vbv-maxrate", required_argument, NULL, 0 },
+ { "vbv-bufsize", required_argument, NULL, 0 },
+ { "vbv-init", required_argument, NULL, 0 },
+ { "ipratio", required_argument, NULL, 0 },
+ { "pbratio", required_argument, NULL, 0 },
+ { "chroma-qp-offset", required_argument, NULL, 0 },
+ { "pass", required_argument, NULL, 'p' },
+ { "stats", required_argument, NULL, 0 },
+ { "qcomp", required_argument, NULL, 0 },
+ { "mbtree", no_argument, NULL, 0 },
+ { "no-mbtree", no_argument, NULL, 0 },
+ { "qblur", required_argument, NULL, 0 },
+ { "cplxblur", required_argument, NULL, 0 },
+ { "zones", required_argument, NULL, 0 },
+ { "qpfile", required_argument, NULL, OPT_QPFILE },
+ { "threads", required_argument, NULL, 0 },
+ { "sliced-threads", no_argument, NULL, 0 },
+ { "no-sliced-threads", no_argument, NULL, 0 },
+ { "slice-max-size", required_argument, NULL, 0 },
+ { "slice-max-mbs", required_argument, NULL, 0 },
+ { "slices", required_argument, NULL, 0 },
+ { "thread-input", no_argument, NULL, OPT_THREAD_INPUT },
+ { "sync-lookahead", required_argument, NULL, 0 },
+ { "non-deterministic", no_argument, NULL, 0 },
+ { "psnr", no_argument, NULL, 0 },
+ { "ssim", no_argument, NULL, 0 },
+ { "quiet", no_argument, NULL, OPT_QUIET },
+ { "verbose", no_argument, NULL, 'v' },
+ { "no-progress", no_argument, NULL, OPT_NOPROGRESS },
+ { "visualize", no_argument, NULL, OPT_VISUALIZE },
+ { "dump-yuv", required_argument, NULL, 0 },
+ { "sps-id", required_argument, NULL, 0 },
+ { "aud", no_argument, NULL, 0 },
+ { "nr", required_argument, NULL, 0 },
+ { "cqm", required_argument, NULL, 0 },
+ { "cqmfile", required_argument, NULL, 0 },
+ { "cqm4", required_argument, NULL, 0 },
+ { "cqm4i", required_argument, NULL, 0 },
+ { "cqm4iy", required_argument, NULL, 0 },
+ { "cqm4ic", required_argument, NULL, 0 },
+ { "cqm4p", required_argument, NULL, 0 },
+ { "cqm4py", required_argument, NULL, 0 },
+ { "cqm4pc", required_argument, NULL, 0 },
+ { "cqm8", required_argument, NULL, 0 },
+ { "cqm8i", required_argument, NULL, 0 },
+ { "cqm8p", required_argument, NULL, 0 },
+ { "overscan", required_argument, NULL, 0 },
+ { "videoformat", required_argument, NULL, 0 },
+ { "fullrange", required_argument, NULL, 0 },
+ { "colorprim", required_argument, NULL, 0 },
+ { "transfer", required_argument, NULL, 0 },
+ { "colormatrix", required_argument, NULL, 0 },
+ { "chromaloc", required_argument, NULL, 0 },
+ { "force-cfr", no_argument, NULL, 0 },
+ {0, 0, 0, 0}
+};
+
+static int select_output( const char *muxer, char *filename, x264_param_t *param )
+{
+ const char *ext = get_filename_extension( filename );
+ if( !strcmp( filename, "-" ) || strcasecmp( muxer, "auto" ) )
+ ext = muxer;
+
+ if( !strcasecmp( ext, "mp4" ) )
+ {
+#ifdef MP4_OUTPUT
+ output = mp4_output;
+ param->b_annexb = 0;
+ param->b_aud = 0;
+ param->b_dts_compress = 0;
+ param->b_repeat_headers = 0;
+#else
+ fprintf( stderr, "x264 [error]: not compiled with MP4 output support\n" );
+ return -1;
+#endif
+ }
+ else if( !strcasecmp( ext, "mkv" ) )
+ {
+ output = mkv_output;
+ param->b_annexb = 0;
+ param->b_aud = 0;
+ param->b_dts_compress = 0;
+ param->b_repeat_headers = 0;
+ }
+ else if( !strcasecmp( ext, "flv" ) )
+ {
+ output = flv_output;
+ param->b_annexb = 0;
+ param->b_aud = 0;
+ param->b_dts_compress = 1;
+ param->b_repeat_headers = 0;
+ }
+ else
+ output = raw_output;
+ return 0;
+}
+
+static int select_input( const char *demuxer, char *used_demuxer, char *filename,
+ hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+ const char *ext = get_filename_extension( filename );
+ int b_regular = strcmp( filename, "-" );
+ int b_auto = !strcasecmp( demuxer, "auto" );
+ if( !b_regular && b_auto )
+ ext = "yuv";
+ if( b_regular )
+ {
+ FILE *f = fopen( filename, "r" );
+ if( f )
+ {
+ b_regular = x264_is_regular_file( f );
+ fclose( f );
+ }
+ }
+ const char *module = b_auto ? ext : demuxer;
+
+ if( !strcasecmp( module, "avs" ) || !strcasecmp( ext, "d2v" ) || !strcasecmp( ext, "dga" ) )
+ {
+#ifdef AVS_INPUT
+ input = avs_input;
+ module = "avs";
+#else
+ fprintf( stderr, "x264 [error]: not compiled with AVS input support\n" );
+ return -1;
+#endif
+ }
+ else if( !strcasecmp( module, "y4m" ) )
+ input = y4m_input;
+ else if( !strcasecmp( module, "yuv" ) )
+ input = yuv_input;
+ else
+ {
+#ifdef FFMS_INPUT
+ if( b_regular && (b_auto || !strcasecmp( demuxer, "ffms" )) &&
+ !ffms_input.open_file( filename, p_handle, info, opt ) )
+ {
+ module = "ffms";
+ b_auto = 0;
+ input = ffms_input;
+ }
+#endif
+#ifdef LAVF_INPUT
+ if( (b_auto || !strcasecmp( demuxer, "lavf" )) &&
+ !lavf_input.open_file( filename, p_handle, info, opt ) )
+ {
+ module = "lavf";
+ b_auto = 0;
+ input = lavf_input;
+ }
+#endif
+#ifdef AVS_INPUT
+ if( b_regular && (b_auto || !strcasecmp( demuxer, "avs" )) &&
+ !avs_input.open_file( filename, p_handle, info, opt ) )
+ {
+ module = "avs";
+ b_auto = 0;
+ input = avs_input;
+ }
+#endif
+ if( b_auto && !yuv_input.open_file( filename, p_handle, info, opt ) )
+ {
+ module = "yuv";
+ b_auto = 0;
+ input = yuv_input;
+ }
+
+ if( !(*p_handle) )
+ {
+ fprintf( stderr, "x264 [error]: could not open input file `%s' via any method!\n", filename );
+ return -1;
+ }
+ }
+ strcpy( used_demuxer, module );
+
+ return 0;
+}
+
/*****************************************************************************
* Parse:
*****************************************************************************/
-static int Parse( int argc, char **argv,
- x264_param_t *param, cli_opt_t *opt )
+static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
{
- char *psz_filename = NULL;
+ char *input_filename = NULL;
+ const char *demuxer = demuxer_names[0];
+ char *output_filename = NULL;
+ const char *muxer = muxer_names[0];
x264_param_t defaults = *param;
- char *psz;
- int b_avis = 0;
- int b_y4m = 0;
+ char *profile = NULL;
int b_thread_input = 0;
+ int b_turbo = 1;
+ int b_pass1 = 0;
+ int b_user_ref = 0;
+ int b_user_fps = 0;
+ int b_user_interlaced = 0;
+ int i;
+ cli_input_opt_t input_opt;
memset( opt, 0, sizeof(cli_opt_t) );
+ memset( &input_opt, 0, sizeof(cli_input_opt_t) );
+ opt->b_progress = 1;
+
+ /* Presets are applied before all other options. */
+ for( optind = 0;; )
+ {
+ int c = getopt_long( argc, argv, short_options, long_options, NULL );
+ if( c == -1 )
+ break;
- /* Default input file driver */
- p_open_infile = open_file_yuv;
- p_get_frame_total = get_frame_total_yuv;
- p_read_frame = read_frame_yuv;
- p_close_infile = close_file_yuv;
+ if( c == OPT_PRESET )
+ {
+ if( !strcasecmp( optarg, "ultrafast" ) )
+ {
+ param->i_frame_reference = 1;
+ param->i_scenecut_threshold = 0;
+ param->b_deblocking_filter = 0;
+ param->b_cabac = 0;
+ param->i_bframe = 0;
+ param->analyse.intra = 0;
+ param->analyse.inter = 0;
+ param->analyse.b_transform_8x8 = 0;
+ param->analyse.i_me_method = X264_ME_DIA;
+ param->analyse.i_subpel_refine = 0;
+ param->rc.i_aq_mode = 0;
+ param->analyse.b_mixed_references = 0;
+ param->analyse.i_trellis = 0;
+ param->i_bframe_adaptive = X264_B_ADAPT_NONE;
+ param->rc.b_mb_tree = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ }
+ else if( !strcasecmp( optarg, "veryfast" ) )
+ {
+ param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
+ param->analyse.i_me_method = X264_ME_DIA;
+ param->analyse.i_subpel_refine = 1;
+ param->i_frame_reference = 1;
+ param->analyse.b_mixed_references = 0;
+ param->analyse.i_trellis = 0;
+ param->rc.b_mb_tree = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ }
+ else if( !strcasecmp( optarg, "faster" ) )
+ {
+ param->analyse.b_mixed_references = 0;
+ param->i_frame_reference = 2;
+ param->analyse.i_subpel_refine = 4;
+ param->rc.b_mb_tree = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
+ }
+ else if( !strcasecmp( optarg, "fast" ) )
+ {
+ param->i_frame_reference = 2;
+ param->analyse.i_subpel_refine = 6;
+ param->rc.i_lookahead = 30;
+ }
+ else if( !strcasecmp( optarg, "medium" ) )
+ {
+ /* Default is medium */
+ }
+ else if( !strcasecmp( optarg, "slow" ) )
+ {
+ param->analyse.i_me_method = X264_ME_UMH;
+ param->analyse.i_subpel_refine = 8;
+ param->i_frame_reference = 5;
+ param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+ param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+ param->rc.i_lookahead = 50;
+ }
+ else if( !strcasecmp( optarg, "slower" ) )
+ {
+ param->analyse.i_me_method = X264_ME_UMH;
+ param->analyse.i_subpel_refine = 9;
+ param->i_frame_reference = 8;
+ param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+ param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+ param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+ param->analyse.i_trellis = 2;
+ param->rc.i_lookahead = 60;
+ }
+ else if( !strcasecmp( optarg, "veryslow" ) )
+ {
+ param->analyse.i_me_method = X264_ME_UMH;
+ param->analyse.i_subpel_refine = 10;
+ param->analyse.i_me_range = 24;
+ param->i_frame_reference = 16;
+ param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+ param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+ param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+ param->analyse.i_trellis = 2;
+ param->i_bframe = 8;
+ param->rc.i_lookahead = 60;
+ }
+ else if( !strcasecmp( optarg, "placebo" ) )
+ {
+ param->analyse.i_me_method = X264_ME_TESA;
+ param->analyse.i_subpel_refine = 10;
+ param->analyse.i_me_range = 24;
+ param->i_frame_reference = 16;
+ param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+ param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+ param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+ param->analyse.b_fast_pskip = 0;
+ param->analyse.i_trellis = 2;
+ param->i_bframe = 16;
+ param->rc.i_lookahead = 60;
+ b_turbo = 0;
+ }
+ else
+ {
+ fprintf( stderr, "x264 [error]: invalid preset '%s'\n", optarg );
+ return -1;
+ }
+ }
+ else if( c == '?' )
+ return -1;
+ }
- /* Default output file driver */
- p_open_outfile = open_file_bsf;
- p_set_outfile_param = set_param_bsf;
- p_write_nalu = write_nalu_bsf;
- p_set_eop = set_eop_bsf;
- p_close_outfile = close_file_bsf;
+ /* Tunings are applied next. */
+ for( optind = 0;; )
+ {
+ int c = getopt_long( argc, argv, short_options, long_options, NULL );
+ if( c == -1 )
+ break;
+
+ if( c == OPT_TUNE )
+ {
+ char *s = strtok( optarg, ",./-+" );
+ int psy_tuning_used = 0;
+ while( s )
+ {
+ if( !strncasecmp( s, "film", 4 ) )
+ {
+ if( psy_tuning_used ) goto psy_failure;
+ param->i_deblocking_filter_alphac0 = -1;
+ param->i_deblocking_filter_beta = -1;
+ param->analyse.f_psy_trellis = 0.15;
+ psy_tuning_used = 1;
+ }
+ else if( !strncasecmp( s, "animation", 9 ) )
+ {
+ if( psy_tuning_used ) goto psy_failure;
+ param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
+ param->i_deblocking_filter_alphac0 = 1;
+ param->i_deblocking_filter_beta = 1;
+ param->analyse.f_psy_rd = 0.4;
+ param->rc.f_aq_strength = 0.6;
+ param->i_bframe += 2;
+ psy_tuning_used = 1;
+ }
+ else if( !strncasecmp( s, "grain", 5 ) )
+ {
+ if( psy_tuning_used ) goto psy_failure;
+ param->i_deblocking_filter_alphac0 = -2;
+ param->i_deblocking_filter_beta = -2;
+ param->analyse.f_psy_trellis = 0.25;
+ param->analyse.b_dct_decimate = 0;
+ param->rc.f_pb_factor = 1.1;
+ param->rc.f_ip_factor = 1.1;
+ param->rc.f_aq_strength = 0.5;
+ param->analyse.i_luma_deadzone[0] = 6;
+ param->analyse.i_luma_deadzone[1] = 6;
+ param->rc.f_qcompress = 0.8;
+ psy_tuning_used = 1;
+ }
+ else if( !strncasecmp( s, "psnr", 4 ) )
+ {
+ if( psy_tuning_used ) goto psy_failure;
+ param->rc.i_aq_mode = X264_AQ_NONE;
+ param->analyse.b_psy = 0;
+ psy_tuning_used = 1;
+ }
+ else if( !strncasecmp( s, "ssim", 4 ) )
+ {
+ if( psy_tuning_used ) goto psy_failure;
+ param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
+ param->analyse.b_psy = 0;
+ psy_tuning_used = 1;
+ }
+ else if( !strncasecmp( s, "fastdecode", 10 ) )
+ {
+ param->b_deblocking_filter = 0;
+ param->b_cabac = 0;
+ param->analyse.b_weighted_bipred = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ }
+ else if( !strncasecmp( s, "zerolatency", 11 ) )
+ {
+ param->rc.i_lookahead = 0;
+ param->i_sync_lookahead = 0;
+ param->i_bframe = 0;
+ param->b_sliced_threads = 1;
+ }
+ else if( !strncasecmp( s, "touhou", 6 ) )
+ {
+ if( psy_tuning_used ) goto psy_failure;
+ param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
+ param->i_deblocking_filter_alphac0 = -1;
+ param->i_deblocking_filter_beta = -1;
+ param->analyse.f_psy_trellis = 0.2;
+ param->rc.f_aq_strength = 1.3;
+ if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
+ param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+ psy_tuning_used = 1;
+ }
+ else
+ {
+ fprintf( stderr, "x264 [error]: invalid tune '%s'\n", s );
+ return -1;
+ }
+ if( 0 )
+ {
+psy_failure:
+ fprintf( stderr, "x264 [warning]: only 1 psy tuning can be used: ignoring tune %s\n", s );
+ }
+ s = strtok( NULL, ",./-+" );
+ }
+ }
+ else if( c == '?' )
+ return -1;
+ }
/* Parse command line options */
- for( ;; )
+ for( optind = 0;; )
{
int b_error = 0;
int long_options_index = -1;
-#define OPT_FRAMES 256
-#define OPT_SEEK 257
-#define OPT_QPFILE 258
-#define OPT_THREAD_INPUT 259
-#define OPT_QUIET 260
-#define OPT_PROGRESS 261
-#define OPT_VISUALIZE 262
-#define OPT_LONGHELP 263
-
- static struct option long_options[] =
- {
- { "help", no_argument, NULL, 'h' },
- { "longhelp",no_argument, NULL, OPT_LONGHELP },
- { "version", no_argument, NULL, 'V' },
- { "bitrate", required_argument, NULL, 'B' },
- { "bframes", required_argument, NULL, 'b' },
- { "b-adapt", required_argument, NULL, 0 },
- { "no-b-adapt", no_argument, NULL, 0 },
- { "b-bias", required_argument, NULL, 0 },
- { "b-pyramid", no_argument, NULL, 0 },
- { "min-keyint",required_argument,NULL,'i' },
- { "keyint", required_argument, NULL, 'I' },
- { "scenecut",required_argument, NULL, 0 },
- { "no-scenecut",no_argument, NULL, 0 },
- { "nf", no_argument, NULL, 0 },
- { "no-deblock", no_argument, NULL, 0 },
- { "filter", required_argument, NULL, 0 },
- { "deblock", required_argument, NULL, 'f' },
- { "interlaced", no_argument, NULL, 0 },
- { "no-cabac",no_argument, NULL, 0 },
- { "qp", required_argument, NULL, 'q' },
- { "qpmin", required_argument, NULL, 0 },
- { "qpmax", required_argument, NULL, 0 },
- { "qpstep", required_argument, NULL, 0 },
- { "crf", required_argument, NULL, 0 },
- { "ref", required_argument, NULL, 'r' },
- { "asm", required_argument, NULL, 0 },
- { "no-asm", no_argument, NULL, 0 },
- { "sar", required_argument, NULL, 0 },
- { "fps", required_argument, NULL, 0 },
- { "frames", required_argument, NULL, OPT_FRAMES },
- { "seek", required_argument, NULL, OPT_SEEK },
- { "output", required_argument, NULL, 'o' },
- { "analyse", required_argument, NULL, 0 },
- { "partitions", required_argument, NULL, 'A' },
- { "direct", required_argument, NULL, 0 },
- { "weightb", no_argument, NULL, 'w' },
- { "me", required_argument, NULL, 0 },
- { "merange", required_argument, NULL, 0 },
- { "mvrange", required_argument, NULL, 0 },
- { "mvrange-thread", required_argument, NULL, 0 },
- { "subme", required_argument, NULL, 'm' },
- { "psy-rd", required_argument, NULL, 0 },
- { "mixed-refs", no_argument, NULL, 0 },
- { "no-chroma-me", no_argument, NULL, 0 },
- { "8x8dct", no_argument, NULL, '8' },
- { "trellis", required_argument, NULL, 't' },
- { "no-fast-pskip", no_argument, NULL, 0 },
- { "no-dct-decimate", no_argument, NULL, 0 },
- { "aq-strength", required_argument, NULL, 0 },
- { "aq-mode", required_argument, NULL, 0 },
- { "deadzone-inter", required_argument, NULL, '0' },
- { "deadzone-intra", required_argument, NULL, '0' },
- { "level", required_argument, NULL, 0 },
- { "ratetol", required_argument, NULL, 0 },
- { "vbv-maxrate", required_argument, NULL, 0 },
- { "vbv-bufsize", required_argument, NULL, 0 },
- { "vbv-init", required_argument,NULL, 0 },
- { "ipratio", required_argument, NULL, 0 },
- { "pbratio", required_argument, NULL, 0 },
- { "chroma-qp-offset", required_argument, NULL, 0 },
- { "pass", required_argument, NULL, 'p' },
- { "stats", required_argument, NULL, 0 },
- { "qcomp", required_argument, NULL, 0 },
- { "qblur", required_argument, NULL, 0 },
- { "cplxblur",required_argument, NULL, 0 },
- { "zones", required_argument, NULL, 0 },
- { "qpfile", required_argument, NULL, OPT_QPFILE },
- { "threads", required_argument, NULL, 0 },
- { "thread-input", no_argument, NULL, OPT_THREAD_INPUT },
- { "non-deterministic", no_argument, NULL, 0 },
- { "no-psnr", no_argument, NULL, 0 },
- { "no-ssim", no_argument, NULL, 0 },
- { "quiet", no_argument, NULL, OPT_QUIET },
- { "verbose", no_argument, NULL, 'v' },
- { "progress",no_argument, NULL, OPT_PROGRESS },
- { "visualize",no_argument, NULL, OPT_VISUALIZE },
- { "dump-yuv",required_argument, NULL, 0 },
- { "sps-id", required_argument, NULL, 0 },
- { "aud", no_argument, NULL, 0 },
- { "nr", required_argument, NULL, 0 },
- { "cqm", required_argument, NULL, 0 },
- { "cqmfile", required_argument, NULL, 0 },
- { "cqm4", required_argument, NULL, 0 },
- { "cqm4i", required_argument, NULL, 0 },
- { "cqm4iy", required_argument, NULL, 0 },
- { "cqm4ic", required_argument, NULL, 0 },
- { "cqm4p", required_argument, NULL, 0 },
- { "cqm4py", required_argument, NULL, 0 },
- { "cqm4pc", required_argument, NULL, 0 },
- { "cqm8", required_argument, NULL, 0 },
- { "cqm8i", required_argument, NULL, 0 },
- { "cqm8p", required_argument, NULL, 0 },
- { "overscan", required_argument, NULL, 0 },
- { "videoformat", required_argument, NULL, 0 },
- { "fullrange", required_argument, NULL, 0 },
- { "colorprim", required_argument, NULL, 0 },
- { "transfer", required_argument, NULL, 0 },
- { "colormatrix", required_argument, NULL, 0 },
- { "chromaloc", required_argument, NULL, 0 },
- {0, 0, 0, 0}
- };
-
- int c = getopt_long( argc, argv, "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw",
- long_options, &long_options_index);
+ int c = getopt_long( argc, argv, short_options, long_options, &long_options_index );
if( c == -1 )
{
@@ -502,6 +1055,9 @@ static int Parse( int argc, char **argv,
case OPT_LONGHELP:
Help( &defaults, 1 );
exit(0);
+ case OPT_FULLHELP:
+ Help( &defaults, 2 );
+ exit(0);
case 'V':
#ifdef X264_POINTVER
printf( "x264 "X264_POINTVER"\n" );
@@ -516,46 +1072,48 @@ static int Parse( int argc, char **argv,
#endif
exit(0);
case OPT_FRAMES:
- param->i_frame_total = atoi( optarg );
+ param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
break;
case OPT_SEEK:
- opt->i_seek = atoi( optarg );
+ opt->i_seek = input_opt.seek = X264_MAX( atoi( optarg ), 0 );
break;
case 'o':
- if( !strncasecmp(optarg + strlen(optarg) - 4, ".mp4", 4) )
+ output_filename = optarg;
+ break;
+ case OPT_MUXER:
+ for( i = 0; muxer_names[i] && strcasecmp( muxer_names[i], optarg ); )
+ i++;
+ if( !muxer_names[i] )
{
-#ifdef MP4_OUTPUT
- p_open_outfile = open_file_mp4;
- p_write_nalu = write_nalu_mp4;
- p_set_outfile_param = set_param_mp4;
- p_set_eop = set_eop_mp4;
- p_close_outfile = close_file_mp4;
-#else
- fprintf( stderr, "x264 [error]: not compiled with MP4 output support\n" );
+ fprintf( stderr, "x264 [error]: invalid muxer '%s'\n", optarg );
return -1;
-#endif
- }
- else if( !strncasecmp(optarg + strlen(optarg) - 4, ".mkv", 4) )
- {
- p_open_outfile = open_file_mkv;
- p_write_nalu = write_nalu_mkv;
- p_set_outfile_param = set_param_mkv;
- p_set_eop = set_eop_mkv;
- p_close_outfile = close_file_mkv;
}
- if( !strcmp(optarg, "-") )
- opt->hout = stdout;
- else if( p_open_outfile( optarg, &opt->hout ) )
+ muxer = optarg;
+ break;
+ case OPT_DEMUXER:
+ for( i = 0; demuxer_names[i] && strcasecmp( demuxer_names[i], optarg ); )
+ i++;
+ if( !demuxer_names[i] )
{
- fprintf( stderr, "x264 [error]: can't open output file `%s'\n", optarg );
+ fprintf( stderr, "x264 [error]: invalid demuxer '%s'\n", optarg );
return -1;
}
+ demuxer = optarg;
+ break;
+ case OPT_INDEX:
+ input_opt.index = optarg;
break;
case OPT_QPFILE:
- opt->qpfile = fopen( optarg, "r" );
+ opt->qpfile = fopen( optarg, "rb" );
if( !opt->qpfile )
{
- fprintf( stderr, "x264 [error]: can't open `%s'\n", optarg );
+ fprintf( stderr, "x264 [error]: can't open qpfile `%s'\n", optarg );
+ return -1;
+ }
+ else if( !x264_is_regular_file( opt->qpfile ) )
+ {
+ fprintf( stderr, "x264 [error]: qpfile incompatible with non-regular file `%s'\n", optarg );
+ fclose( opt->qpfile );
return -1;
}
break;
@@ -568,8 +1126,8 @@ static int Parse( int argc, char **argv,
case 'v':
param->i_log_level = X264_LOG_DEBUG;
break;
- case OPT_PROGRESS:
- opt->b_progress = 1;
+ case OPT_NOPROGRESS:
+ opt->b_progress = 0;
break;
case OPT_VISUALIZE:
#ifdef VISUALIZE
@@ -579,7 +1137,30 @@ static int Parse( int argc, char **argv,
fprintf( stderr, "x264 [warning]: not compiled with visualization support\n" );
#endif
break;
+ case OPT_TUNE:
+ case OPT_PRESET:
+ break;
+ case OPT_PROFILE:
+ profile = optarg;
+ break;
+ case OPT_SLOWFIRSTPASS:
+ b_turbo = 0;
+ break;
+ case 'r':
+ b_user_ref = 1;
+ goto generic_option;
+ case 'p':
+ b_pass1 = atoi( optarg ) == 1;
+ goto generic_option;
+ case OPT_FPS:
+ b_user_fps = 1;
+ param->b_vfr_input = 0;
+ goto generic_option;
+ case OPT_INTERLACED:
+ b_user_interlaced = 1;
+ goto generic_option;
default:
+generic_option:
{
int i;
if( long_options_index < 0 )
@@ -609,101 +1190,167 @@ static int Parse( int argc, char **argv,
}
}
- /* Get the file name */
- if( optind > argc - 1 || !opt->hout )
+ /* Set faster options in case of turbo firstpass. */
+ if( b_turbo && b_pass1 )
{
- fprintf( stderr, "x264 [error]: No %s file. Run x264 --help for a list of options.\n",
- optind > argc - 1 ? "input" : "output" );
- return -1;
+ param->i_frame_reference = 1;
+ param->analyse.b_transform_8x8 = 0;
+ param->analyse.inter = 0;
+ param->analyse.i_me_method = X264_ME_DIA;
+ param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
+ param->analyse.i_trellis = 0;
}
- psz_filename = argv[optind++];
-
- /* check demuxer type */
- psz = psz_filename + strlen(psz_filename) - 1;
- while( psz > psz_filename && *psz != '.' )
- psz--;
- if( !strncasecmp( psz, ".avi", 4 ) || !strncasecmp( psz, ".avs", 4 ) )
- b_avis = 1;
- if( !strncasecmp( psz, ".y4m", 4 ) )
- b_y4m = 1;
- if( !(b_avis || b_y4m) ) // raw yuv
+ /* Apply profile restrictions. */
+ if( profile )
{
- if( optind > argc - 1 )
+ if( !strcasecmp( profile, "baseline" ) )
{
- /* try to parse the file name */
- for( psz = psz_filename; *psz; psz++ )
+ param->analyse.b_transform_8x8 = 0;
+ param->b_cabac = 0;
+ param->i_cqm_preset = X264_CQM_FLAT;
+ param->i_bframe = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ if( param->b_interlaced )
{
- if( *psz >= '0' && *psz <= '9'
- && sscanf( psz, "%ux%u", ¶m->i_width, ¶m->i_height ) == 2 )
- {
- if( param->i_log_level >= X264_LOG_INFO )
- fprintf( stderr, "x264 [info]: %dx%d (given by file name) @ %.2f fps\n", param->i_width, param->i_height, (double)param->i_fps_num / (double)param->i_fps_den);
- break;
- }
+ fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
+ return -1;
}
}
+ else if( !strcasecmp( profile, "main" ) )
+ {
+ param->analyse.b_transform_8x8 = 0;
+ param->i_cqm_preset = X264_CQM_FLAT;
+ }
+ else if( !strcasecmp( profile, "high" ) )
+ {
+ /* Default */
+ }
else
{
- sscanf( argv[optind++], "%ux%u", ¶m->i_width, ¶m->i_height );
- if( param->i_log_level >= X264_LOG_INFO )
- fprintf( stderr, "x264 [info]: %dx%d @ %.2f fps\n", param->i_width, param->i_height, (double)param->i_fps_num / (double)param->i_fps_den);
+ fprintf( stderr, "x264 [error]: invalid profile: %s\n", profile );
+ return -1;
+ }
+ if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0) ||
+ (param->rc.i_rc_method == X264_RC_CRF && param->rc.f_rf_constant == 0) )
+ {
+ fprintf( stderr, "x264 [error]: %s profile doesn't support lossless\n", profile );
+ return -1;
}
}
- if( !(b_avis || b_y4m) && ( !param->i_width || !param->i_height ) )
+ /* Get the file name */
+ if( optind > argc - 1 || !output_filename )
{
- fprintf( stderr, "x264 [error]: Rawyuv input requires a resolution.\n" );
+ fprintf( stderr, "x264 [error]: No %s file. Run x264 --help for a list of options.\n",
+ optind > argc - 1 ? "input" : "output" );
return -1;
}
- /* open the input */
+ if( select_output( muxer, output_filename, param ) )
+ return -1;
+ if( output.open_file( output_filename, &opt->hout ) )
{
- if( b_avis )
- {
-#ifdef AVIS_INPUT
- p_open_infile = open_file_avis;
- p_get_frame_total = get_frame_total_avis;
- p_read_frame = read_frame_avis;
- p_close_infile = close_file_avis;
-#else
- fprintf( stderr, "x264 [error]: not compiled with AVIS input support\n" );
- return -1;
-#endif
- }
- if ( b_y4m )
- {
- p_open_infile = open_file_y4m;
- p_get_frame_total = get_frame_total_y4m;
- p_read_frame = read_frame_y4m;
- p_close_infile = close_file_y4m;
- }
+ fprintf( stderr, "x264 [error]: could not open output file `%s'\n", output_filename );
+ return -1;
+ }
- if( p_open_infile( psz_filename, &opt->hin, param ) )
- {
- fprintf( stderr, "x264 [error]: could not open input file '%s'\n", psz_filename );
- return -1;
- }
+ input_filename = argv[optind++];
+ input_opt.resolution = optind < argc ? argv[optind++] : NULL;
+ video_info_t info = {0};
+ char demuxername[5];
+
+ /* set info flags to param flags to be overwritten by demuxer as necessary. */
+ info.csp = param->i_csp;
+ info.fps_num = param->i_fps_num;
+ info.fps_den = param->i_fps_den;
+ info.interlaced = param->b_interlaced;
+ info.sar_width = param->vui.i_sar_width;
+ info.sar_height = param->vui.i_sar_height;
+ info.vfr = param->b_vfr_input;
+
+ if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
+ return -1;
+
+ if( !opt->hin && input.open_file( input_filename, &opt->hin, &info, &input_opt ) )
+ {
+ fprintf( stderr, "x264 [error]: could not open input file `%s'\n", input_filename );
+ return -1;
+ }
+
+ x264_reduce_fraction( &info.sar_width, &info.sar_height );
+ x264_reduce_fraction( &info.fps_num, &info.fps_den );
+ if( param->i_log_level >= X264_LOG_INFO )
+ fprintf( stderr, "%s [info]: %dx%d%c %d:%d @ %d/%d fps (%cfr)\n", demuxername, info.width,
+ info.height, info.interlaced ? 'i' : 'p', info.sar_width, info.sar_height,
+ info.fps_num, info.fps_den, info.vfr ? 'v' : 'c' );
+
+ /* set param flags from the info flags as necessary */
+ param->i_csp = info.csp;
+ param->i_height = info.height;
+ param->b_vfr_input = info.vfr;
+ param->i_width = info.width;
+ if( !b_user_interlaced && info.interlaced )
+ {
+ fprintf( stderr, "x264 [warning]: input appears to be interlaced, enabling interlaced mode.\n"
+ " If you want otherwise, use --no-interlaced\n" );
+ param->b_interlaced = 1;
+ }
+ if( !b_user_fps )
+ {
+ param->i_fps_num = info.fps_num;
+ param->i_fps_den = info.fps_den;
+ }
+ if( param->b_vfr_input )
+ {
+ param->i_timebase_num = info.timebase_num;
+ param->i_timebase_den = info.timebase_den;
+ }
+ else
+ {
+ param->i_timebase_den = param->i_fps_num;
+ param->i_timebase_num = param->i_fps_den;
+ }
+ if( !param->vui.i_sar_width || !param->vui.i_sar_height )
+ {
+ param->vui.i_sar_width = info.sar_width;
+ param->vui.i_sar_height = info.sar_height;
}
#ifdef HAVE_PTHREAD
if( b_thread_input || param->i_threads > 1
- || (param->i_threads == 0 && x264_cpu_num_processors() > 1) )
+ || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1) )
{
- if( open_file_thread( NULL, &opt->hin, param ) )
+ if( thread_input.open_file( NULL, &opt->hin, &info, NULL ) )
{
- fprintf( stderr, "x264 [warning]: threaded input failed\n" );
+ fprintf( stderr, "x264 [error]: threaded input failed\n" );
+ return -1;
}
else
- {
- p_open_infile = open_file_thread;
- p_get_frame_total = get_frame_total_thread;
- p_read_frame = read_frame_thread;
- p_close_infile = close_file_thread;
- }
+ input = thread_input;
}
#endif
+
+ /* Automatically reduce reference frame count to match the user's target level
+ * if the user didn't explicitly set a reference frame count. */
+ if( !b_user_ref )
+ {
+ int mbs = (((param->i_width)+15)>>4) * (((param->i_height)+15)>>4);
+ int i;
+ for( i = 0; x264_levels[i].level_idc != 0; i++ )
+ if( param->i_level_idc == x264_levels[i].level_idc )
+ {
+ while( mbs * 384 * param->i_frame_reference > x264_levels[i].dpb
+ && param->i_frame_reference > 1 )
+ {
+ param->i_frame_reference--;
+ }
+ break;
+ }
+ }
+
+
return 0;
}
@@ -716,14 +1363,14 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
{
file_pos = ftell( opt->qpfile );
ret = fscanf( opt->qpfile, "%d %c %d\n", &num, &type, &qp );
- if( num > i_frame || ret == EOF )
- {
- pic->i_type = X264_TYPE_AUTO;
- pic->i_qpplus1 = 0;
- fseek( opt->qpfile , file_pos , SEEK_SET );
- break;
- }
- if( num < i_frame )
+ if( num > i_frame || ret == EOF )
+ {
+ pic->i_type = X264_TYPE_AUTO;
+ pic->i_qpplus1 = 0;
+ fseek( opt->qpfile, file_pos, SEEK_SET );
+ break;
+ }
+ if( num < i_frame && ret == 3 )
continue;
pic->i_qpplus1 = qp+1;
if ( type == 'I' ) pic->i_type = X264_TYPE_IDR;
@@ -748,37 +1395,50 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
* Encode:
*****************************************************************************/
-static int Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic )
+static int Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_pts )
{
x264_picture_t pic_out;
x264_nal_t *nal;
- int i_nal, i;
- int i_file = 0;
+ int i_nal;
+ int i_frame_size = 0;
+
+ i_frame_size = x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out );
- if( x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out ) < 0 )
+ if( i_frame_size < 0 )
{
fprintf( stderr, "x264 [error]: x264_encoder_encode failed\n" );
+ return -1;
}
- for( i = 0; i < i_nal; i++ )
+ if( i_frame_size )
{
- int i_size;
+ i_frame_size = output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
+ *last_pts = pic_out.i_pts;
+ }
- if( mux_buffer_size < nal[i].i_payload * 3/2 + 4 )
- {
- mux_buffer_size = nal[i].i_payload * 2 + 4;
- x264_free( mux_buffer );
- mux_buffer = x264_malloc( mux_buffer_size );
- }
+ return i_frame_size;
+}
- i_size = mux_buffer_size;
- x264_nal_encode( mux_buffer, &i_size, 1, &nal[i] );
- i_file += p_write_nalu( hout, mux_buffer, i_size );
+static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_pts )
+{
+ char buf[200];
+ int64_t i_elapsed = x264_mdate() - i_start;
+ double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
+ double bitrate = (double) i_file * 8 / ( (double) last_pts * 1000 * param->i_timebase_num / param->i_timebase_den );
+ if( i_frame_total )
+ {
+ int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
+ sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
+ 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
+ eta/3600, (eta/60)%60, eta%60 );
}
- if (i_nal)
- p_set_eop( hout, &pic_out );
-
- return i_file;
+ else
+ {
+ sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
+ }
+ fprintf( stderr, "%s \r", buf+5 );
+ SetConsoleTitle( buf );
+ fflush( stderr ); // needed in windows
}
static int Encode( x264_param_t *param, cli_opt_t *opt )
@@ -786,16 +1446,24 @@ static int Encode( x264_param_t *param, cli_opt_t *opt )
x264_t *h;
x264_picture_t pic;
- int i_frame, i_frame_total;
+ int i_frame, i_frame_total, i_frame_output;
int64_t i_start, i_end;
- int64_t i_file;
+ int64_t i_file = 0;
int i_frame_size;
int i_update_interval;
- char buf[200];
+ int64_t last_pts = 0;
+# define MAX_PTS_WARNING 3 /* arbitrary */
+ int pts_warning_cnt = 0;
+ int64_t largest_pts = -1;
+ int64_t second_largest_pts = -1;
+ int64_t ticks_per_frame;
+ double duration;
+ int prev_timebase_den = param->i_timebase_den / gcd( param->i_timebase_num, param->i_timebase_den );
+ int dts_compress_multiplier;
opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;
- i_frame_total = p_get_frame_total( opt->hin );
- i_frame_total -= opt->i_seek;
+ i_frame_total = input.get_frame_total( opt->hin );
+ i_frame_total = X264_MAX( i_frame_total - opt->i_seek, 0 );
if( ( i_frame_total == 0 || param->i_frame_total < i_frame_total )
&& param->i_frame_total > 0 )
i_frame_total = param->i_frame_total;
@@ -805,30 +1473,77 @@ static int Encode( x264_param_t *param, cli_opt_t *opt )
if( ( h = x264_encoder_open( param ) ) == NULL )
{
fprintf( stderr, "x264 [error]: x264_encoder_open failed\n" );
- p_close_infile( opt->hin );
+ input.close_file( opt->hin );
return -1;
}
- if( p_set_outfile_param( opt->hout, param ) )
+ x264_encoder_parameters( h, param );
+
+ dts_compress_multiplier = param->i_timebase_den / prev_timebase_den;
+
+ if( output.set_param( opt->hout, param ) )
{
fprintf( stderr, "x264 [error]: can't set outfile param\n" );
- p_close_infile( opt->hin );
- p_close_outfile( opt->hout );
+ input.close_file( opt->hin );
+ output.close_file( opt->hout, largest_pts, second_largest_pts );
return -1;
}
/* Create a new pic */
- x264_picture_alloc( &pic, X264_CSP_I420, param->i_width, param->i_height );
+ if( input.picture_alloc( &pic, param->i_csp, param->i_width, param->i_height ) )
+ {
+ fprintf( stderr, "x264 [error]: malloc failed\n" );
+ return -1;
+ }
i_start = x264_mdate();
+ /* ticks/frame = ticks/second / frames/second */
+ ticks_per_frame = (int64_t)param->i_timebase_den * param->i_fps_den / param->i_timebase_num / param->i_fps_num;
+ if( ticks_per_frame < 1 )
+ {
+ fprintf( stderr, "x264 [error]: ticks_per_frame invalid: %"PRId64"\n", ticks_per_frame );
+ return -1;
+ }
+
+ if( !param->b_repeat_headers )
+ {
+ // Write SPS/PPS/SEI
+ x264_nal_t *headers;
+ int i_nal;
+
+ if( x264_encoder_headers( h, &headers, &i_nal ) < 0 )
+ {
+ fprintf( stderr, "x264 [error]: x264_encoder_headers failed\n" );
+ return -1;
+ }
+
+ if( (i_file = output.write_headers( opt->hout, headers )) < 0 )
+ return -1;
+ }
/* Encode frames */
- for( i_frame = 0, i_file = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
+ for( i_frame = 0, i_frame_output = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
{
- if( p_read_frame( &pic, opt->hin, i_frame + opt->i_seek ) )
+ if( input.read_frame( &pic, opt->hin, i_frame + opt->i_seek ) )
break;
- pic.i_pts = (int64_t)i_frame * param->i_fps_den;
+ if( !param->b_vfr_input )
+ pic.i_pts = i_frame;
+ if( pic.i_pts <= largest_pts )
+ {
+ if( param->i_log_level >= X264_LOG_WARNING )
+ {
+ if( param->i_log_level >= X264_LOG_DEBUG || pts_warning_cnt < MAX_PTS_WARNING )
+ fprintf( stderr, "x264 [warning]: non-strictly-monotonic pts at frame %d (%"PRId64" <= %"PRId64")\n",
+ i_frame, pic.i_pts * dts_compress_multiplier, largest_pts * dts_compress_multiplier );
+ else if( pts_warning_cnt == MAX_PTS_WARNING )
+ fprintf( stderr, "x264 [warning]: too many nonmonotonic pts warnings, suppressing further ones\n" );
+ pts_warning_cnt++;
+ }
+ pic.i_pts = largest_pts + ticks_per_frame;
+ }
+ second_largest_pts = largest_pts;
+ largest_pts = pic.i_pts;
if( opt->qpfile )
parse_qpfile( opt, &pic, i_frame + opt->i_seek );
@@ -839,61 +1554,65 @@ static int Encode( x264_param_t *param, cli_opt_t *opt )
pic.i_qpplus1 = 0;
}
- i_file += Encode_frame( h, opt->hout, &pic );
+ i_frame_size = Encode_frame( h, opt->hout, &pic, &last_pts );
+ if( i_frame_size < 0 )
+ return -1;
+ i_file += i_frame_size;
+ if( i_frame_size )
+ i_frame_output++;
i_frame++;
+ if( input.release_frame && input.release_frame( &pic, opt->hin ) )
+ break;
+
/* update status line (up to 1000 times per input file) */
- if( opt->b_progress && i_frame % i_update_interval == 0 )
- {
- int64_t i_elapsed = x264_mdate() - i_start;
- double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
- double bitrate = (double) i_file * 8 * param->i_fps_num / ( (double) param->i_fps_den * i_frame * 1000 );
- if( i_frame_total )
- {
- int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
- sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
- 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
- eta/3600, (eta/60)%60, eta%60 );
- }
- else
- {
- sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
- }
- fprintf( stderr, "%s \r", buf+5 );
- SetConsoleTitle( buf );
- fflush( stderr ); // needed in windows
- }
+ if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
+ Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
}
- /* Flush delayed B-frames */
- do {
- i_file +=
- i_frame_size = Encode_frame( h, opt->hout, NULL );
- } while( i_frame_size );
+ /* Flush delayed frames */
+ while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
+ {
+ i_frame_size = Encode_frame( h, opt->hout, NULL, &last_pts );
+ if( i_frame_size < 0 )
+ return -1;
+ i_file += i_frame_size;
+ if( i_frame_size )
+ i_frame_output++;
+ if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
+ Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+ }
+ if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
+ fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
+
+ /* duration algorithm fails when only 1 frame is output */
+ if( i_frame_output == 1 )
+ duration = (double)param->i_fps_den / param->i_fps_num;
+ else
+ duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;
+ duration *= dts_compress_multiplier;
i_end = x264_mdate();
- x264_picture_clean( &pic );
+ input.picture_clean( &pic );
/* Erase progress indicator before printing encoding stats. */
if( opt->b_progress )
fprintf( stderr, " \r" );
x264_encoder_close( h );
- x264_free( mux_buffer );
fprintf( stderr, "\n" );
if( b_ctrl_c )
- fprintf( stderr, "aborted at input frame %d\n", opt->i_seek + i_frame );
+ fprintf( stderr, "aborted at input frame %d, output frame %d\n", opt->i_seek + i_frame, i_frame_output );
- p_close_infile( opt->hin );
- p_close_outfile( opt->hout );
+ input.close_file( opt->hin );
+ output.close_file( opt->hout, largest_pts, second_largest_pts );
- if( i_frame > 0 )
+ if( i_frame_output > 0 )
{
- double fps = (double)i_frame * (double)1000000 /
+ double fps = (double)i_frame_output * (double)1000000 /
(double)( i_end - i_start );
- fprintf( stderr, "encoded %d frames, %.2f fps, %.2f kb/s\n", i_frame, fps,
- (double) i_file * 8 * param->i_fps_num /
- ( (double) param->i_fps_den * i_frame * 1000 ) );
+ fprintf( stderr, "encoded %d frames, %.2f fps, %.2f kb/s\n", i_frame_output, fps,
+ (double) i_file * 8 / ( 1000 * duration ) );
}
return 0;
diff --git a/x264.h b/x264.h
index 26ac421..2550864 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
#include <stdarg.h>
-#define X264_BUILD 67
+#define X264_BUILD 84
/* x264_t:
* opaque handler for encoder */
@@ -63,6 +63,9 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6 0x020000
+#define X264_CPU_NEON 0x040000 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
/* Analyse flags
*/
@@ -83,18 +86,25 @@ typedef struct x264_t x264_t;
#define X264_CQM_FLAT 0
#define X264_CQM_JVT 1
#define X264_CQM_CUSTOM 2
-#define X264_RC_NONE -1
#define X264_RC_CQP 0
#define X264_RC_CRF 1
#define X264_RC_ABR 2
#define X264_AQ_NONE 0
#define X264_AQ_VARIANCE 1
+#define X264_AQ_AUTOVARIANCE 2
#define X264_B_ADAPT_NONE 0
#define X264_B_ADAPT_FAST 1
#define X264_B_ADAPT_TRELLIS 2
+#define X264_WEIGHTP_NONE 0
+#define X264_WEIGHTP_BLIND 1
+#define X264_WEIGHTP_SMART 2
+#define X264_B_PYRAMID_NONE 0
+#define X264_B_PYRAMID_STRICT 1
+#define X264_B_PYRAMID_NORMAL 2
static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
+static const char * const x264_b_pyramid_names[] = { "none", "strict", "normal", 0 };
static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 };
static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
static const char * const x264_fullrange_names[] = { "off", "on", 0 };
@@ -103,8 +113,7 @@ static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "b
static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 };
/* Colorspace type
- * legacy only; nothing other than I420 is really supported.
- */
+ * legacy only; nothing other than I420 is really supported. */
#define X264_CSP_MASK 0x00ff /* */
#define X264_CSP_NONE 0x0000 /* Invalid mode */
#define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */
@@ -118,8 +127,7 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
#define X264_CSP_MAX 0x0009 /* end of list */
#define X264_CSP_VFLIP 0x1000 /* */
-/* Slice type
- */
+/* Slice type */
#define X264_TYPE_AUTO 0x0000 /* Let x264 choose the right type */
#define X264_TYPE_IDR 0x0001
#define X264_TYPE_I 0x0002
@@ -129,14 +137,17 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR)
#define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF)
-/* Log level
- */
+/* Log level */
#define X264_LOG_NONE (-1)
#define X264_LOG_ERROR 0
#define X264_LOG_WARNING 1
#define X264_LOG_INFO 2
#define X264_LOG_DEBUG 3
+/* Threading */
+#define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */
+#define X264_SYNC_LOOKAHEAD_AUTO (-1) /* Automatically select optimal lookahead thread buffer size */
+
/* Zones: override ratecontrol or other options for specific sections of the video.
* See x264_encoder_reconfig() for which options can be changed.
* If zones overlap, whichever comes later in the list takes precedence. */
@@ -154,7 +165,9 @@ typedef struct x264_param_t
/* CPU flags */
unsigned int cpu;
int i_threads; /* encode multiple frames in parallel */
+ int b_sliced_threads; /* Whether to use slice-based threading. */
int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
+ int i_sync_lookahead; /* threaded lookahead buffer */
/* Video Properties */
int i_width;
@@ -188,10 +201,12 @@ typedef struct x264_param_t
int i_keyint_max; /* Force an IDR keyframe at this interval */
int i_keyint_min; /* Scenecuts closer together than this are coded as I, not IDR. */
int i_scenecut_threshold; /* how aggressively to insert extra I frames */
+ int b_intra_refresh; /* Whether or not to use periodic intra refresh instead of IDR frames. */
+
int i_bframe; /* how many b-frame between 2 references pictures */
int i_bframe_adaptive;
int i_bframe_bias;
- int b_bframe_pyramid; /* Keep some B-frames as references */
+ int i_bframe_pyramid; /* Keep some B-frames as references: 0=off, 1=strict hierarchical, 2=normal */
int b_deblocking_filter;
int i_deblocking_filter_alphac0; /* [-6, 6] -6 light filter, 6 strong */
@@ -201,6 +216,7 @@ typedef struct x264_param_t
int i_cabac_init_idc;
int b_interlaced;
+ int b_constrained_intra;
int i_cqm_preset;
char *psz_cqm_file; /* JM format */
@@ -225,6 +241,7 @@ typedef struct x264_param_t
unsigned int inter; /* inter partitions */
int b_transform_8x8;
+ int i_weighted_pred; /* weighting for P-frames */
int b_weighted_bipred; /* implicit weighting for B-frames */
int i_direct_mv_pred; /* spatial vs temporal mv prediction */
int i_chroma_qp_offset;
@@ -242,6 +259,7 @@ typedef struct x264_param_t
int i_noise_reduction; /* adaptive pseudo-deadzone */
float f_psy_rd; /* Psy RD strength */
float f_psy_trellis; /* Psy trellis strength */
+ int b_psy; /* Toggle all psy optimizations */
/* the deadzone size that will be used in luma quantization */
int i_luma_deadzone[2]; /* {inter, intra} */
@@ -271,6 +289,8 @@ typedef struct x264_param_t
int i_aq_mode; /* psy adaptive QP. (X264_AQ_*) */
float f_aq_strength;
+ int b_mb_tree; /* Macroblock-tree ratecontrol. */
+ int i_lookahead;
/* 2pass */
int b_stat_write; /* Enable stat writing in psz_stat_out */
@@ -290,7 +310,26 @@ typedef struct x264_param_t
/* Muxing parameters */
int b_aud; /* generate access unit delimiters */
int b_repeat_headers; /* put SPS/PPS before each keyframe */
+ int b_annexb; /* if set, place start codes (4 bytes) before NAL units,
+ * otherwise place size (4 bytes) before NAL units. */
int i_sps_id; /* SPS and PPS id number */
+ int b_vfr_input; /* VFR input */
+ int i_timebase_num; /* Timebase numerator */
+ int i_timebase_den; /* Timebase denominator */
+ int b_dts_compress; /* DTS compression: this algorithm eliminates negative DTS
+ * by compressing them to be less than the second PTS.
+ * Warning: this will change the timebase! */
+
+ /* Slicing parameters */
+ int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */
+ int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */
+ int i_slice_count; /* Number of slices per frame: forces rectangular slices. */
+
+ /* Optional callback for freeing this x264_param_t when it is done being used.
+ * Only used when the x264_param_t sits in memory for an indefinite period of time,
+ * i.e. when an x264_param_t is passed to x264_t in an x264_picture_t or in zones.
+ * Not used when x264_encoder_reconfig is called directly. */
+ void (*param_free)( void* );
} x264_param_t;
typedef struct {
@@ -349,16 +388,32 @@ typedef struct
int i_type;
/* In: force quantizer for > 0 */
int i_qpplus1;
+ /* Out: whether this frame is a keyframe. Important when using modes that result in
+ * SEI recovery points being used instead of IDR frames. */
+ int b_keyframe;
/* In: user pts, Out: pts of encoded picture (user)*/
int64_t i_pts;
-
+ /* Out: frame dts. Since the pts of the first frame is always zero,
+ * initial frames may have a negative dts which must be dealt with by any muxer */
+ int64_t i_dts;
+ /* In: custom encoding parameters to be set from this frame forwards
+ (in coded order, not display order). If NULL, continue using
+ parameters from the previous frame. Some parameters, such as
+ aspect ratio, can only be changed per-GOP due to the limitations
+ of H.264 itself; in this case, the caller must force an IDR frame
+ if it needs the changed parameter to apply immediately. */
+ x264_param_t *param;
/* In: raw data */
x264_image_t img;
+ /* private user data. libx264 doesn't touch this,
+ not even copy it from input to output frames. */
+ void *opaque;
} x264_picture_t;
/* x264_picture_alloc:
- * alloc data for a picture. You must call x264_picture_clean on it. */
-void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+ * alloc data for a picture. You must call x264_picture_clean on it.
+ * returns 0 on success, or -1 on malloc failure. */
+int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
/* x264_picture_clean:
* free associated resource for a x264_picture_t allocated with
@@ -391,41 +446,72 @@ enum nal_priority_e
NAL_PRIORITY_HIGHEST = 3,
};
+/* The data within the payload is already NAL-encapsulated; the ref_idc and type
+ * are merely in the struct for easy access by the calling application.
+ * All data returned in an x264_nal_t, including the data in p_payload, is no longer
+ * valid after the next call to x264_encoder_encode. Thus it must be used or copied
+ * before calling x264_encoder_encode or x264_encoder_headers again. */
typedef struct
{
int i_ref_idc; /* nal_priority_e */
int i_type; /* nal_unit_type_e */
- /* This data are raw payload */
+ /* Size of payload in bytes. */
int i_payload;
+ /* If param->b_annexb is set, Annex-B bytestream with 4-byte startcode.
+ * Otherwise, startcode is replaced with a 4-byte size.
+ * This size is the size used in mp4/similar muxing; it is equal to i_payload-4 */
uint8_t *p_payload;
} x264_nal_t;
-/* x264_nal_encode:
- * encode a nal into a buffer, setting the size.
- * if b_annexeb then a long synch work is added
- * XXX: it currently doesn't check for overflow */
-int x264_nal_encode( void *, int *, int b_annexeb, x264_nal_t *nal );
-
/****************************************************************************
* Encoder functions:
****************************************************************************/
+/* Force a link error in the case of linking against an incompatible API version.
+ * Glue #defines exist to force correct macro expansion; the final output of the macro
+ * is x264_encoder_open_##X264_BUILD (for purposes of dlopen). */
+#define x264_encoder_glue1(x,y) x##y
+#define x264_encoder_glue2(x,y) x264_encoder_glue1(x,y)
+#define x264_encoder_open x264_encoder_glue2(x264_encoder_open_,X264_BUILD)
+
/* x264_encoder_open:
* create a new encoder handler, all parameters from x264_param_t are copied */
-x264_t *x264_encoder_open ( x264_param_t * );
+x264_t *x264_encoder_open( x264_param_t * );
+
/* x264_encoder_reconfig:
- * change encoder options while encoding,
- * analysis-related parameters from x264_param_t are copied */
+ * analysis-related parameters from x264_param_t are copied.
+ * this takes effect immediately, on whichever frame is encoded next;
+ * due to delay, this may not be the next frame passed to encoder_encode.
+ * if the change should apply to some particular frame, use x264_picture_t->param instead.
+ * returns 0 on success, negative on parameter validation error. */
int x264_encoder_reconfig( x264_t *, x264_param_t * );
+/* x264_encoder_parameters:
+ * copies the current internal set of parameters to the pointer provided
+ * by the caller. useful when the calling application needs to know
+ * how x264_encoder_open has changed the parameters, or the current state
+ * of the encoder after multiple x264_encoder_reconfig calls.
+ * note that the data accessible through pointers in the returned param struct
+ * (e.g. filenames) should not be modified by the calling application. */
+void x264_encoder_parameters( x264_t *, x264_param_t * );
/* x264_encoder_headers:
- * return the SPS and PPS that will be used for the whole stream */
+ * return the SPS and PPS that will be used for the whole stream.
+ * if i_nal > 0, returns the total size of all NAL payloads.
+ * returns negative on error.
+ * the payloads of all output NALs are guaranteed to be sequential in memory. */
int x264_encoder_headers( x264_t *, x264_nal_t **, int * );
/* x264_encoder_encode:
- * encode one picture */
+ * encode one picture.
+ * if i_nal > 0, returns the total size of all NAL payloads.
+ * returns negative on error, zero if no NAL units returned.
+ * the payloads of all output NALs are guaranteed to be sequential in memory. */
int x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t *, x264_picture_t * );
/* x264_encoder_close:
* close an encoder handler */
void x264_encoder_close ( x264_t * );
+/* x264_encoder_delayed_frames:
+ * return the number of currently delayed (buffered) frames
+ * this should be used at the end of the stream, to know when you have all the encoded frames. */
+int x264_encoder_delayed_frames( x264_t * );
#endif
diff --git a/common/visualize.h b/x264dll.c
similarity index 56%
copy from common/visualize.h
copy to x264dll.c
index b611f6c..2b6524d 100644
--- a/common/visualize.h
+++ b/x264dll.c
@@ -1,7 +1,9 @@
/*****************************************************************************
- * x264: h264 encoder
+ * x264dll: x264 DLLMain for win32
*****************************************************************************
- * Copyright (C) 2005 Tuukka Toivonen <tuukkat at ee.oulu.fi>
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster at narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,14 +20,32 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifndef X264_VISUALIZE_H
-#define X264_VISUALIZE_H
-
#include "common/common.h"
+#include <windows.h>
+
+/* Callback for our DLL so we can initialize pthread */
+BOOL WINAPI DllMain( HANDLE hinstDLL, DWORD fdwReason, LPVOID lpvReserved )
+{
+#ifdef PTW32_STATIC_LIB
+ switch( fdwReason )
+ {
+ case DLL_PROCESS_ATTACH:
+ pthread_win32_process_attach_np();
-void x264_visualize_init( x264_t *h );
-void x264_visualize_mb( x264_t *h );
-void x264_visualize_show( x264_t *h );
-void x264_visualize_close( x264_t *h );
+ case DLL_THREAD_ATTACH:
+ pthread_win32_thread_attach_np();
+ break;
+ case DLL_THREAD_DETACH:
+ pthread_win32_thread_detach_np();
+ break;
+
+ case DLL_PROCESS_DETACH:
+ pthread_win32_thread_detach_np();
+ pthread_win32_process_detach_np();
+ break;
+ }
#endif
+
+ return TRUE;
+}
--
x264 packaging
More information about the pkg-multimedia-commits
mailing list