[SCM] mpv/upstream: Imported Upstream version 0.18.1

jcowgill at users.alioth.debian.org jcowgill at users.alioth.debian.org
Wed Aug 10 14:26:53 UTC 2016


The following commit has been merged in the upstream branch:
commit 9fb50ae8ed250301c0205848101de9f2fdbb2471
Author: James Cowgill <james410 at cowgill.org.uk>
Date:   Fri Aug 5 22:35:54 2016 +0100

    Imported Upstream version 0.18.1

diff --git a/DOCS/client-api-changes.rst b/DOCS/client-api-changes.rst
index 8ef01a0..2027ed9 100644
--- a/DOCS/client-api-changes.rst
+++ b/DOCS/client-api-changes.rst
@@ -32,7 +32,11 @@ API changes
 
 ::
 
- --- mpv 0.17.1 ---
+ --- mpv 0.18.1 ---
+ ----   - remove "status" log level from mpv_request_log_messages() docs. This
+          is 100% equivalent to "v". The behavior is still the same, thus no
+          actual API change.
+ --- mpv 0.18.0 ---
  1.21   - mpv_set_property() changes behavior with MPV_FORMAT_NODE. Before this
           change it rejected mpv_nodes with format==MPV_FORMAT_STRING if the
           property was not a string or did not have special mechanisms in place
diff --git a/DOCS/interface-changes.rst b/DOCS/interface-changes.rst
index 7ffbbd6..d2e1ee4 100644
--- a/DOCS/interface-changes.rst
+++ b/DOCS/interface-changes.rst
@@ -19,7 +19,26 @@ Interface changes
 
 ::
 
- --- mpv 0.17.1 ---
+ --- mpv 0.18.1 ---
+    - deprecate --heartbeat-cmd
+    - remove --softvol=no capability:
+        - deprecate --softvol, it now does nothing
+        - --volume, --mute, and the corresponding properties now always control
+          softvol, and behave as expected without surprises (e.g. you can set
+          them normally while no audio is initialized)
+        - rename --softvol-max to --volume-max (deprecated alias is added)
+        - the --volume-restore-data option and property are removed without
+          replacement. They were _always_ internal, and used for watch-later
+          resume/restore. Now --volume/--mute are saved directly instead.
+        - the previous point means resuming files with older watch-later configs
+          will print an error about missing --volume-restore-data (which you can
+          ignore), and will not restore the previous value
+        - as a consequence, volume controls will no longer control PulseAudio
+          per-application value, or use the system mixer's per-application
+          volume processing
+        - system or per-application volume can still be controlled with the
+          ao-volume and ao-mute properties (there are no command line options)
+ --- mpv 0.18.0 ---
     - now ab-loops are active even if one of the "ab-loop-a"/"-b" properties is
       unset ("no"), in which case the start of the file is used if the A loop
       point is unset, and the end of the file for an unset B loop point
@@ -27,7 +46,7 @@ Interface changes
       (also needs --embeddedfonts=no)
     - add "hwdec-interop" and "hwdec-current" properties
     - deprecated "hwdec-active" and "hwdec-detected" properties (to be removed
-      in mpv 0.19.0)
+      in mpv 0.20.0)
     - choice option/property values that are "yes" or "no" will now be returned
       as booleans when using the mpv_node functions in the client API, the
       "native" property accessors in Lua, and the JSON API. They can be set as
diff --git a/DOCS/man/input.rst b/DOCS/man/input.rst
index d58f2dc..9c4df61 100644
--- a/DOCS/man/input.rst
+++ b/DOCS/man/input.rst
@@ -459,6 +459,8 @@ Input Commands that are Possibly Subject to Change
         Remove all filters. Note that like the other sub-commands, this does
         not control automatically inserted filters.
 
+    The argument is always needed. E.g. in case of ``clr`` use ``vf clr ""``.
+
     You can assign labels to filters by prefixing them with ``@name:`` (where
     ``name`` is a user-chosen arbitrary identifier). Labels can be used to
     refer to filters by name in all of the filter chain modification commands.
@@ -561,29 +563,20 @@ Input Commands that are Possibly Subject to Change
     the resolution is reduced to that of the video's. You can read the
     ``osd-width`` and ``osd-height`` properties. At least with ``--vo-xv`` and
     anamorphic video (such as DVD), ``osd-par`` should be read as well, and the
-    overlay should be aspect-compensated. (Future directions: maybe mpv should
-    take care of some of these things automatically, but it's hard to tell
-    where to draw the line.)
+    overlay should be aspect-compensated.
 
     ``id`` is an integer between 0 and 63 identifying the overlay element. The
     ID can be used to add multiple overlay parts, update a part by using this
     command with an already existing ID, or to remove a part with
     ``overlay-remove``. Using a previously unused ID will add a new overlay,
-    while reusing an ID will update it. (Future directions: there should be
-    something to ensure different programs wanting to create overlays don't
-    conflict with each others, should that ever be needed.)
+    while reusing an ID will update it.
 
     ``x`` and ``y`` specify the position where the OSD should be displayed.
 
     ``file`` specifies the file the raw image data is read from. It can be
     either a numeric UNIX file descriptor prefixed with ``@`` (e.g. ``@4``),
-    or a filename. The file will be mapped into memory with ``mmap()``. Some VOs
-    will pass the mapped pointer directly to display APIs (e.g. opengl or
-    vdpau), so no actual copying is involved. Truncating the source file while
-    the overlay is active will crash the player. You shouldn't change the data
-    while the overlay is active, because the data is essentially accessed at
-    random points. Instead, call ``overlay-add`` again (preferably with a
-    different memory region to prevent tearing).
+    or a filename. The file will be mapped into memory with ``mmap()``,
+    copied, and unmapped before the command returns (changed in mpv 0.18.1).
 
     It is also possible to pass a raw memory address for use as bitmap memory
     by passing a memory address as integer prefixed with an ``&`` character.
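The changed ``overlay-add`` semantics in the hunk above (the file is mapped, copied, and unmapped before the command returns) amount to a plain POSIX map/copy/unmap sequence. A minimal sketch; ``copy_from_fd`` is a hypothetical name for illustration, not an mpv function:

```c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

// Map `size` bytes of a file descriptor, copy them into a heap buffer,
// and unmap again before returning. After this returns, truncating or
// rewriting the file cannot affect the copy. `copy_from_fd` is a
// hypothetical helper, not an mpv function.
static void *copy_from_fd(int fd, size_t size)
{
    void *map = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED)
        return NULL;
    void *copy = malloc(size);
    if (copy)
        memcpy(copy, map, size);
    munmap(map, size);  // the mapping is gone; only the private copy survives
    return copy;
}
```

This is why, since 0.18.1, callers no longer need to keep the source file intact while the overlay is shown.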
@@ -616,15 +609,14 @@ Input Commands that are Possibly Subject to Change
     (Technically, the minimum size would be ``stride * (h - 1) + w * 4``, but
     for simplicity, the player will access all ``stride * h`` bytes.)
 
-    .. admonition:: Warning
+    .. note::
 
-        When updating the overlay, you should prepare a second shared memory
-        region (e.g. make use of the offset parameter) and add this as overlay,
-        instead of reusing the same memory every time. Otherwise, you might
-        get the equivalent of tearing, when your application and mpv write/read
-        the buffer at the same time. Also, keep in mind that mpv might access
-        an overlay's memory at random times whenever it feels the need to do
-        so, for example when redrawing the screen.
+        Before mpv 0.18.1, you had to do manual "double buffering" when updating
+        an overlay by replacing it with a different memory buffer. Since mpv
+        0.18.1, the memory is simply copied and doesn't reference any of the
+        memory indicated by the command's arguments after the command returns.
+        If you want to use this command before mpv 0.18.1, read the old docs
+        to see how to handle this correctly.
 
 ``overlay-remove <id>``
     Remove an overlay added with ``overlay-add`` and the same ID. Does nothing
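The stride arithmetic quoted in the hunk above (minimum size ``stride * (h - 1) + w * 4``, while the player accesses all ``stride * h`` bytes) is easy to get wrong; a tiny standalone sketch, with example dimensions chosen here purely for illustration:

```c
#include <assert.h>
#include <stddef.h>

// Bytes strictly required by an overlay buffer: h - 1 full rows of
// `stride` bytes, plus w pixels of 4 bytes (bgra) on the final row.
static size_t overlay_min_size(size_t w, size_t h, size_t stride)
{
    return stride * (h - 1) + w * 4;
}

// What the player actually touches, per the docs: all stride * h bytes,
// so buffers should be allocated to this larger size.
static size_t overlay_accessed_size(size_t h, size_t stride)
{
    return stride * h;
}
```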
@@ -1242,29 +1234,31 @@ Property list
     See ``--hr-seek``.
 
 ``mixer-active``
-    Return ``yes`` if the audio mixer is active, ``no`` otherwise. This has
-    implications for ``--softvol=no`` mode: if the mixer is inactive, changing
-    ``volume`` doesn't actually change anything on the system mixer. If the
-    ``--volume`` or ``--mute`` option are used, these might not be applied
-    properly until the mixer becomes active either. (The options, if set, will
-    just overwrite the mixer state at audio initialization.)
-
-    While the behavior with ``mixer-active==yes`` is relatively well-defined,
-    the ``no`` case will provide possibly wrong or insignificant values.
+    Return ``yes`` if the audio mixer is active, ``no`` otherwise.
 
-    Note that an active mixer does not necessarily imply active audio output,
-    although this is implied in the current implementation.
+    This property is relatively useless. Before mpv 0.18.1, it could be used to
+    infer behavior of the ``volume`` property.
 
 ``volume`` (RW)
-    Current volume (see ``--volume`` for details). Also see ``mixer-active``
-    property.
+    Current volume (see ``--volume`` for details).
 
-``volume-max``
-    Current maximum value the volume property can be set to. (This may depend
-    on the ``--softvol-max`` option.)
+``volume-max`` (RW)
+    Current maximum value the volume property can be set to. (Equivalent to the
+    ``--volume-max`` option.)
 
 ``mute`` (RW)
-    Current mute status (``yes``/``no``). Also see ``mixer-active`` property.
+    Current mute status (``yes``/``no``).
+
+``ao-volume`` (RW)
+    System volume. This property is available only if mpv audio output is
+    currently active, and only if the underlying implementation supports volume
+    control. What this property does depends on the API. For example, on ALSA
+    this usually changes system-wide audio, while with PulseAudio this controls
+    per-application volume.
+
+``ao-mute`` (RW)
+    Similar to ``ao-volume``, but controls the mute state. May be unimplemented
+    even if ``ao-volume`` works.
 
 ``audio-delay`` (RW)
     See ``--audio-delay``.
@@ -1388,7 +1382,7 @@ Property list
     properties to see whether this was successful.
 
     Unlike in mpv 0.9.x and before, this does not return the currently active
-    hardware decoder. Since mpv 0.17.1, ``hwdec-current`` is available for
+    hardware decoder. Since mpv 0.18.0, ``hwdec-current`` is available for
     this purpose.
 
 ``hwdec-current``
@@ -1412,13 +1406,13 @@ Property list
     platform and VO.
 
 ``hwdec-active``
-    Deprecated. To be removed in mpv 0.19.0. Use ``hwdec-current`` instead.
+    Deprecated. To be removed in mpv 0.20.0. Use ``hwdec-current`` instead.
 
     Return ``yes`` or ``no``, depending on whether any type of hardware decoding
     is actually in use.
 
 ``hwdec-detected``
-    Deprecated. To be removed in mpv 0.19.0.
+    Deprecated. To be removed in mpv 0.20.0.
 
     If hardware decoding is active, this returns the hardware decoder in use.
     Otherwise, it returns either ``no``, or if applicable, the currently loaded
@@ -2134,7 +2128,7 @@ Property list
 
     (Note that if an option is marked as file-local, even ``options/`` will
     access the local value, and the ``old`` value, which will be restored on
-    end of playback, can not be read or written until end of playback.)
+    end of playback, cannot be read or written until end of playback.)
 
 ``option-info/<name>``
     Additional per-option information.
diff --git a/DOCS/man/lua.rst b/DOCS/man/lua.rst
index b0d66c1..99ff6ff 100644
--- a/DOCS/man/lua.rst
+++ b/DOCS/man/lua.rst
@@ -175,7 +175,7 @@ The ``mp`` module is preloaded, although it can be loaded manually with
     Similar to ``mp.set_property``, but set the given property using its native
     type.
 
-    Since there are several data types which can not represented natively in
+    Since there are several data types which cannot be represented natively in
     Lua, this might not always work as expected. For example, while the Lua
     wrapper can do some guesswork to decide whether a Lua table is an array
     or a map, this would fail with empty tables. Also, there are not many
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index a42d879..f9c32e5 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -638,15 +638,15 @@ Video
         In some cases, RGB conversion is forced, which means the RGB conversion
         is performed by the hardware decoding API, instead of the OpenGL code
         used by ``--vo=opengl``. This means certain obscure colorspaces may
-        not display correctly, and not certain filtering (such as debanding)
-        can not be applied in an ideal way.
+        not display correctly, and certain filtering (such as debanding)
+        cannot be applied in an ideal way.
 
         ``vdpau`` is usually safe. If deinterlacing is enabled (or the ``vdpaupp``
         video filter is active in general), it forces RGB conversion. The latter
         currently does not treat certain colorspaces like BT.2020 correctly
-        (which is mostly a mpv-specific restriction). If the ``vdpauprb``
-        retrieves image data without RGB conversion, but does not work with
-        postprocessing.
+        (which is mostly an mpv-specific restriction). The ``vdpauprb`` video
+        filter retrieves image data without RGB conversion and is safe (but
+        precludes use of vdpau postprocessing).
 
         ``vaapi`` is safe if the ``vaapi-egl`` backend is indicated in the logs.
         If ``vaapi-glx`` is indicated, and the video colorspace is either BT.601
@@ -1072,14 +1072,10 @@ Audio
 
 ``--volume=<value>``
     Set the startup volume. 0 means silence, 100 means no volume reduction or
-    amplification. A value of -1 (the default) will not change the volume. See
-    also ``--softvol``.
+    amplification. Negative values can be passed for compatibility, but are
+    treated as 0.
 
-    .. note::
-
-        This was changed after the mpv 0.9 release. Before that, 100 actually
-        meant maximum volume. At the same time, the volume scale was made cubic,
-        so the old values won't match up with the new ones anyway.
+    Since mpv 0.18.1, this always controls the internal mixer (aka "softvol").
 
 ``--audio-delay=<sec>``
     Audio delay in seconds (positive or negative float value). Positive values
@@ -1094,20 +1090,17 @@ Audio
 
     See also: ``--volume``.
 
-``--softvol=<mode>``
-    Control whether to use the volume controls of the audio output driver or
-    the internal mpv volume filter.
+``--softvol=<no|yes|auto>``
+    Deprecated/non-functional. Before mpv 0.18.1, this used to control whether
+    to use the volume controls of the audio output driver or the internal mpv
+    volume filter.
 
-    :no:    prefer audio driver controls, use the volume filter only if
-            absolutely needed
-    :yes:   always use the volume filter
-    :auto:  prefer the volume filter if the audio driver uses the system mixer
-            (default)
+    The current behavior is as if this option were set to ``yes``. The other
+    behaviors are not available anymore, although ``auto`` almost matches
+    current behavior in most cases.
 
-    The intention of ``auto`` is to avoid changing system mixer settings from
-    within mpv with default settings. mpv is a video player, not a mixer panel.
-    On the other hand, mixer controls are enabled for sound servers like
-    PulseAudio, which provide per-application volume.
+    The ``no`` behavior is still partially available through the ``ao-volume``
+    and ``ao-mute`` properties, but there are no options to reset these.
 
 ``--audio-demuxer=<[+]name>``
     Use this audio demuxer type when using ``--audio-file``. Use a '+' before
@@ -1264,10 +1257,12 @@ Audio
     their start timestamps differ, and then video timing is gradually adjusted
     if necessary to reach correct synchronization later.
 
-``--softvol-max=<100.0-1000.0>``
+``--volume-max=<100.0-1000.0>``, ``--softvol-max=<...>``
     Set the maximum amplification level in percent (default: 130). A value of
     130 will allow you to adjust the volume up to about double the normal level.
 
+    ``--softvol-max`` is a deprecated alias and should not be used.
+
 ``--audio-file-auto=<no|exact|fuzzy|all>``, ``--no-audio-file-auto``
     Load additional audio files matching the video filename. The parameter
     specifies how external audio files are matched. ``exact`` is enabled by
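Returning to ``--volume-max`` above: the claim that 130 allows "about double the normal level" follows from a cubic volume scale, which older revisions of this manual mention. A sketch of that arithmetic; the exact mapping mpv applies internally is an assumption here, not shown in this diff:

```c
#include <assert.h>

// Map a volume percentage to a linear gain factor, assuming the cubic
// scale that older revisions of this manual mention (100 -> 1.0).
// The exact mapping mpv applies internally is not shown in this diff.
static double softvol_gain(double percent)
{
    double v = percent / 100.0;
    return v * v * v;  // 130% -> 1.3^3 ~= 2.2, i.e. "about double"
}
```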
@@ -1276,7 +1271,7 @@ Audio
     :no:    Don't automatically load external audio files.
     :exact: Load the media filename with audio file extension (default).
     :fuzzy: Load all audio files containing media filename.
-    :all:   Load all aufio files in the current and ``--audio-file-paths``
+    :all:   Load all audio files in the current and ``--audio-file-paths``
             directories.
 
 ``--audio-file-paths=<path1:path2:...>``
@@ -1320,7 +1315,7 @@ Subtitles
 .. note::
 
     Changing styling and position does not work with all subtitles. Image-based
-    subtitles (DVD, Bluray/PGS, DVB) can not changed for fundamental reasons.
+    subtitles (DVD, Bluray/PGS, DVB) cannot be changed for fundamental reasons.
     Subtitles in ASS format are normally not changed intentionally, but
     overriding them can be controlled with ``--ass-style-override``.
 
@@ -2019,6 +2014,14 @@ Window
     be the default behavior. Currently only affects X11 VOs.
 
 ``--heartbeat-cmd=<command>``
+
+    .. warning::
+
+        This option is redundant with Lua scripting. Further, it shouldn't be
+        needed for disabling the screensaver anyway, since mpv will call
+        ``xdg-screensaver`` when using the X11 backend. As a consequence, this
+        option has been deprecated with no direct replacement.
+
     Command that is executed every 30 seconds during playback via *system()* -
     i.e. using the shell. The time between the commands can be customized with
     the ``--heartbeat-interval`` option. The command is not run while playback
@@ -2438,7 +2441,7 @@ Demuxer
     stop reading additional packets as soon as one of the limits is reached.
     (The limits still can be slightly overstepped due to technical reasons.)
 
-    Set these limits highher if you get a packet queue overflow warning, and
+    Set these limits higher if you get a packet queue overflow warning, and
     you think normal playback would be possible with a larger packet queue.
 
     See ``--list-options`` for defaults and value range.
@@ -2463,7 +2466,7 @@ Demuxer
 
 ``--force-seekable=<yes|no>``
     If the player thinks that the media is not seekable (e.g. playing from a
-    pipe, or it's a http stream with a server that doesn't support range
+    pipe, or it's an http stream with a server that doesn't support range
     requests), seeking will be disabled. This option can forcibly enable it.
     For seeks within the cache, there's a good chance of success.
 
@@ -3290,7 +3293,7 @@ Cache
     With ``auto``, the cache will usually be enabled for network streams,
     using the size set by ``--cache-default``. With ``yes``, the cache will
     always be enabled with the size set by ``--cache-default`` (unless the
-    stream can not be cached, or ``--cache-default`` disables caching).
+    stream cannot be cached, or ``--cache-default`` disables caching).
 
     May be useful when playing files from slow media, but can also have
     negative effects, especially with file formats that require a lot of
@@ -3685,7 +3688,7 @@ Miscellaneous
     video or audio outputs are not possible, but you can use filters to merge
     them into one.
 
-    The complex filter can not be changed yet during playback. It's also not
+    The complex filter cannot be changed yet during playback. It's also not
     possible to change the tracks connected to the filter at runtime. Other
     tracks, as long as they're not connected to the filter, and the
     corresponding output is not connected to the filter, can still be freely
diff --git a/DOCS/man/vf.rst b/DOCS/man/vf.rst
index b4e4438..6a5c44f 100644
--- a/DOCS/man/vf.rst
+++ b/DOCS/man/vf.rst
@@ -214,7 +214,7 @@ Available filters are:
         Format name, e.g. rgb15, bgr24, 420p, etc. (default: don't change).
     ``<outfmt>``
         Format name that should be substituted for the output. If they do not
-        have the same bytes per pixel and chroma subsamplimg, it will fail.
+        have the same bytes per pixel and chroma subsampling, it will fail.
     ``<colormatrix>``
         Controls the YUV to RGB color space conversion when playing video. There
         are various standards. Normally, BT.601 should be used for SD video, and
@@ -288,6 +288,8 @@ Available filters are:
         :adobe:        Adobe RGB (1998)
         :prophoto:     ProPhoto RGB (ROMM)
         :cie1931:      CIE 1931 RGB
+        :dci-p3:       DCI-P3 (Digital Cinema)
+        :v-gamut:      Panasonic V-Gamut primaries
 
     ``<gamma>``
        Gamma function the source file was encoded with. Normally this should be set
@@ -311,6 +313,8 @@ Available filters are:
        :gamma2.8:     Pure power curve (gamma 2.8)
        :prophoto:     ProPhoto RGB (ROMM) curve
        :st2084:       SMPTE ST2084 (HDR) curve
+       :std-b67:      ARIB STD-B67 (Hybrid Log-gamma) curve
+       :v-log:        Panasonic V-Log transfer curve
 
     ``<peak>``
         Reference peak illumination for the video file. This is mostly
diff --git a/DOCS/man/vo.rst b/DOCS/man/vo.rst
index bdc317f..b80244c 100644
--- a/DOCS/man/vo.rst
+++ b/DOCS/man/vo.rst
@@ -465,7 +465,7 @@ Available video output drivers are:
         8
             Dither to 8 bit output.
 
-        Note that the depth of the connected video display device can not be
+        Note that the depth of the connected video display device cannot be
         detected. Often, LCD panels will do dithering on their own, which
         conflicts with ``opengl``'s dithering and leads to ugly output.
 
@@ -629,6 +629,9 @@ Available video output drivers are:
             never resets (regardless of seeks).
         vec2 image_size
             The size in pixels of the input image.
+        vec2 target_size
+            The size in pixels of the visible part of the scaled (and possibly
+            cropped) image.
 
         For example, a shader that inverts the colors could look like this::
 
@@ -979,6 +982,8 @@ Available video output drivers are:
             CIE 1931 RGB (not to be confused with CIE XYZ)
         dci-p3
             DCI-P3 (Digital Cinema Colorspace), SMPTE RP431-2
+        v-gamut
+            Panasonic V-Gamut (VARICAM) primaries
 
     ``target-trc=<value>``
         Specifies the transfer characteristics (gamma) of the display. Video
@@ -1003,6 +1008,16 @@ Available video output drivers are:
             ProPhoto RGB (ROMM)
         st2084
             SMPTE ST2084 (HDR) curve, PQ OETF
+        std-b67
+            ARIB STD-B67 (Hybrid Log-gamma) curve, also known as BBC/NHK HDR
+        v-log
+            Panasonic V-Log (VARICAM) curve
+
+        NOTE: When using HDR output formats, mpv will encode to the specified
+              curve but it will not set any HDMI flags or other signalling that
+              might be required for the target device to correctly display the
+              HDR signal. The user should independently guarantee this before
+              using these signal formats for display.
 
     ``target-brightness=<1..100000>``
         Specifies the display's approximate brightness in cd/m^2. When playing
diff --git a/DOCS/tech-overview.txt b/DOCS/tech-overview.txt
index 914b222..e53f00b 100644
--- a/DOCS/tech-overview.txt
+++ b/DOCS/tech-overview.txt
@@ -123,7 +123,7 @@ options/options.h, options/options.c
     parser-mpcmd.c, and uses the option table in options.c.
 
 input/input.c:
-    This translates keyboard input comming from VOs and other sources (such
+    This translates keyboard input coming from VOs and other sources (such
     as remote control devices like Apple IR or client API commands) to the
     key bindings listed in the user's (or the builtin) input.conf and turns
     them into items of type struct mp_cmd. These commands are queued, and read
diff --git a/README.md b/README.md
index 4c84266..f38527f 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,20 @@ most likely the best maintenance out of all stable releases. Older releases
 are for distros, and at best receive basic changes like fixing critical security
 issues or build fixes, and at worst are completely abandoned.
 
+## FFmpeg ABI compatibility
+
+mpv does not support linking against FFmpeg versions it was not built with, even
+if the linked version is supposedly ABI-compatible with the version it was
+compiled against. Expect malfunctions, crashes, and security issues if you
+do it anyway.
+
+The reason for not supporting this is that it creates far too much complexity
+with little to no benefit, coupled with absurd and unusable FFmpeg API
+artifacts.
+
+Newer mpv versions will refuse to start if the runtime and compile-time FFmpeg
+library versions mismatch.
+
 ## Release cycle
 
 Every other month, an arbitrary git snapshot is made, and is assigned
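The runtime check described in the new README section boils down to comparing packed version integers. A standalone sketch that replicates the shape of FFmpeg's ``AV_VERSION_INT`` packing (no FFmpeg headers are used, and mpv's actual check differs in detail):

```c
#include <assert.h>

// Replicates the shape of FFmpeg's AV_VERSION_INT(a, b, c) packing:
// 8 bits each for minor and micro, the remaining high bits for major.
static unsigned version_int(unsigned major, unsigned minor, unsigned micro)
{
    return (major << 16) | (minor << 8) | micro;
}

// The kind of check described above: refuse to run when the library
// loaded at runtime is not the exact version the binary was built with.
static int versions_match(unsigned compiled, unsigned runtime)
{
    return compiled == runtime;
}
```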
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index fa553c7..6973ace 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,83 @@
+Release 0.18.1
+==============
+
+Note: Running mpv with different versions of the FFmpeg/Libav libraries than
+it was compiled with is no longer supported. Even supposedly ABI-compatible
+versions have been a source of trouble, and it creates far too much
+complexity with little to no benefit, coupled with absurd and unusable FFmpeg
+API artifacts.
+
+Instead, mpv will exit with an error when such a situation is detected.
+This simply means that mpv needs to be rebuilt whenever FFmpeg libraries change.
+
+
+Features
+--------
+
+New
+~~~
+
+- d3d: implement screenshots for --hwdec=d3d11va
+- vo_opengl: add output_size uniform to custom shader
+- vo_opengl: implement the Panasonic V-Log function (#3157)
+- vo_opengl: implement ARIB STD-B67 (HLG) HDR TRC
+
+
+Options and Commands
+--------------------
+
+Changed
+~~~~~~~
+- command: pack sub image data in overlay-add command
+
+
+Deprecated
+~~~~~~~~~~
+
+- options: deprecate --heartbeat-cmd
+- audio: deprecate --softvol
+
+
+Removed
+~~~~~~~
+
+- audio: drop --softvol=no and --softvol=auto (#3322)
+
+
+Fixes and Minor Enhancements
+----------------------------
+
+- video: fix deinterlace filter handling on pixel format changes
+- x11: silence xdg-screensaver
+- vo_opengl: angle: update the swapchain on resize (#3301)
+- vo_opengl: error out gracefully when trying to use FBOs without FBO API
+- vd_lavc: expose mastering display side data reference peak (improves results with HDR content)
+- vo_opengl: generalize HDR tone mapping mechanism (#3293)
+- vo_opengl: don't constantly resize the output FBO
+- vo_opengl: use ringbuffer of PBOs
+- Windows: make WM_NCHITTEST simpler and more accurate
+- ao_oss: do not add an entry to audio-device-list if device file missing
+- dec_audio: fix segment boundary switching
+- ao_lavc, vo_lavc: Migrate to new FFmpeg encoding API
+- vo_opengl: explicitly use main framebuffer when reading window content (#3284)
+- vo_xv: fix behavior with odd sizes
+- audio: insert auto-inserted filters at end of chain
+- x11: add missing FocusChangeMask (disables key repeat when losing focus while a key is down)
+- ao_coreaudio: error out when selecting invalid device
+- ad_lavc: work around misbehavior of some FFmpeg decoders like wmapro (#3297)
+- player: cut off status line on terminal width
+
+
+This listing is not complete. Check DOCS/client-api-changes.rst for a history
+of changes to the client API, and DOCS/interface-changes.rst for a history
+of changes to other user-visible interfaces.
+
+A complete changelog can be seen by running `git log v0.18.0..v0.18.1`
+in the git repository or by visiting either
+https://github.com/mpv-player/mpv/compare/v0.18.0...v0.18.1 or
+http://git.srsfckn.biz/mpv/log/?qt=range&q=v0.18.0..v0.18.1
+
+
 Release 0.18.0
 ==============
 
@@ -39,7 +119,7 @@ New
 
 
 Removed
--------
+~~~~~~~
 
 - vo_opengl: remove nnedi3 prescaler (replaced by user shaders)
 - vo_opengl: remove prescaling framework with superxbr prescaler (replaced by user shaders)
diff --git a/VERSION b/VERSION
index 6633391..249afd5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.18.0
+0.18.1
diff --git a/audio/audio.c b/audio/audio.c
index ae85a4b..306401b 100644
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -257,13 +257,21 @@ void mp_audio_skip_samples(struct mp_audio *data, int samples)
         data->pts += samples / (double)data->rate;
 }
 
+// Return the timestamp of the sample just after the end of this frame.
+double mp_audio_end_pts(struct mp_audio *f)
+{
+    if (f->pts == MP_NOPTS_VALUE || f->rate < 1)
+        return MP_NOPTS_VALUE;
+    return f->pts + f->samples / (double)f->rate;
+}
+
 // Clip the given frame to the given timestamp range. Adjusts the frame size
 // and timestamp.
 void mp_audio_clip_timestamps(struct mp_audio *f, double start, double end)
 {
-    if (f->pts == MP_NOPTS_VALUE || f->rate < 1)
+    double f_end = mp_audio_end_pts(f);
+    if (f_end == MP_NOPTS_VALUE)
         return;
-    double f_end = f->pts + f->samples / (double)f->rate;
     if (end != MP_NOPTS_VALUE) {
         if (f_end >= end) {
             if (f->pts >= end) {
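The new ``mp_audio_end_pts()`` helper in the hunk above is pure arithmetic: the timestamp one sample past the frame is the start PTS plus the sample count over the rate. A standalone sketch of that calculation (``NOPTS`` below is a stand-in sentinel; mpv's real ``MP_NOPTS_VALUE`` constant differs):

```c
#include <assert.h>

// Stand-in sentinel for "no PTS"; mpv's real MP_NOPTS_VALUE constant
// differs, this value is only for illustration.
#define NOPTS (-1e300)

// Timestamp of the sample just after the end of a frame: the start PTS
// plus the frame duration, samples / rate. Mirrors mp_audio_end_pts().
static double frame_end_pts(double pts, int samples, int rate)
{
    if (pts == NOPTS || rate < 1)
        return NOPTS;
    return pts + samples / (double)rate;
}
```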
diff --git a/audio/audio.h b/audio/audio.h
index c469f7a..e126e93 100644
--- a/audio/audio.h
+++ b/audio/audio.h
@@ -73,6 +73,7 @@ void mp_audio_copy(struct mp_audio *dst, int dst_offset,
 void mp_audio_copy_attributes(struct mp_audio *dst, struct mp_audio *src);
 void mp_audio_skip_samples(struct mp_audio *data, int samples);
 void mp_audio_clip_timestamps(struct mp_audio *f, double start, double end);
+double mp_audio_end_pts(struct mp_audio *data);
 
 bool mp_audio_is_writeable(struct mp_audio *data);
 int mp_audio_make_writeable(struct mp_audio *data);
diff --git a/audio/decode/ad_lavc.c b/audio/decode/ad_lavc.c
index f48993f..0316f6b 100644
--- a/audio/decode/ad_lavc.c
+++ b/audio/decode/ad_lavc.c
@@ -186,6 +186,12 @@ static int decode_packet(struct dec_audio *da, struct demux_packet *mpkt,
     struct priv *priv = da->priv;
     AVCodecContext *avctx = priv->avctx;
 
+    // If the decoder discards the timestamp for some reason, we use the
+    // interpolated PTS. Initialize it so that it works for the initial
+    // packet as well.
+    if (mpkt && priv->next_pts == MP_NOPTS_VALUE)
+        priv->next_pts = mpkt->pts;
+
     int in_len = mpkt ? mpkt->len : 0;
 
     AVPacket pkt;
diff --git a/audio/decode/dec_audio.c b/audio/decode/dec_audio.c
index e60ebe3..d455770 100644
--- a/audio/decode/dec_audio.c
+++ b/audio/decode/dec_audio.c
@@ -200,7 +200,9 @@ void audio_work(struct dec_audio *da)
     if (da->current_frame)
         return;
 
-    if (!da->packet && demux_read_packet_async(da->header, &da->packet) == 0) {
+    if (!da->packet && !da->new_segment &&
+        demux_read_packet_async(da->header, &da->packet) == 0)
+    {
         da->current_state = DATA_WAIT;
         return;
     }
@@ -211,6 +213,7 @@ void audio_work(struct dec_audio *da)
         da->packet = NULL;
     }
 
+    bool had_input_packet = !!da->packet;
     bool had_packet = da->packet || da->new_segment;
 
     int ret = da->ad_driver->decode_packet(da, da->packet, &da->current_frame);
@@ -233,12 +236,12 @@ void audio_work(struct dec_audio *da)
 
     fix_audio_pts(da);
 
-    bool segment_end = true;
+    bool segment_end = !da->current_frame && !had_input_packet;
 
     if (da->current_frame) {
         mp_audio_clip_timestamps(da->current_frame, da->start, da->end);
         if (da->current_frame->pts != MP_NOPTS_VALUE && da->start != MP_NOPTS_VALUE)
-            segment_end = da->current_frame->pts >= da->start;
+            segment_end = da->current_frame->pts >= da->end;
         if (da->current_frame->samples == 0) {
             talloc_free(da->current_frame);
             da->current_frame = NULL;
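The fix in the hunk above changes the segment-end test to compare against the segment's end time rather than its start. The decision logic, reduced to a standalone sketch (field names are simplified from the mpv originals, and ``NOPTS`` is a stand-in sentinel):

```c
#include <assert.h>
#include <stdbool.h>

#define NOPTS (-1e300)  // stand-in for mpv's MP_NOPTS_VALUE sentinel

// A segment is over either when the decoder is drained (no frame came
// out and no input is left), or when a decoded frame's PTS has reached
// the segment's end time. Field names are simplified from the mpv code.
static bool segment_ended(bool have_frame, bool had_input_packet,
                          double frame_pts, double seg_start, double seg_end)
{
    bool end = !have_frame && !had_input_packet;
    if (have_frame && frame_pts != NOPTS && seg_start != NOPTS)
        end = frame_pts >= seg_end;
    return end;
}
```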
diff --git a/audio/filter/af.c b/audio/filter/af.c
index ac1b492..21b0982 100644
--- a/audio/filter/af.c
+++ b/audio/filter/af.c
@@ -210,29 +210,6 @@ static struct af_instance *af_prepend(struct af_stream *s,
     return new;
 }
 
-/* Create and insert a new filter of type name after the filter in the
-   argument. This function can be called during runtime, the return
-   value is the new filter */
-static struct af_instance *af_append(struct af_stream *s,
-                                     struct af_instance *af,
-                                     char *name, char **args)
-{
-    if (!af)
-        af = s->first;
-    if (af == s->last)
-        af = s->last->prev;
-    // Create the new filter and make sure it is OK
-    struct af_instance *new = af_create(s, name, args);
-    if (!new)
-        return NULL;
-    // Update pointers
-    new->prev = af;
-    new->next = af->next;
-    af->next = new;
-    new->next->prev = new;
-    return new;
-}
-
 // Uninit and remove the filter "af"
 static void af_remove(struct af_stream *s, struct af_instance *af)
 {
@@ -275,6 +252,8 @@ static void af_print_filter_chain(struct af_stream *s, struct af_instance *at,
     while (af) {
         char b[128] = {0};
         mp_snprintf_cat(b, sizeof(b), "  [%s] ", af->info->name);
+        if (af->label)
+            mp_snprintf_cat(b, sizeof(b), "\"%s\" ", af->label);
         if (af->data)
             mp_snprintf_cat(b, sizeof(b), "%s", mp_audio_config_to_str(af->data));
         if (af == at)
@@ -287,11 +266,6 @@ static void af_print_filter_chain(struct af_stream *s, struct af_instance *at,
     MP_MSG(s, msg_level, "  [ao] %s\n", mp_audio_config_to_str(&s->output));
 }
 
-static bool af_is_conversion_filter(struct af_instance *af)
-{
-    return af && strcmp(af->info->name, "lavrresample") == 0;
-}
-
 // in is what af can take as input - insert a conversion filter if the actual
 // input format doesn't match what af expects.
 // Returns:
@@ -557,7 +531,7 @@ void af_destroy(struct af_stream *s)
    format of the preferred output respectively. The function is
    reentrant i.e. if called with an already initialized stream the
    stream will be reinitialized.
-   If one of the prefered output parameters is 0 the one that needs
+   If one of the preferred output parameters is 0 the one that needs
    no conversion is used (i.e. the output format in the last filter).
    The return value is 0 if success and -1 if failure */
 int af_init(struct af_stream *s)
@@ -602,12 +576,7 @@ struct af_instance *af_add(struct af_stream *s, char *name, char *label,
     if (af_find_by_label(s, label))
         return NULL;
 
-    struct af_instance *new;
-    // Insert the filter somewhere nice
-    if (af_is_conversion_filter(s->first->next))
-        new = af_append(s, s->first->next, name, args);
-    else
-        new = af_prepend(s, s->first->next, name, args);
+    struct af_instance *new = af_prepend(s, s->last, name, args);
     if (!new)
         return NULL;
     new->label = talloc_strdup(new, label);
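With `af_append()` and the lavrresample special case gone, `af_add()` always inserts via `af_prepend(s, s->last, ...)`, i.e. the new filter lands immediately before the output node of the chain. A sketch of that doubly linked insert-before operation, with illustrative node fields (not mpv's actual `af_instance` layout):

```c
#include <stddef.h>

struct node { struct node *prev, *next; int id; };

// Sketch: place n immediately before pos in a doubly linked list,
// mirroring the pointer updates af_prepend() performs.
static void insert_before(struct node *pos, struct node *n)
{
    n->next = pos;
    n->prev = pos->prev;
    pos->prev = n;
    if (n->prev)
        n->prev->next = n;
}
```

Passing the chain's last node as `pos` makes every user-added filter the final processing step before output, which is what the simplified `af_add()` relies on.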
diff --git a/audio/mixer.c b/audio/mixer.c
index 01bb4d5..a58a814 100644
--- a/audio/mixer.c
+++ b/audio/mixer.c
@@ -35,18 +35,6 @@ struct mixer {
     struct MPOpts *opts;
     struct ao *ao;
     struct af_stream *af;
-    // Static, dependent on ao/softvol settings
-    bool softvol;                       // use AO (false) or af_volume (true)
-    bool persistent_volume;             // volume does not need to be restored
-    bool emulate_mute;                  // if true, emulate mute with volume=0
-    // Last known values (possibly out of sync with reality)
-    float vol_l, vol_r;
-    bool muted;
-    // Used to decide whether we should unmute on uninit
-    bool muted_by_us;
-    /* Contains ao driver name or "softvol" if volume is not persistent
-     * and needs to be restored after the driver is reinitialized. */
-    const char *driver;
     // Other stuff
     float balance;
 };
@@ -57,82 +45,25 @@ struct mixer *mixer_init(void *talloc_ctx, struct mpv_global *global)
     *mixer = (struct mixer) {
         .log = mp_log_new(mixer, global->log, "mixer"),
         .opts = global->opts,
-        .vol_l = 100,
-        .vol_r = 100,
-        .driver = "",
     };
     return mixer;
 }
 
 bool mixer_audio_initialized(struct mixer *mixer)
 {
-    return !!mixer->ao;
+    return !!mixer->af;
 }
 
-float mixer_getmaxvolume(struct mixer *mixer)
+// Called when opts->softvol_volume or opts->softvol_mute were changed.
+void mixer_update_volume(struct mixer *mixer)
 {
-    // gain == 1
-    return mixer->softvol ? mixer->opts->softvol_max : 100;
-}
-
-static void checkvolume(struct mixer *mixer)
-{
-    if (!mixer->ao)
+    if (!mixer->af)
         return;
 
-    ao_control_vol_t vol = {mixer->vol_l, mixer->vol_r};
-    if (mixer->softvol) {
-        float gain;
-        if (!af_control_any_rev(mixer->af, AF_CONTROL_GET_VOLUME, &gain))
-            gain = 1.0;
-        vol.left = gain * 100.0;
-        vol.right = gain * 100.0;
-    } else {
-        MP_DBG(mixer, "Reading volume from AO.\n");
-        // Rely on the values not changing if the query is not supported
-        ao_control(mixer->ao, AOCONTROL_GET_VOLUME, &vol);
-        ao_control(mixer->ao, AOCONTROL_GET_MUTE, &mixer->muted);
-    }
-    float l = mixer->vol_l;
-    float r = mixer->vol_r;
-    if (mixer->emulate_mute && mixer->muted)
-        l = r = 0;
-    /* Try to detect cases where the volume has been changed by some external
-     * action (such as something else changing a shared system-wide volume).
-     * We don't test for exact equality, as some AOs may round the value
-     * we last set to some nearby supported value. 3 has been the default
-     * volume step for increase/decrease keys, and is apparently big enough
-     * to step to the next possible value in most setups.
-     */
-    if (FFABS(vol.left - l) >= 3 || FFABS(vol.right - r) >= 3) {
-        mixer->vol_l = vol.left;
-        mixer->vol_r = vol.right;
-        if (mixer->emulate_mute)
-            mixer->muted = false;
-    }
-    mixer->muted_by_us &= mixer->muted;
-}
+    float gain = MPMAX(mixer->opts->softvol_volume / 100.0, 0);
+    if (mixer->opts->softvol_mute == 1)
+        gain = 0.0;
 
-void mixer_getvolume(struct mixer *mixer, float *l, float *r)
-{
-    checkvolume(mixer);
-    *l = mixer->vol_l;
-    *r = mixer->vol_r;
-}
-
-static void setvolume_internal(struct mixer *mixer)
-{
-    float l = mixer->vol_l, r = mixer->vol_r;
-    if (mixer->emulate_mute && mixer->muted)
-        l = r = 0;
-    if (!mixer->softvol) {
-        MP_DBG(mixer, "Setting volume on AO.\n");
-        struct ao_control_vol vol = {.left = l, .right = r};
-        if (ao_control(mixer->ao, AOCONTROL_SET_VOLUME, &vol) != CONTROL_OK)
-            MP_ERR(mixer, "Failed to change audio output volume.\n");
-        return;
-    }
-    float gain = (l + r) / 2.0 / 100.0;
     if (!af_control_any_rev(mixer->af, AF_CONTROL_SET_VOLUME, &gain)) {
         if (gain == 1.0)
             return;
@@ -143,57 +74,6 @@ static void setvolume_internal(struct mixer *mixer)
     }
 }
 
-void mixer_setvolume(struct mixer *mixer, float l, float r)
-{
-    checkvolume(mixer);  // to check mute status
-
-    float max = mixer_getmaxvolume(mixer);
-    mixer->vol_l = MPCLAMP(l, 0, max);
-    mixer->vol_r = MPCLAMP(r, 0, max);
-    if (mixer->ao)
-        setvolume_internal(mixer);
-}
-
-void mixer_getbothvolume(struct mixer *mixer, float *b)
-{
-    float mixer_l, mixer_r;
-    mixer_getvolume(mixer, &mixer_l, &mixer_r);
-    *b = (mixer_l + mixer_r) / 2;
-}
-
-void mixer_setmute(struct mixer *mixer, bool mute)
-{
-    checkvolume(mixer);
-    if (mute == mixer->muted)
-        return;
-    if (mixer->ao) {
-        mixer->muted = mute;
-        mixer->muted_by_us = mute;
-        if (mixer->emulate_mute) {
-            setvolume_internal(mixer);
-        } else {
-            ao_control(mixer->ao, AOCONTROL_SET_MUTE, &mute);
-        }
-        checkvolume(mixer);
-    } else {
-        mixer->muted = mute;
-        mixer->muted_by_us = mute;
-    }
-}
-
-bool mixer_getmute(struct mixer *mixer)
-{
-    checkvolume(mixer);
-    return mixer->muted;
-}
-
-void mixer_addvolume(struct mixer *mixer, float step)
-{
-    float vol_l, vol_r;
-    mixer_getvolume(mixer, &vol_l, &vol_r);
-    mixer_setvolume(mixer, vol_l + step, vol_r + step);
-}
-
 void mixer_getbalance(struct mixer *mixer, float *val)
 {
     if (mixer->af)
@@ -242,130 +122,18 @@ void mixer_setbalance(struct mixer *mixer, float val)
     af_pan_balance->control(af_pan_balance, AF_CONTROL_SET_PAN_BALANCE, &val);
 }
 
-char *mixer_get_volume_restore_data(struct mixer *mixer)
-{
-    if (!mixer->driver[0])
-        return NULL;
-    return talloc_asprintf(NULL, "%s:%f:%f:%d", mixer->driver, mixer->vol_l,
-                           mixer->vol_r, mixer->muted_by_us);
-}
-
-static void probe_softvol(struct mixer *mixer)
-{
-    bool ao_perapp = ao_control(mixer->ao, AOCONTROL_HAS_PER_APP_VOLUME, 0) == 1;
-    bool ao_softvol = ao_control(mixer->ao, AOCONTROL_HAS_SOFT_VOLUME, 0) == 1;
-    assert(!(ao_perapp && ao_softvol));
-    mixer->persistent_volume = !ao_softvol;
-
-    if (mixer->opts->softvol == SOFTVOL_AUTO) {
-        // No system-wide volume => fine with AO volume control.
-        mixer->softvol = !ao_softvol && !ao_perapp;
-    } else {
-        mixer->softvol = mixer->opts->softvol == SOFTVOL_YES;
-    }
-
-    if (mixer->softvol)
-        mixer->persistent_volume = false;
-
-    MP_DBG(mixer, "Will use af_volume: %s\n", mixer->softvol ? "yes" : "no");
-
-    // If we can't use real volume control => force softvol.
-    if (!mixer->softvol) {
-        ao_control_vol_t vol;
-        if (ao_control(mixer->ao, AOCONTROL_GET_VOLUME, &vol) != CONTROL_OK) {
-            mixer->softvol = true;
-            MP_WARN(mixer, "Hardware volume control unavailable.\n");
-        }
-    }
-
-    // Probe native mute support.
-    mixer->emulate_mute = true;
-    if (!mixer->softvol) {
-        if (ao_control(mixer->ao, AOCONTROL_GET_MUTE, &(bool){0}) == CONTROL_OK)
-            mixer->emulate_mute = false;
-    }
-}
-
-static void restore_volume(struct mixer *mixer)
-{
-    struct MPOpts *opts = mixer->opts;
-    struct ao *ao = mixer->ao;
-
-    float force_vol_l = -1, force_vol_r = -1;
-    int force_mute = -1;
-
-    const char *prev_driver = mixer->driver;
-    mixer->driver = mixer->softvol ? "softvol" : ao_get_name(ao);
-    if (!prev_driver[0])
-        prev_driver = mixer->driver;
-
-    // Restore old parameters if volume won't survive reinitialization.
-    // But not if volume scale is possibly different.
-    if (!mixer->persistent_volume && strcmp(mixer->driver, prev_driver) == 0) {
-        force_vol_l = mixer->vol_l;
-        force_vol_r = mixer->vol_r;
-    }
-
-    // Set mute if we disabled it on uninit last time.
-    if (mixer->muted_by_us)
-        force_mute = 1;
-
-    // Set parameters from command line.
-    if (opts->mixer_init_volume >= 0)
-        force_vol_l = force_vol_r = opts->mixer_init_volume;
-    if (opts->mixer_init_mute >= 0)
-        force_mute = opts->mixer_init_mute;
-
-    // Set parameters from playback resume.
-    char *data = mixer->opts->mixer_restore_volume_data;
-    if (!mixer->persistent_volume && data && data[0]) {
-        char drv[40];
-        float v_l, v_r;
-        int m;
-        if (sscanf(data, "%39[^:]:%f:%f:%d", drv, &v_l, &v_r, &m) == 4) {
-            if (strcmp(mixer->driver, drv) == 0) {
-                force_vol_l = v_l;
-                force_vol_r = v_r;
-                force_mute = !!m;
-                MP_DBG(mixer, "Restoring volume from resume config.\n");
-            }
-        }
-        talloc_free(mixer->opts->mixer_restore_volume_data);
-        mixer->opts->mixer_restore_volume_data = NULL;
-    }
-
-    // Using --volume should not reset the volume on every file (i.e. reinit),
-    // OTOH mpv --{ --volume 10 f1.mkv --} --{ --volume 20 f2.mkv --} must work.
-    // Resetting the option volumes to "auto" (-1) is easiest. If file local
-    // options (as shown above) are used, the option handler code will reset
-    // them to other values, and force the volume to be reset as well.
-    opts->mixer_init_volume = -1;
-    opts->mixer_init_mute = -1;
-
-    checkvolume(mixer);
-    if (force_vol_l >= 0 && force_vol_r >= 0) {
-        MP_DBG(mixer, "Restoring previous volume.\n");
-        mixer_setvolume(mixer, force_vol_l, force_vol_r);
-    }
-    if (force_mute >= 0) {
-        MP_DBG(mixer, "Restoring previous mute toggle.\n");
-        mixer_setmute(mixer, force_mute);
-    }
-}
-
 // Called after the audio filter chain is built or rebuilt.
 // (Can be called multiple times, even without mixer_uninit() in-between.)
-void mixer_reinit_audio(struct mixer *mixer, struct ao *ao, struct af_stream *af)
+void mixer_reinit_audio(struct mixer *mixer, struct af_stream *af)
 {
-    if (!ao || !af)
-        return;
-    mixer->ao = ao;
     mixer->af = af;
+    if (!af)
+        return;
 
-    MP_DBG(mixer, "Reinit...\n");
+    if (mixer->opts->softvol == SOFTVOL_NO)
+        MP_ERR(mixer, "--softvol=no is not supported anymore.\n");
 
-    probe_softvol(mixer);
-    restore_volume(mixer);
+    mixer_update_volume(mixer);
 
     if (mixer->balance != 0)
         mixer_setbalance(mixer, mixer->balance);
@@ -380,24 +148,5 @@ void mixer_uninit_audio(struct mixer *mixer)
     if (!mixer->ao)
         return;
 
-    MP_DBG(mixer, "Uninit...\n");
-
-    checkvolume(mixer);
-    if (mixer->muted_by_us && mixer->persistent_volume) {
-        MP_DBG(mixer, "Draining.\n");
-        /* Current audio output API combines playing the remaining buffered
-         * audio and uninitializing the AO into one operation, even though
-         * ideally unmute would happen between those two steps. We can't do
-         * volume changes after uninitialization, but we don't want the
-         * remaining audio to play at full volume either. Thus this
-         * workaround to drop remaining audio first. */
-        ao_reset(mixer->ao);
-        mixer_setmute(mixer, false);
-        /* We remember mute status and re-enable it if we play more audio
-         * in the same process. */
-        mixer->muted_by_us = true;
-    }
-    mixer->ao = NULL;
     mixer->af = NULL;
-    mixer->softvol = false;
 }
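The rewritten `mixer_update_volume()` collapses all the removed AO-volume machinery into one mapping: the user-facing percentage becomes a linear gain, and mute forces silence. A minimal sketch of that mapping (with `MPMAX` reimplemented here for self-containment):

```c
#include <stdbool.h>

#define MPMAX(a, b) ((a) > (b) ? (a) : (b))

// Sketch of the softvol gain computation from mixer_update_volume():
// negative legacy values clamp to 0, 100% maps to unity gain, and
// mute overrides everything.
static float softvol_gain(float volume_percent, bool mute)
{
    float gain = MPMAX(volume_percent / 100.0f, 0.0f);
    if (mute)
        gain = 0.0f;
    return gain;
}
```

The resulting gain is then handed to the filter chain (`AF_CONTROL_SET_VOLUME`); a gain of exactly 1.0 lets the code skip inserting an af_volume filter at all.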
diff --git a/audio/mixer.h b/audio/mixer.h
index 4e2ff35..b475c12 100644
--- a/audio/mixer.h
+++ b/audio/mixer.h
@@ -33,18 +33,11 @@ struct af_stream;
 struct mixer;
 
 struct mixer *mixer_init(void *talloc_ctx, struct mpv_global *global);
-void mixer_reinit_audio(struct mixer *mixer, struct ao *ao, struct af_stream *af);
+void mixer_reinit_audio(struct mixer *mixer, struct af_stream *af);
 void mixer_uninit_audio(struct mixer *mixer);
 bool mixer_audio_initialized(struct mixer *mixer);
-void mixer_getvolume(struct mixer *mixer, float *l, float *r);
-void mixer_setvolume(struct mixer *mixer, float l, float r);
-void mixer_addvolume(struct mixer *mixer, float step);
-void mixer_getbothvolume(struct mixer *mixer, float *b);
-void mixer_setmute(struct mixer *mixer, bool mute);
-bool mixer_getmute(struct mixer *mixer);
+void mixer_update_volume(struct mixer *mixer);
 void mixer_getbalance(struct mixer *mixer, float *bal);
 void mixer_setbalance(struct mixer *mixer, float bal);
-float mixer_getmaxvolume(struct mixer *mixer);
-char *mixer_get_volume_restore_data(struct mixer *mixer);
 
 #endif /* MPLAYER_MIXER_H */
diff --git a/audio/out/ao.c b/audio/out/ao.c
index 9c0f644..c9d8f42 100644
--- a/audio/out/ao.c
+++ b/audio/out/ao.c
@@ -490,10 +490,9 @@ struct ao_hotplug *ao_hotplug_create(struct mpv_global *global,
 static void get_devices(struct ao *ao, struct ao_device_list *list)
 {
     int num = list->num_devices;
-    if (ao->driver->list_devs)
+    if (ao->driver->list_devs) {
         ao->driver->list_devs(ao, list);
-    // Add at least a default entry
-    if (list->num_devices == num) {
+    } else {
         char name[80] = "Default";
         if (num > 1)
             mp_snprintf_cat(name, sizeof(name), " (%s)", ao->driver->name);
diff --git a/audio/out/ao_coreaudio_utils.c b/audio/out/ao_coreaudio_utils.c
index 8f9690f..0bcc0d6 100644
--- a/audio/out/ao_coreaudio_utils.c
+++ b/audio/out/ao_coreaudio_utils.c
@@ -114,6 +114,13 @@ OSStatus ca_select_device(struct ao *ao, char* name, AudioDeviceID *device)
             kAudioObjectSystemObject, &p_addr, 0, 0, &size, &v);
         CFRelease(uid);
         CHECK_CA_ERROR("unable to query for device UID");
+
+        uint32_t is_alive = 1;
+        err = CA_GET(*device, kAudioDevicePropertyDeviceIsAlive, &is_alive);
+        CHECK_CA_ERROR("could not check whether device is alive (invalid device?)");
+
+        if (!is_alive)
+            MP_WARN(ao, "device is not alive!\n");
     } else {
         // device not set by user, get the default one
         err = CA_GET(kAudioObjectSystemObject,
diff --git a/audio/out/ao_lavc.c b/audio/out/ao_lavc.c
index 572874d..6b4279c 100644
--- a/audio/out/ao_lavc.c
+++ b/audio/out/ao_lavc.c
@@ -39,8 +39,6 @@
 #include "common/encode_lavc.h"
 
 struct priv {
-    uint8_t *buffer;
-    size_t buffer_size;
     AVStream *stream;
     AVCodecContext *codec;
     int pcmhack;
@@ -146,18 +144,10 @@ static int init(struct ao *ao)
     if (ac->codec->frame_size <= 1)
         ac->pcmhack = av_get_bits_per_sample(ac->codec->codec_id) / 8;
 
-    if (ac->pcmhack) {
+    if (ac->pcmhack)
         ac->aframesize = 16384; // "enough"
-        ac->buffer_size =
-            ac->aframesize * ac->pcmhack * ao->channels.num * 2 + 200;
-    } else {
+    else
         ac->aframesize = ac->codec->frame_size;
-        ac->buffer_size =
-            ac->aframesize * ac->sample_size * ao->channels.num * 2 + 200;
-    }
-    if (ac->buffer_size < FF_MIN_BUFFER_SIZE)
-        ac->buffer_size = FF_MIN_BUFFER_SIZE;
-    ac->buffer = talloc_size(ac, ac->buffer_size);
 
     // enough frames for at least 0.25 seconds
     ac->framecount = ceil(ao->samplerate * 0.25 / ac->aframesize);
@@ -182,7 +172,7 @@ fail:
 }
 
 // close audio device
-static int encode(struct ao *ao, double apts, void **data);
+static void encode(struct ao *ao, double apts, void **data);
 static void uninit(struct ao *ao)
 {
     struct priv *ac = ao->priv;
@@ -199,12 +189,12 @@ static void uninit(struct ao *ao)
         return;
     }
 
-    if (ac->buffer) {
+    if (ac->stream) {
         double outpts = ac->expected_next_pts;
         if (!ectx->options->rawts && ectx->options->copyts)
             outpts += ectx->discontinuity_pts_offset;
         outpts += encode_lavc_getoffset(ectx, ac->codec);
-        while (encode(ao, outpts, NULL) > 0) ;
+        encode(ao, outpts, NULL);
     }
 
     pthread_mutex_unlock(&ectx->lock);
@@ -220,24 +210,130 @@ static int get_space(struct ao *ao)
     return ac->aframesize * ac->framecount;
 }
 
+static void write_packet(struct ao *ao, AVPacket *packet)
+{
+    // TODO: Can we unify this with the equivalent video code path?
+    struct priv *ac = ao->priv;
+
+    packet->stream_index = ac->stream->index;
+    if (packet->pts != AV_NOPTS_VALUE) {
+        packet->pts = av_rescale_q(packet->pts,
+                                   ac->codec->time_base,
+                                   ac->stream->time_base);
+    } else {
+        // Do we need this at all? Better be safe than sorry...
+        MP_WARN(ao, "encoder lost pts, why?\n");
+        if (ac->savepts != MP_NOPTS_VALUE) {
+            packet->pts = av_rescale_q(ac->savepts,
+                                       ac->codec->time_base,
+                                       ac->stream->time_base);
+        }
+    }
+    if (packet->dts != AV_NOPTS_VALUE) {
+        packet->dts = av_rescale_q(packet->dts,
+                                   ac->codec->time_base,
+                                   ac->stream->time_base);
+    }
+    if (packet->duration > 0) {
+        packet->duration = av_rescale_q(packet->duration,
+                                        ac->codec->time_base,
+                                        ac->stream->time_base);
+    }
+
+    ac->savepts = AV_NOPTS_VALUE;
+
+    if (encode_lavc_write_frame(ao->encode_lavc_ctx,
+                                ac->stream, packet) < 0) {
+        MP_ERR(ao, "error writing at %d %d/%d\n",
+               (int) packet->pts,
+               ac->stream->time_base.num,
+               ac->stream->time_base.den);
+        return;
+    }
+}
+
+static void encode_audio_and_write(struct ao *ao, AVFrame *frame)
+{
+    // TODO: Can we unify this with the equivalent video code path?
+    struct priv *ac = ao->priv;
+    AVPacket packet = {0};
+
+#if HAVE_AVCODEC_NEW_CODEC_API
+    int status = avcodec_send_frame(ac->codec, frame);
+    if (status < 0) {
+        MP_ERR(ao, "error encoding at %d %d/%d\n",
+               frame ? (int) frame->pts : -1,
+               ac->codec->time_base.num,
+               ac->codec->time_base.den);
+        return;
+    }
+    for (;;) {
+        av_init_packet(&packet);
+        status = avcodec_receive_packet(ac->codec, &packet);
+        if (status == AVERROR(EAGAIN)) { // No more packets for now.
+            if (frame == NULL) {
+                MP_ERR(ao, "sent flush frame, got EAGAIN");
+            }
+            break;
+        }
+        if (status == AVERROR_EOF) { // No more packets, ever.
+            if (frame != NULL) {
+                MP_ERR(ao, "sent audio frame, got EOF");
+            }
+            break;
+        }
+        if (status < 0) {
+            MP_ERR(ao, "error encoding at %d %d/%d\n",
+                   frame ? (int) frame->pts : -1,
+                   ac->codec->time_base.num,
+                   ac->codec->time_base.den);
+            break;
+        }
+        if (frame) {
+            if (ac->savepts == AV_NOPTS_VALUE)
+                ac->savepts = frame->pts;
+        }
+        encode_lavc_write_stats(ao->encode_lavc_ctx, ac->codec);
+        write_packet(ao, &packet);
+        av_packet_unref(&packet);
+    }
+#else
+    av_init_packet(&packet);
+    int got_packet = 0;
+    int status = avcodec_encode_audio2(ac->codec, &packet, frame, &got_packet);
+    if (status < 0) {
+        MP_ERR(ao, "error encoding at %d %d/%d\n",
+               frame ? (int) frame->pts : -1,
+               ac->codec->time_base.num,
+               ac->codec->time_base.den);
+        return;
+    }
+    if (!got_packet) {
+        return;
+    }
+    if (frame) {
+        if (ac->savepts == AV_NOPTS_VALUE)
+            ac->savepts = frame->pts;
+    }
+    encode_lavc_write_stats(ao->encode_lavc_ctx, ac->codec);
+    write_packet(ao, &packet);
+    av_packet_unref(&packet);
+#endif
+}
+
 // must get exactly ac->aframesize amount of data
-static int encode(struct ao *ao, double apts, void **data)
+static void encode(struct ao *ao, double apts, void **data)
 {
-    AVPacket packet;
     struct priv *ac = ao->priv;
     struct encode_lavc_context *ectx = ao->encode_lavc_ctx;
     double realapts = ac->aframecount * (double) ac->aframesize /
                       ao->samplerate;
-    int status, gotpacket;
 
     ac->aframecount++;
 
     if (data)
         ectx->audio_pts_offset = realapts - apts;
 
-    av_init_packet(&packet);
-    packet.data = ac->buffer;
-    packet.size = ac->buffer_size;
     if(data) {
         AVFrame *frame = av_frame_alloc();
         frame->format = af_to_avformat(ao->format);
@@ -270,64 +366,11 @@ static int encode(struct ao *ao, double apts, void **data)
         ac->lastpts = frame_pts;
 
         frame->quality = ac->codec->global_quality;
-        status = avcodec_encode_audio2(ac->codec, &packet, frame, &gotpacket);
-
-        if (!status) {
-            if (ac->savepts == AV_NOPTS_VALUE)
-                ac->savepts = frame->pts;
-        }
-
+        encode_audio_and_write(ao, frame);
         av_frame_free(&frame);
     }
     else
-    {
-        status = avcodec_encode_audio2(ac->codec, &packet, NULL, &gotpacket);
-    }
-
-    if(status) {
-        MP_ERR(ao, "error encoding\n");
-        return -1;
-    }
-
-    if(!gotpacket)
-        return 0;
-
-    MP_DBG(ao, "got pts %f (playback time: %f); out size: %d\n",
-           apts, realapts, packet.size);
-
-    encode_lavc_write_stats(ao->encode_lavc_ctx, ac->codec);
-
-    packet.stream_index = ac->stream->index;
-
-    // Do we need this at all? Better be safe than sorry...
-    if (packet.pts == AV_NOPTS_VALUE) {
-        MP_WARN(ao, "encoder lost pts, why?\n");
-        if (ac->savepts != MP_NOPTS_VALUE)
-            packet.pts = ac->savepts;
-    }
-
-    if (packet.pts != AV_NOPTS_VALUE)
-        packet.pts = av_rescale_q(packet.pts, ac->codec->time_base,
-                ac->stream->time_base);
-
-    if (packet.dts != AV_NOPTS_VALUE)
-        packet.dts = av_rescale_q(packet.dts, ac->codec->time_base,
-                ac->stream->time_base);
-
-    if(packet.duration > 0)
-        packet.duration = av_rescale_q(packet.duration, ac->codec->time_base,
-                ac->stream->time_base);
-
-    ac->savepts = AV_NOPTS_VALUE;
-
-    if (encode_lavc_write_frame(ao->encode_lavc_ctx, ac->stream, &packet) < 0) {
-        MP_ERR(ao, "error writing at %f %f/%f\n",
-               realapts, (double) ac->stream->time_base.num,
-               (double) ac->stream->time_base.den);
-        return -1;
-    }
-
-    return packet.size;
+        encode_audio_and_write(ao, NULL);
 }
 
 // this should round samples down to frame sizes
@@ -492,3 +535,5 @@ const struct ao_driver audio_out_lavc = {
     .play      = play,
     .drain     = drain,
 };
+
+// vim: sw=4 ts=4 et tw=80
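The new `encode_audio_and_write()` adopts FFmpeg's send/receive encode protocol: submit one frame (or NULL to flush), then drain packets until the encoder reports "try again" or end-of-stream. The following toy model sketches that control flow without depending on libavcodec; the enum values and stub encoder are illustrative, not FFmpeg's API:

```c
#include <stddef.h>

// Toy stand-ins for AVERROR(EAGAIN) / AVERROR_EOF semantics.
enum { RECV_OK, RECV_AGAIN, RECV_EOF };

struct toy_encoder { int queued; int flushed; };

static void toy_send(struct toy_encoder *e, const int *frame)
{
    if (frame)
        e->queued += 1;  // a real encoder may buffer several frames
    else
        e->flushed = 1;  // NULL frame switches the encoder to drain mode
}

static int toy_receive(struct toy_encoder *e, int *pkt)
{
    if (e->queued > 0) { e->queued--; *pkt = 1; return RECV_OK; }
    return e->flushed ? RECV_EOF : RECV_AGAIN;
}

// Drain loop with the same shape as encode_audio_and_write(): send once,
// then receive until AGAIN (mid-stream) or EOF (after flushing).
static int encode_and_count(struct toy_encoder *e, const int *frame)
{
    int packets = 0, pkt;
    toy_send(e, frame);
    for (;;) {
        int st = toy_receive(e, &pkt);
        if (st == RECV_AGAIN || st == RECV_EOF)
            break;
        packets++;  // write_packet() would run here in the real code
    }
    return packets;
}
```

This one-send/many-receive loop is also why the old `while (encode(...) > 0)` flush in `uninit()` could become a single `encode(ao, outpts, NULL)` call: the drain now happens inside the helper.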
diff --git a/audio/out/ao_oss.c b/audio/out/ao_oss.c
index 3216d67..90d3b3e 100644
--- a/audio/out/ao_oss.c
+++ b/audio/out/ao_oss.c
@@ -612,6 +612,12 @@ static int audio_wait(struct ao *ao, pthread_mutex_t *lock)
     return r;
 }
 
+static void list_devs(struct ao *ao, struct ao_device_list *list)
+{
+    if (stat(PATH_DEV_DSP, &(struct stat){0}) == 0)
+        ao_device_list_add(list, ao, &(struct ao_device_desc){"", "Default"});
+}
+
 #define OPT_BASE_STRUCT struct priv
 
 const struct ao_driver audio_out_oss = {
@@ -629,6 +635,7 @@ const struct ao_driver audio_out_oss = {
     .drain     = drain,
     .wait      = audio_wait,
     .wakeup    = ao_wakeup_poll,
+    .list_devs = list_devs,
     .priv_size = sizeof(struct priv),
     .priv_defaults = &(const struct priv) {
         .audio_fd = -1,
diff --git a/audio/out/ao_wasapi.c b/audio/out/ao_wasapi.c
index ae6bd3d..325a7cf 100644
--- a/audio/out/ao_wasapi.c
+++ b/audio/out/ao_wasapi.c
@@ -132,7 +132,7 @@ static bool thread_feed(struct ao *ao)
                  mp_time_us() + (int64_t)llrint(delay_us));
 
     // note, we can't use ao_read_data return value here since we already
-    // commited to frame_count above in the GetBuffer call
+    // committed to frame_count above in the GetBuffer call
     hr = IAudioRenderClient_ReleaseBuffer(state->pRenderClient,
                                           frame_count, 0);
     EXIT_ON_ERROR(hr);
diff --git a/audio/out/ao_wasapi_changenotify.c b/audio/out/ao_wasapi_changenotify.c
index e3ca4e4..46843ec 100644
--- a/audio/out/ao_wasapi_changenotify.c
+++ b/audio/out/ao_wasapi_changenotify.c
@@ -95,7 +95,7 @@ static HRESULT STDMETHODCALLTYPE sIMMNotificationClient_OnDeviceAdded(
     return S_OK;
 }
 
-// maybe MPV can go over to the prefered device once it is plugged in?
+// maybe MPV can go over to the preferred device once it is plugged in?
 static HRESULT STDMETHODCALLTYPE sIMMNotificationClient_OnDeviceRemoved(
     IMMNotificationClient *This,
     LPCWSTR pwstrDeviceId)
diff --git a/common/av_log.c b/common/av_log.c
index 8049728..64ce26d 100644
--- a/common/av_log.c
+++ b/common/av_log.c
@@ -189,7 +189,7 @@ struct lib {
     unsigned runv;
 };
 
-void print_libav_versions(struct mp_log *log, int v)
+bool print_libav_versions(struct mp_log *log, int v)
 {
     const struct lib libs[] = {
         {"libavutil",     LIBAVUTIL_VERSION_INT,     avutil_version()},
@@ -222,14 +222,7 @@ void print_libav_versions(struct mp_log *log, int v)
     mp_msg(log, v, "%s version: %s\n", LIB_PREFIX, av_version_info());
 #endif
 
-    if (mismatch) {
-        // Using mismatched libraries can be legitimate, but even then it's
-        // a bad idea. We don't acknowledge its usefulness and stability.
-        mp_warn(log, "Warning: mpv was compiled against a different version of "
-                "%s than the shared\nlibrary it is linked against. This is "
-                "most likely a broken build\nand misbehavior and crashes are "
-                "to be expected.\n", LIB_PREFIX);
-    }
+    return !mismatch;
 }
 
 #undef V
diff --git a/common/av_log.h b/common/av_log.h
index 17326b6..18f7fc9 100644
--- a/common/av_log.h
+++ b/common/av_log.h
@@ -1,8 +1,11 @@
 #ifndef MP_AV_LOG_H
 #define MP_AV_LOG_H
+
+#include <stdbool.h>
+
 struct mpv_global;
 struct mp_log;
 void init_libav(struct mpv_global *global);
 void uninit_libav(struct mpv_global *global);
-void print_libav_versions(struct mp_log *log, int v);
+bool print_libav_versions(struct mp_log *log, int v);
 #endif
diff --git a/input/input.c b/input/input.c
index b6ed77d..dd9486f 100644
--- a/input/input.c
+++ b/input/input.c
@@ -1247,12 +1247,9 @@ void mp_input_load(struct input_ctx *ictx)
         void *tmp = talloc_new(NULL);
         char **files = mp_find_all_config_files(tmp, ictx->global, "input.conf");
         for (int n = 0; files && files[n]; n++)
-            config_ok = config_ok | parse_config_file(ictx, files[n], false);
+            parse_config_file(ictx, files[n], false);
         talloc_free(tmp);
     }
-    if (!config_ok) {
-        MP_VERBOSE(ictx, "Falling back on default (hardcoded) input config\n");
-    }
 
     if (input_conf->use_alt_gr) {
         ictx->using_alt_gr = true;
diff --git a/libmpv/client.h b/libmpv/client.h
index 1980cea..3f5d759 100644
--- a/libmpv/client.h
+++ b/libmpv/client.h
@@ -1446,7 +1446,7 @@ int mpv_request_event(mpv_handle *ctx, mpv_event_id event, int enable);
  * required log level for a message to be received with MPV_EVENT_LOG_MESSAGE.
  *
  * @param min_level Minimal log level as string. Valid log levels:
- *                      no fatal error warn info status v debug trace
+ *                      no fatal error warn info v debug trace
  *                  The value "no" disables all messages. This is the default.
  *                  An exception is the value "terminal-default", which uses the
  *                  log level as set by the "--msg-level" option. This works
diff --git a/misc/bstr.h b/misc/bstr.h
index 2785520..4aba35e 100644
--- a/misc/bstr.h
+++ b/misc/bstr.h
@@ -116,7 +116,7 @@ int bstr_validate_utf8(struct bstr s);
 // talloc, with talloc_ctx as parent.
 struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s);
 
-// Return the text before the occurance of a character, and return it. Change
+// Return the text before the occurrence of a character, and return it. Change
 // *rest to point to the text following this character. (rest can be NULL.)
 struct bstr bstr_splitchar(struct bstr str, struct bstr *rest, const char c);
 
diff --git a/options/m_config.c b/options/m_config.c
index a3dcb30..1cb01eb 100644
--- a/options/m_config.c
+++ b/options/m_config.c
@@ -499,6 +499,13 @@ struct m_config_option *m_config_get_co(const struct m_config *config,
                     co->warning_was_printed = true;
                 }
                 return NULL;
+            } else if (co->opt->deprecation_message) {
+                if (!co->warning_was_printed) {
+                    MP_WARN(config, "Warning: option %s%s is deprecated "
+                            "and might be removed in the future (%s).\n",
+                            prefix, co->name, co->opt->deprecation_message);
+                    co->warning_was_printed = true;
+                }
             }
             return co;
         }
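The m_config.c hunk prints a deprecation notice at most once per option, gated by the option's existing `warning_was_printed` flag. A minimal sketch of that warn-once pattern, with illustrative struct fields (the real code reads `co->opt->deprecation_message` inside `m_config_get_co()`):

```c
#include <stdbool.h>
#include <stdio.h>

struct opt {
    const char *name;
    const char *deprecation_message; // NULL when not deprecated
    bool warning_was_printed;
};

// Sketch: emit the deprecation warning the first time the option is
// looked up; return true only on the call that actually printed.
static bool warn_if_deprecated(struct opt *o)
{
    if (!o->deprecation_message || o->warning_was_printed)
        return false;
    fprintf(stderr, "Warning: option %s is deprecated (%s).\n",
            o->name, o->deprecation_message);
    o->warning_was_printed = true;
    return true;
}
```

The options.c hunk further below shows the declaration side: `--heartbeat-cmd` simply sets `.deprecation_message = "use Lua scripting instead"` on its option entry.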
diff --git a/options/m_option.h b/options/m_option.h
index 7e65506..e77452a 100644
--- a/options/m_option.h
+++ b/options/m_option.h
@@ -333,6 +333,10 @@ struct m_option {
 
     // Initialize variable to given default before parsing options
     const void *defval;
+
+    // Print a warning when this option is used (for options with no direct
+    // replacement).
+    const char *deprecation_message;
 };
 
 
@@ -394,7 +398,7 @@ struct m_option {
 
 // Dynamic data type.
 /** This flag indicates that the data is dynamically allocated (m_option::p
- *  points to a pointer). It enables a little hack in the \ref Config wich
+ *  points to a pointer). It enables a little hack in the \ref Config which
  *  replaces the initial value of such variables with a dynamic copy in case
  *  the initial value is statically allocated (pretty common with strings).
  */
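The new `deprecation_message` field and the m_config.c hunk above implement a print-once deprecation warning: the first lookup of a deprecated option warns, and `warning_was_printed` suppresses repeats. A minimal standalone sketch of that pattern (hypothetical simplified structs, not mpv's real `m_option`/`m_config_option`):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Hypothetical, simplified stand-ins for mpv's option structs.
struct opt {
    const char *name;
    const char *deprecation_message; // NULL when the option is not deprecated
};

struct opt_state {
    const struct opt *opt;
    bool warning_was_printed;
};

// Resolve an option by name, warning at most once if it is deprecated --
// the same print-once pattern the m_config.c hunk uses.
static const struct opt *lookup_opt(struct opt_state *opts, int n,
                                    const char *name)
{
    for (int i = 0; i < n; i++) {
        struct opt_state *co = &opts[i];
        if (strcmp(co->opt->name, name) != 0)
            continue;
        if (co->opt->deprecation_message && !co->warning_was_printed) {
            fprintf(stderr, "Warning: option --%s is deprecated "
                    "and might be removed in the future (%s).\n",
                    co->opt->name, co->opt->deprecation_message);
            co->warning_was_printed = true;
        }
        return co->opt;
    }
    return NULL;
}
```

Repeated lookups of the same deprecated option stay silent after the first warning.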
diff --git a/options/options.c b/options/options.c
index bb2ccb5..e58eea1 100644
--- a/options/options.c
+++ b/options/options.c
@@ -410,13 +410,13 @@ const m_option_t mp_opts[] = {
                ({"no", SOFTVOL_NO},
                 {"yes", SOFTVOL_YES},
                 {"auto", SOFTVOL_AUTO})),
-    OPT_FLOATRANGE("softvol-max", softvol_max, 0, 100, 1000),
-    OPT_FLOATRANGE("volume", mixer_init_volume, 0, -1, 1000),
-    OPT_CHOICE("mute", mixer_init_mute, 0,
+    OPT_FLOATRANGE("volume-max", softvol_max, 0, 100, 1000),
+    // values <0 for volume and mute are legacy and ignored
+    OPT_FLOATRANGE("volume", softvol_volume, 0, -1, 1000),
+    OPT_CHOICE("mute", softvol_mute, 0,
                ({"auto", -1},
                 {"no", 0},
                 {"yes", 1})),
-    OPT_STRING("volume-restore-data", mixer_restore_volume_data, 0),
     OPT_CHOICE("gapless-audio", gapless_audio, 0,
                ({"no", 0},
                 {"yes", 1},
@@ -469,7 +469,8 @@ const m_option_t mp_opts[] = {
     OPT_STRING("vo-mmcss-profile", vo.mmcss_profile, M_OPT_FIXED),
 #endif
 
-    OPT_STRING("heartbeat-cmd", heartbeat_cmd, 0),
+    OPT_STRING("heartbeat-cmd", heartbeat_cmd, 0,
+               .deprecation_message = "use Lua scripting instead"),
     OPT_FLOAT("heartbeat-interval", heartbeat_interval, CONF_MIN, 0),
 
     OPT_CHOICE_OR_INT("screen", vo.screen_id, 0, 0, 32,
@@ -684,6 +685,7 @@ const m_option_t mp_opts[] = {
     OPT_REPLACED("ass-use-margins", "sub-use-margins"),
     OPT_REPLACED("media-title", "force-media-title"),
     OPT_REPLACED("input-unix-socket", "input-ipc-server"),
+    OPT_REPLACED("softvol-max", "volume-max"),
 
     {0}
 };
@@ -697,8 +699,8 @@ const struct MPOpts mp_default_opts = {
     .deinterlace = -1,
     .softvol = SOFTVOL_AUTO,
     .softvol_max = 130,
-    .mixer_init_volume = -1,
-    .mixer_init_mute = -1,
+    .softvol_volume = 100,
+    .softvol_mute = 0,
     .gapless_audio = -1,
     .audio_buffer = 0.2,
     .audio_device = "auto",
diff --git a/options/options.h b/options/options.h
index 5dcc642..3e8474f 100644
--- a/options/options.h
+++ b/options/options.h
@@ -89,9 +89,8 @@ typedef struct MPOpts {
     int ao_null_fallback;
     int force_vo;
     int softvol;
-    float mixer_init_volume;
-    int mixer_init_mute;
-    char *mixer_restore_volume_data;
+    float softvol_volume;
+    int softvol_mute;
     float softvol_max;
     int gapless_audio;
     double audio_buffer;
diff --git a/osdep/ar/HIDRemote.h b/osdep/ar/HIDRemote.h
index 9ea01d1..35db408 100644
--- a/osdep/ar/HIDRemote.h
+++ b/osdep/ar/HIDRemote.h
@@ -74,7 +74,7 @@
 typedef enum
 {
         kHIDRemoteModeNone = 0L,
-        kHIDRemoteModeShared,           // Share the remote with others - let's you listen to the remote control events as long as noone has an exclusive lock on it
+        kHIDRemoteModeShared,           // Share the remote with others - lets you listen to the remote control events as long as no one has an exclusive lock on it
                                         // (RECOMMENDED ONLY FOR SPECIAL PURPOSES)
 
         kHIDRemoteModeExclusive,        // Try to acquire an exclusive lock on the remote (NOT RECOMMENDED)
@@ -182,7 +182,7 @@ typedef enum
 - (BOOL)hidRemote:(HIDRemote *)hidRemote                                // Invoked when new hardware is inspected
         inspectNewHardwareWithService:(io_service_t)service             //
         prematchResult:(BOOL)prematchResult;                            // Return YES if HIDRemote should go on with this hardware and try
-                                                                        // to use it, or NO if it should not be persued further.
+                                                                        // to use it, or NO if it should not be pursued further.
 
 // Exclusive lock lending
 - (BOOL)hidRemote:(HIDRemote *)hidRemote
diff --git a/osdep/ar/HIDRemote.m b/osdep/ar/HIDRemote.m
index f62289e..47e35f4 100644
--- a/osdep/ar/HIDRemote.m
+++ b/osdep/ar/HIDRemote.m
@@ -293,7 +293,7 @@ static HIDRemote *sHIDRemote = nil;
 
                 }while(0);
 
-                // An error occured. Do necessary clean up.
+                // An error occurred. Do necessary clean up.
                 if (matchDict!=NULL)
                 {
                         CFRelease(matchDict);
@@ -1422,7 +1422,7 @@ static HIDRemote *sHIDRemote = nil;
                 [((NSObject <HIDRemoteDelegate> *)[self delegate]) hidRemote:self failedNewHardwareWithError:error];
         }
 
-        // An error occured or this device is not of interest .. cleanup ..
+        // An error occurred or this device is not of interest .. cleanup ..
         if (serviceNotification!=0)
         {
                 IOObjectRelease(serviceNotification);
@@ -1615,7 +1615,7 @@ static HIDRemote *sHIDRemote = nil;
         switch (buttonCode)
         {
                 case kHIDRemoteButtonCodeIDChanged:
-                        // Do nothing, this is handled seperately
+                        // Do nothing, this is handled separately
                 break;
 
                 case kHIDRemoteButtonCodeUp:
diff --git a/osdep/subprocess.c b/osdep/subprocess.c
index dbd6100..bc18f44 100644
--- a/osdep/subprocess.c
+++ b/osdep/subprocess.c
@@ -45,6 +45,10 @@ static void *run_subprocess(void *ptr)
     return NULL;
 }
 
+void mp_devnull(void *ctx, char *data, size_t size)
+{
+}
+
 void mp_subprocess_detached(struct mp_log *log, char **args)
 {
     struct subprocess_args *p = talloc_zero(NULL, struct subprocess_args);
diff --git a/osdep/subprocess.h b/osdep/subprocess.h
index a32e791..f272e1a 100644
--- a/osdep/subprocess.h
+++ b/osdep/subprocess.h
@@ -24,6 +24,8 @@ struct mp_cancel;
 
 typedef void (*subprocess_read_cb)(void *ctx, char *data, size_t size);
 
+void mp_devnull(void *ctx, char *data, size_t size);
+
 // Start a subprocess. Uses callbacks to read from stdout and stderr.
 int mp_subprocess(char **args, struct mp_cancel *cancel, void *ctx,
                   subprocess_read_cb on_stdout, subprocess_read_cb on_stderr,
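The new `mp_devnull` is simply a no-op `subprocess_read_cb`, so callers can discard one of the child's streams instead of special-casing a NULL callback. A standalone sketch of the callback-driven read pattern (toy stand-in, not mpv's real `mp_subprocess`):

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

typedef void (*subprocess_read_cb)(void *ctx, char *data, size_t size);

// Swallow the data, like the new mp_devnull.
static void devnull_cb(void *ctx, char *data, size_t size)
{
    (void)ctx; (void)data; (void)size;
}

// Append the data to a caller-provided char buffer (ctx).
static void collect_cb(void *ctx, char *data, size_t size)
{
    strncat(ctx, data, size);
}

// Toy stand-in for a child process emitting stdout and stderr chunks.
static void run_fake_process(void *ctx, subprocess_read_cb on_stdout,
                             subprocess_read_cb on_stderr)
{
    on_stdout(ctx, (char *)"out1 ", 5);
    on_stderr(ctx, (char *)"noise", 5);
    on_stdout(ctx, (char *)"out2", 4);
}
```

Passing `devnull_cb` for stderr keeps only stdout, with no NULL checks in the reader loop.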
diff --git a/player/audio.c b/player/audio.c
index 2ce1669..b61e464 100644
--- a/player/audio.c
+++ b/player/audio.c
@@ -132,7 +132,7 @@ static int recreate_audio_filters(struct MPContext *mpctx)
     if (afs->initialized < 1 && af_init(afs) < 0)
         goto fail;
 
-    mixer_reinit_audio(mpctx->mixer, mpctx->ao, afs);
+    mixer_reinit_audio(mpctx->mixer, afs);
 
     mp_notify(mpctx, MPV_EVENT_AUDIO_RECONFIG, NULL);
 
diff --git a/player/command.c b/player/command.c
index bc48d8c..3bca64b 100644
--- a/player/command.c
+++ b/player/command.c
@@ -60,6 +60,7 @@
 #include "audio/filter/af.h"
 #include "video/decode/dec_video.h"
 #include "audio/decode/dec_audio.h"
+#include "video/out/bitmap_packer.h"
 #include "options/path.h"
 #include "screenshot.h"
 
@@ -88,7 +89,8 @@ struct command_ctx {
     // One of these is in use by the OSD; the other one exists so that the
     // bitmap list can be manipulated without additional synchronization.
     struct sub_bitmaps overlay_osd[2];
-    struct sub_bitmaps *overlay_osd_current;
+    int overlay_osd_current;
+    struct bitmap_packer *overlay_packer;
 
     struct hook_handler **hooks;
     int num_hooks;
@@ -98,9 +100,8 @@ struct command_ctx {
 };
 
 struct overlay {
-    void *map_start;
-    size_t map_size;
-    struct sub_bitmap osd;
+    struct mp_image *source;
+    int x, y;
 };
 
 struct hook_handler {
@@ -1568,44 +1569,29 @@ static int mp_property_volume(void *ctx, struct m_property *prop,
                               int action, void *arg)
 {
     MPContext *mpctx = ctx;
+    struct MPOpts *opts = mpctx->opts;
+
     switch (action) {
-    case M_PROPERTY_GET:
-        mixer_getbothvolume(mpctx->mixer, arg);
-        return M_PROPERTY_OK;
     case M_PROPERTY_GET_TYPE:
         *(struct m_option *)arg = (struct m_option){
             .type = CONF_TYPE_FLOAT,
             .flags = M_OPT_RANGE,
             .min = 0,
-            .max = mixer_getmaxvolume(mpctx->mixer),
+            .max = opts->softvol_max,
         };
         return M_PROPERTY_OK;
     case M_PROPERTY_GET_NEUTRAL:
         *(float *)arg = 100;
         return M_PROPERTY_OK;
-    case M_PROPERTY_PRINT: {
-        float val;
-        mixer_getbothvolume(mpctx->mixer, &val);
-        *(char **)arg = talloc_asprintf(NULL, "%i", (int)val);
-        return M_PROPERTY_OK;
-    }
-    case M_PROPERTY_SET:
-        mixer_setvolume(mpctx->mixer, *(float *) arg, *(float *) arg);
-        return M_PROPERTY_OK;
-    case M_PROPERTY_SWITCH: {
-        struct m_property_switch_arg *sarg = arg;
-        mixer_addvolume(mpctx->mixer, sarg->inc);
+    case M_PROPERTY_PRINT:
+        *(char **)arg = talloc_asprintf(NULL, "%i", (int)opts->softvol_volume);
         return M_PROPERTY_OK;
     }
-    }
-    return M_PROPERTY_NOT_IMPLEMENTED;
-}
 
-static int mp_property_volume_max(void *ctx, struct m_property *prop,
-                                  int action, void *arg)
-{
-    MPContext *mpctx = ctx;
-    return m_property_float_ro(action, arg, mixer_getmaxvolume(mpctx->mixer));
+    int r = mp_property_generic_option(mpctx, prop, action, arg);
+    if (action == M_PROPERTY_SET)
+        mixer_update_volume(mpctx->mixer);
+    return r;
 }
 
 /// Mute (RW)
@@ -1613,34 +1599,76 @@ static int mp_property_mute(void *ctx, struct m_property *prop,
                             int action, void *arg)
 {
     MPContext *mpctx = ctx;
+
+    if (action == M_PROPERTY_GET_TYPE) {
+        *(struct m_option *)arg = (struct m_option){.type = CONF_TYPE_FLAG};
+        return M_PROPERTY_OK;
+    }
+
+    int r = mp_property_generic_option(mpctx, prop, action, arg);
+    if (action == M_PROPERTY_SET)
+        mixer_update_volume(mpctx->mixer);
+    return r;
+}
+
+static int mp_property_ao_volume(void *ctx, struct m_property *prop,
+                                 int action, void *arg)
+{
+    MPContext *mpctx = ctx;
+    struct ao *ao = mpctx->ao;
+    if (!ao)
+        return M_PROPERTY_NOT_IMPLEMENTED;
+
     switch (action) {
-    case M_PROPERTY_SET:
-        mixer_setmute(mpctx->mixer, *(int *) arg);
+    case M_PROPERTY_SET: {
+        float value = *(float *)arg;
+        ao_control_vol_t vol = {value, value};
+        if (ao_control(ao, AOCONTROL_SET_VOLUME, &vol) != CONTROL_OK)
+            return M_PROPERTY_UNAVAILABLE;
         return M_PROPERTY_OK;
-    case M_PROPERTY_GET:
-        *(int *)arg =  mixer_getmute(mpctx->mixer);
+    }
+    case M_PROPERTY_GET: {
+        ao_control_vol_t vol = {0};
+        if (ao_control(ao, AOCONTROL_GET_VOLUME, &vol) != CONTROL_OK)
+            return M_PROPERTY_UNAVAILABLE;
+        *(float *)arg = (vol.left + vol.right) / 2.0f;
         return M_PROPERTY_OK;
+    }
     case M_PROPERTY_GET_TYPE:
-        *(struct m_option *)arg = (struct m_option){.type = CONF_TYPE_FLAG};
+        *(struct m_option *)arg = (struct m_option){.type = CONF_TYPE_FLOAT};
         return M_PROPERTY_OK;
     }
     return M_PROPERTY_NOT_IMPLEMENTED;
 }
 
-static int mp_property_volrestore(void *ctx, struct m_property *prop,
-                                  int action, void *arg)
+
+static int mp_property_ao_mute(void *ctx, struct m_property *prop,
+                               int action, void *arg)
 {
     MPContext *mpctx = ctx;
+    struct ao *ao = mpctx->ao;
+    if (!ao)
+        return M_PROPERTY_NOT_IMPLEMENTED;
+
     switch (action) {
+    case M_PROPERTY_SET: {
+        bool value = *(int *)arg;
+        if (ao_control(ao, AOCONTROL_SET_MUTE, &value) != CONTROL_OK)
+            return M_PROPERTY_UNAVAILABLE;
+        return M_PROPERTY_OK;
+    }
     case M_PROPERTY_GET: {
-        char *s = mixer_get_volume_restore_data(mpctx->mixer);
-        *(char **)arg = s;
-        return s ? M_PROPERTY_OK : M_PROPERTY_UNAVAILABLE;
+        bool value = false;
+        if (ao_control(ao, AOCONTROL_GET_MUTE, &value) != CONTROL_OK)
+            return M_PROPERTY_UNAVAILABLE;
+        *(int *)arg = value;
+        return M_PROPERTY_OK;
     }
-    case M_PROPERTY_SET:
-        return M_PROPERTY_NOT_IMPLEMENTED;
+    case M_PROPERTY_GET_TYPE:
+        *(struct m_option *)arg = (struct m_option){.type = CONF_TYPE_FLAG};
+        return M_PROPERTY_OK;
     }
-    return mp_property_generic_option(mpctx, prop, action, arg);
+    return M_PROPERTY_NOT_IMPLEMENTED;
 }
 
 static int get_device_entry(int item, int action, void *arg, void *ctx)
@@ -2257,88 +2285,6 @@ static int mp_property_detected_hwdec(void *ctx, struct m_property *prop,
     return M_PROPERTY_NOT_IMPLEMENTED;
 }
 
-#define VF_DEINTERLACE_LABEL "deinterlace"
-
-static bool probe_deint_filter(struct MPContext *mpctx, const char *filt)
-{
-    char filter[80];
-    // add a label so that removing the filter is easier
-    snprintf(filter, sizeof(filter), "@%s:%s", VF_DEINTERLACE_LABEL, filt);
-    return edit_filters(mpctx, mp_null_log, STREAM_VIDEO, "pre", filter) >= 0;
-}
-
-static bool check_output_format(struct MPContext *mpctx, int imgfmt)
-{
-    struct vo_chain *vo_c = mpctx->vo_chain;
-    if (!vo_c)
-        return false;
-    return vo_c->vf->allowed_output_formats[imgfmt - IMGFMT_START];
-}
-
-static int probe_deint_filters(struct MPContext *mpctx)
-{
-    if (check_output_format(mpctx, IMGFMT_VDPAU)) {
-        char filter[80] = "vdpaupp:deint=yes";
-        int pref = 0;
-        vo_control(mpctx->video_out, VOCTRL_GET_PREF_DEINT, &pref);
-        pref = pref < 0 ? -pref : pref;
-        if (pref > 0 && pref <= 4) {
-            const char *types[] =
-                {"", "first-field", "bob", "temporal", "temporal-spatial"};
-            mp_snprintf_cat(filter, sizeof(filter), ":deint-mode=%s",
-                            types[pref]);
-        }
-
-        probe_deint_filter(mpctx, filter);
-        return 0;
-    }
-    if (check_output_format(mpctx, IMGFMT_VAAPI) &&
-        probe_deint_filter(mpctx, "vavpp"))
-        return 0;
-    if ((check_output_format(mpctx, IMGFMT_D3D11VA) ||
-         check_output_format(mpctx, IMGFMT_D3D11NV12)) &&
-        probe_deint_filter(mpctx, "d3d11vpp"))
-        return 0;
-    if (probe_deint_filter(mpctx, "yadif"))
-        return 0;
-    return -1;
-}
-
-static int get_deinterlacing(struct MPContext *mpctx)
-{
-    struct vo_chain *vo_c = mpctx->vo_chain;
-    int enabled = 0;
-    if (video_vf_vo_control(vo_c, VFCTRL_GET_DEINTERLACE, &enabled) != CONTROL_OK)
-        enabled = -1;
-    if (enabled < 0) {
-        // vf_lavfi doesn't support VFCTRL_GET_DEINTERLACE
-        if (vf_find_by_label(vo_c->vf, VF_DEINTERLACE_LABEL))
-            enabled = 1;
-    }
-    return enabled;
-}
-
-void remove_deint_filter(struct MPContext *mpctx)
-{
-    edit_filters(mpctx, mp_null_log, STREAM_VIDEO, "del", "@" VF_DEINTERLACE_LABEL);
-}
-
-void set_deinterlacing(struct MPContext *mpctx, bool enable)
-{
-    struct vo_chain *vo_c = mpctx->vo_chain;
-    if (vf_find_by_label(vo_c->vf, VF_DEINTERLACE_LABEL)) {
-        if (!enable)
-            remove_deint_filter(mpctx);
-    } else {
-        if ((get_deinterlacing(mpctx) > 0) != enable) {
-            int arg = enable;
-            if (video_vf_vo_control(vo_c, VFCTRL_SET_DEINTERLACE, &arg) != CONTROL_OK)
-                probe_deint_filters(mpctx);
-        }
-    }
-    mpctx->opts->deinterlace = get_deinterlacing(mpctx) > 0;
-}
-
 static int mp_property_deinterlace(void *ctx, struct m_property *prop,
                                    int action, void *arg)
 {
@@ -2568,13 +2514,13 @@ static int property_imgparams(struct mp_image_params p, int action, void *arg)
         {"aspect",          SUB_PROP_FLOAT(d_w / (double)d_h)},
         {"par",             SUB_PROP_FLOAT(p.p_w / (double)p.p_h)},
         {"colormatrix",
-            SUB_PROP_STR(m_opt_choice_str(mp_csp_names, p.colorspace))},
+            SUB_PROP_STR(m_opt_choice_str(mp_csp_names, p.color.space))},
         {"colorlevels",
-            SUB_PROP_STR(m_opt_choice_str(mp_csp_levels_names, p.colorlevels))},
+            SUB_PROP_STR(m_opt_choice_str(mp_csp_levels_names, p.color.levels))},
         {"primaries",
-            SUB_PROP_STR(m_opt_choice_str(mp_csp_prim_names, p.primaries))},
+            SUB_PROP_STR(m_opt_choice_str(mp_csp_prim_names, p.color.primaries))},
         {"gamma",
-            SUB_PROP_STR(m_opt_choice_str(mp_csp_trc_names, p.gamma))},
+            SUB_PROP_STR(m_opt_choice_str(mp_csp_trc_names, p.color.gamma))},
         {"chroma-location",
             SUB_PROP_STR(m_opt_choice_str(mp_chroma_names, p.chroma_location))},
         {"stereo-in",
@@ -3751,8 +3697,10 @@ static const struct m_property mp_properties[] = {
     // Audio
     {"mixer-active", mp_property_mixer_active},
     {"volume", mp_property_volume},
-    {"volume-max", mp_property_volume_max},
+    {"volume-max", mp_property_generic_option},
     {"mute", mp_property_mute},
+    {"ao-volume", mp_property_ao_volume},
+    {"ao-mute", mp_property_ao_mute},
     {"audio-delay", mp_property_audio_delay},
     {"audio-codec-name", mp_property_audio_codec_name},
     {"audio-codec", mp_property_audio_codec},
@@ -3762,7 +3710,6 @@ static const struct m_property mp_properties[] = {
     M_PROPERTY_DEPRECATED_ALIAS("audio-channels", "audio-params/channel-count"),
     {"aid", mp_property_audio},
     {"balance", mp_property_balance},
-    {"volume-restore-data", mp_property_volrestore},
     {"audio-device", mp_property_audio_device},
     {"audio-device-list", mp_property_audio_devices},
     {"current-ao", mp_property_ao},
@@ -3940,7 +3887,7 @@ static const char *const *const mp_event_property_change[] = {
       "colormatrix-output-range", "colormatrix-primaries", "video-aspect"),
     E(MPV_EVENT_AUDIO_RECONFIG, "audio-format", "audio-codec", "audio-bitrate",
       "samplerate", "channels", "audio", "volume", "mute", "balance",
-      "volume-restore-data", "current-ao", "audio-codec-name", "audio-params",
+      "current-ao", "audio-codec-name", "audio-params",
       "audio-out-params", "volume-max", "mixer-active"),
     E(MPV_EVENT_SEEK, "seeking", "core-idle", "eof-reached"),
     E(MPV_EVENT_PLAYBACK_RESTART, "seeking", "core-idle", "eof-reached"),
@@ -4314,20 +4261,85 @@ static int edit_filters_osd(struct MPContext *mpctx, enum stream_type mediatype,
 static void recreate_overlays(struct MPContext *mpctx)
 {
     struct command_ctx *cmd = mpctx->command_ctx;
-    struct sub_bitmaps *new = &cmd->overlay_osd[0];
-    if (new == cmd->overlay_osd_current)
-        new += 1; // pick the unused one
+    int overlay_next = !cmd->overlay_osd_current;
+    struct sub_bitmaps *new = &cmd->overlay_osd[overlay_next];
     new->format = SUBBITMAP_RGBA;
     new->change_id = 1;
-    // overlay array can have unused entries, but parts list must be "packed"
+
+    bool valid = false;
+
     new->num_parts = 0;
     for (int n = 0; n < cmd->num_overlays; n++) {
         struct overlay *o = &cmd->overlays[n];
-        if (o->osd.bitmap)
-            MP_TARRAY_APPEND(cmd, new->parts, new->num_parts, o->osd);
+        if (o->source) {
+            struct mp_image *s = o->source;
+            struct sub_bitmap b = {
+                .bitmap = s->planes[0],
+                .stride = s->stride[0],
+                .w = s->w, .dw = s->w,
+                .h = s->h, .dh = s->h,
+                .x = o->x,
+                .y = o->y,
+            };
+            MP_TARRAY_APPEND(cmd, new->parts, new->num_parts, b);
+        }
+    }
+
+    if (!cmd->overlay_packer)
+        cmd->overlay_packer = talloc_zero(cmd, struct bitmap_packer);
+
+    cmd->overlay_packer->padding = 1; // assume bilinear scaling
+    packer_set_size(cmd->overlay_packer, new->num_parts);
+
+    for (int n = 0; n < new->num_parts; n++)
+        cmd->overlay_packer->in[n] = (struct pos){new->parts[n].w, new->parts[n].h};
+
+    if (packer_pack(cmd->overlay_packer) < 0 || new->num_parts == 0)
+        goto done;
+
+    struct pos bb[2];
+    packer_get_bb(cmd->overlay_packer, bb);
+
+    new->packed_w = bb[1].x;
+    new->packed_h = bb[1].y;
+
+    if (!new->packed || new->packed->w < new->packed_w ||
+                        new->packed->h < new->packed_h)
+    {
+        talloc_free(new->packed);
+        new->packed = mp_image_alloc(IMGFMT_BGRA, cmd->overlay_packer->w,
+                                                  cmd->overlay_packer->h);
+        if (!new->packed)
+            goto done;
+    }
+
+    // clear padding
+    mp_image_clear(new->packed, 0, 0, new->packed->w, new->packed->h);
+
+    for (int n = 0; n < new->num_parts; n++) {
+        struct sub_bitmap *b = &new->parts[n];
+        struct pos pos = cmd->overlay_packer->result[n];
+
+        int stride = new->packed->stride[0];
+        void *pdata = (uint8_t *)new->packed->planes[0] + pos.y * stride + pos.x * 4;
+        memcpy_pic(pdata, b->bitmap, b->w * 4, b->h, stride, b->stride);
+
+        b->bitmap = pdata;
+        b->stride = stride;
+
+        b->src_x = pos.x;
+        b->src_y = pos.y;
+    }
+
+    valid = true;
+done:
+    if (!valid) {
+        new->format = SUBBITMAP_EMPTY;
+        new->num_parts = 0;
     }
-    cmd->overlay_osd_current = new;
-    osd_set_external2(mpctx->osd, cmd->overlay_osd_current);
+
+    osd_set_external2(mpctx->osd, new);
+    cmd->overlay_osd_current = overlay_next;
 }
 
 // Set overlay with the given ID to the contents as described by "new".
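`recreate_overlays` now packs all overlay bitmaps into a single BGRA atlas via `bitmap_packer` and flips between two `sub_bitmaps` slots by index. A minimal shelf-packing sketch of the atlas idea (hypothetical helper; mpv's packer is more sophisticated, and the `padding = 1` above serves the same anti-bleed purpose for bilinear scaling):

```c
#include <assert.h>

struct pos { int x, y; };

// Place n rectangles of size w[i] x h[i] into rows ("shelves") of a
// max_w-wide atlas, writing top-left corners to out[]. Returns the total
// atlas height used. padding keeps neighbours from bleeding when scaled.
static int shelf_pack(int n, const int *w, const int *h,
                      int max_w, int padding, struct pos *out)
{
    int x = 0, y = 0, shelf_h = 0;
    for (int i = 0; i < n; i++) {
        if (x + w[i] > max_w) {         // rectangle doesn't fit: new shelf
            x = 0;
            y += shelf_h + padding;
            shelf_h = 0;
        }
        out[i] = (struct pos){x, y};
        x += w[i] + padding;
        if (h[i] > shelf_h)
            shelf_h = h[i];
    }
    return y + shelf_h;
}
```

The returned positions play the role of `packer->result[n]` above: each part's pixels are copied to its slot, and `src_x`/`src_y` point into the shared image.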
@@ -4342,17 +4354,11 @@ static void replace_overlay(struct MPContext *mpctx, int id, struct overlay *new
     }
 
     struct overlay *ptr = &cmd->overlays[id];
-    struct overlay old = *ptr;
-
-    if (!ptr->osd.bitmap && !new->osd.bitmap)
-        return; // don't need to recreate or unmap
 
+    talloc_free(ptr->source);
     *ptr = *new;
-    recreate_overlays(mpctx);
 
-    // Do this afterwards, so we never unmap while the OSD is using it.
-    if (old.map_start && old.map_size)
-        munmap(old.map_start, old.map_size);
+    recreate_overlays(mpctx);
 }
 
 static int overlay_add(struct MPContext *mpctx, int id, int x, int y,
@@ -4368,18 +4374,17 @@ static int overlay_add(struct MPContext *mpctx, int id, int x, int y,
         MP_ERR(mpctx, "overlay_add: invalid id %d\n", id);
         goto error;
     }
-    if (w < 0 || h < 0 || stride < w * 4 || (stride % 4)) {
+    if (w <= 0 || h <= 0 || stride < w * 4 || (stride % 4)) {
         MP_ERR(mpctx, "overlay_add: inconsistent parameters\n");
         goto error;
     }
     struct overlay overlay = {
-        .osd = {
-            .stride = stride,
-            .x = x, .y = y,
-            .w = w, .h = h,
-            .dw = w, .dh = h,
-        },
+        .source = mp_image_alloc(IMGFMT_BGRA, w, h),
+        .x = x,
+        .y = y,
     };
+    if (!overlay.source)
+        goto error;
     int fd = -1;
     bool close_fd = true;
     void *p = NULL;
@@ -4398,21 +4403,25 @@ static int overlay_add(struct MPContext *mpctx, int id, int x, int y,
     } else {
         fd = open(file, O_RDONLY | O_BINARY | O_CLOEXEC);
     }
+    int map_size = 0;
     if (fd >= 0) {
-        overlay.map_size = offset + h * stride;
-        void *m = mmap(NULL, overlay.map_size, PROT_READ, MAP_SHARED, fd, 0);
+        map_size = offset + h * stride;
+        void *m = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, 0);
         if (close_fd)
             close(fd);
-        if (m && m != MAP_FAILED) {
-            overlay.map_start = m;
+        if (m && m != MAP_FAILED)
             p = m;
-        }
     }
     if (!p) {
         MP_ERR(mpctx, "overlay_add: could not open or map '%s'\n", file);
+        talloc_free(overlay.source);
         goto error;
     }
-    overlay.osd.bitmap = (char *)p + offset;
+    memcpy_pic(overlay.source->planes[0], (char *)p + offset, w * 4, h,
+               overlay.source->stride[0], stride);
+    if (map_size)
+        munmap(p, map_size);
+
     replace_overlay(mpctx, id, &overlay);
     r = 0;
 error:
@@ -4434,6 +4443,8 @@ static void overlay_uninit(struct MPContext *mpctx)
     for (int id = 0; id < cmd->num_overlays; id++)
         overlay_remove(mpctx, id);
     osd_set_external2(mpctx->osd, NULL);
+    for (int n = 0; n < 2; n++)
+        mp_image_unrefp(&cmd->overlay_osd[n].packed);
 }
 
 struct cycle_counter {
@@ -4958,6 +4969,7 @@ int run_command(struct MPContext *mpctx, struct mp_cmd *cmd, struct mpv_node *re
         mpctx->add_osd_seek_info |=
                 (msg_osd ? OSD_SEEK_INFO_TEXT : 0) |
                 (bar_osd ? OSD_SEEK_INFO_BAR : 0);
+        mpctx->osd_force_update = true;
         break;
 
     case MP_CMD_TV_LAST_CHANNEL: {
diff --git a/player/command.h b/player/command.h
index 7c3994c..a233319 100644
--- a/player/command.h
+++ b/player/command.h
@@ -59,7 +59,4 @@ void mp_hook_run(struct MPContext *mpctx, char *client, char *type);
 
 void handle_ab_loop(struct MPContext *mpctx);
 
-void remove_deint_filter(struct MPContext *mpctx);
-void set_deinterlacing(struct MPContext *mpctx, bool enable);
-
 #endif /* MPLAYER_COMMAND_H */
diff --git a/player/configfiles.c b/player/configfiles.c
index 3c42331..7356a9a 100644
--- a/player/configfiles.c
+++ b/player/configfiles.c
@@ -208,7 +208,8 @@ static const char *const backup_properties[] = {
     "options/speed",
     "options/edition",
     "options/pause",
-    "volume-restore-data",
+    "volume",
+    "mute",
     "options/audio-delay",
     //"balance",
     "options/fullscreen",
diff --git a/player/core.h b/player/core.h
index 61360b7..8afcfbe 100644
--- a/player/core.h
+++ b/player/core.h
@@ -555,5 +555,7 @@ void uninit_video_out(struct MPContext *mpctx);
 void uninit_video_chain(struct MPContext *mpctx);
 double calc_average_frame_duration(struct MPContext *mpctx);
 int init_video_decoder(struct MPContext *mpctx, struct track *track);
+int get_deinterlacing(struct MPContext *mpctx);
+void set_deinterlacing(struct MPContext *mpctx, bool enable);
 
 #endif /* MPLAYER_MP_CORE_H */
diff --git a/player/main.c b/player/main.c
index 21c2733..88b60e1 100644
--- a/player/main.c
+++ b/player/main.c
@@ -418,6 +418,18 @@ int mp_initialize(struct MPContext *mpctx, char **options)
     if (handle_help_options(mpctx))
         return -2;
 
+    if (!print_libav_versions(mp_null_log, 0)) {
+        // Using mismatched libraries can be legitimate, but even then it's
+        // a bad idea. We don't acknowledge its usefulness and stability.
+        print_libav_versions(mpctx->log, MSGL_FATAL);
+        MP_FATAL(mpctx, "\nmpv was compiled against a different version of "
+                 "FFmpeg/Libav than the shared\nlibrary it is linked against. "
+                 "This is most likely a broken build and could\nresult in "
+                 "misbehavior and crashes.\n\nmpv does not support this "
+                 "configuration and will not run - rebuild mpv instead.\n");
+        return -1;
+    }
+
     if (opts->dump_stats && opts->dump_stats[0]) {
         if (mp_msg_open_stats_file(mpctx->global, opts->dump_stats) < 0)
             MP_ERR(mpctx, "Failed to open stats file '%s'\n", opts->dump_stats);
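The startup check above refuses to run when the FFmpeg/Libav versions mpv was compiled against differ from the shared libraries actually loaded. A simplified sketch of that compile-time vs run-time comparison (hypothetical values; FFmpeg packs versions as `major<<16 | minor<<8 | micro`, and what exactly counts as a mismatch is decided inside `print_libav_versions`, not here):

```c
#include <assert.h>

// FFmpeg-style packed version: major<<16 | minor<<8 | micro.
#define PACK_VERSION(major, minor, micro) \
    (((unsigned)(major) << 16) | ((unsigned)(minor) << 8) | (unsigned)(micro))

// Policy sketch: treat a differing major version as a fatal mismatch.
static int versions_match(unsigned built, unsigned runtime)
{
    return (built >> 16) == (runtime >> 16);
}
```

In the real check, `built` comes from the version macros baked in at compile time and `runtime` from the library's version function at load time.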
diff --git a/player/osd.c b/player/osd.c
index 69b8dbb..aa4d724 100644
--- a/player/osd.c
+++ b/player/osd.c
@@ -129,6 +129,12 @@ static void term_osd_set_status(struct MPContext *mpctx, const char *text)
 {
     talloc_free(mpctx->term_osd_status);
     mpctx->term_osd_status = talloc_strdup(mpctx, text);
+
+    int w = 80, h = 24;
+    terminal_get_size(&w, &h);
+    if (strlen(mpctx->term_osd_status) > w)
+        mpctx->term_osd_status[w] = '\0';
+
     term_osd_update(mpctx);
 }
 
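The osd.c change above clips the terminal status line to the reported terminal width (falling back to 80x24 when the size query fails). A standalone sketch of that in-place clamp (hedged: it is byte-based, just like the patch, so multi-byte characters could be split):

```c
#include <assert.h>
#include <string.h>

// Truncate s in place so it is at most width bytes long (width >= 0).
static void clip_status(char *s, int width)
{
    if ((int)strlen(s) > width)
        s[width] = '\0';
}
```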
diff --git a/player/playloop.c b/player/playloop.c
index 311bbd1..0062a30 100644
--- a/player/playloop.c
+++ b/player/playloop.c
@@ -189,7 +189,7 @@ static void mp_seek(MPContext *mpctx, struct seek_params seek)
 
     if (!mpctx->demuxer->seekable) {
         MP_ERR(mpctx, "Cannot seek in this file.\n");
-        MP_ERR(mpctx, "You can forcibly enable it with '--force-seeking=yes'.\n");
+        MP_ERR(mpctx, "You can forcibly enable it with '--force-seekable=yes'.\n");
         return;
     }
 
diff --git a/player/video.c b/player/video.c
index 1d2dc29..59ce72f 100644
--- a/player/video.c
+++ b/player/video.c
@@ -49,6 +49,8 @@
 #include "command.h"
 #include "screenshot.h"
 
+#define VF_DEINTERLACE_LABEL "deinterlace"
+
 enum {
     // update_video() - code also uses: <0 error, 0 eof, >0 progress
     VD_ERROR = -1,
@@ -153,8 +155,45 @@ static int try_filter(struct vo_chain *vo_c, char *name, char *label, char **arg
     return 0;
 }
 
+static bool check_output_format(struct vo_chain *vo_c, int imgfmt)
+{
+    return vo_c->vf->output_params.imgfmt == imgfmt;
+}
+
+static int probe_deint_filters(struct vo_chain *vo_c)
+{
+    // Usually, we prefer inserting/removing deint filters. But if there's VO
+    // support, or the user inserted a filter that supports switching deint and
+    // that has no VF_DEINTERLACE_LABEL, or if the filter was auto-inserted
+    // for other reasons and supports switching deint (like vf_d3d11vpp), then
+    // use the runtime switching method.
+    if (video_vf_vo_control(vo_c, VFCTRL_SET_DEINTERLACE, &(int){1}) == CONTROL_OK)
+        return 0;
+
+    if (check_output_format(vo_c, IMGFMT_VDPAU)) {
+        char *args[5] = {"deint", "yes"};
+        int pref = 0;
+        vo_control(vo_c->vo, VOCTRL_GET_PREF_DEINT, &pref);
+        pref = pref < 0 ? -pref : pref;
+        if (pref > 0 && pref <= 4) {
+            const char *types[] =
+                {"", "first-field", "bob", "temporal", "temporal-spatial"};
+            args[2] = "deint-mode";
+            args[3] = (char *)types[pref];
+        }
+
+        return try_filter(vo_c, "vdpaupp", VF_DEINTERLACE_LABEL, args);
+    }
+    if (check_output_format(vo_c, IMGFMT_VAAPI))
+        return try_filter(vo_c, "vavpp", VF_DEINTERLACE_LABEL, NULL);
+    if (check_output_format(vo_c, IMGFMT_D3D11VA) ||
+        check_output_format(vo_c, IMGFMT_D3D11NV12))
+        return try_filter(vo_c, "d3d11vpp", VF_DEINTERLACE_LABEL, NULL);
+    return try_filter(vo_c, "yadif", VF_DEINTERLACE_LABEL, NULL);
+}
+
 // Reconfigure the filter chain according to the new input format.
-static void filter_reconfig(struct vo_chain *vo_c)
+static void filter_reconfig(struct MPContext *mpctx, struct vo_chain *vo_c)
 {
     struct mp_image_params params = vo_c->input_format;
     if (!params.imgfmt)
@@ -162,19 +201,21 @@ static void filter_reconfig(struct vo_chain *vo_c)
 
     set_allowed_vo_formats(vo_c);
 
-    if (vf_reconfig(vo_c->vf, &params) < 0)
-        return;
-
-    char *filters[] = {"autorotate", "autostereo3d", NULL};
+    char *filters[] = {"autorotate", "autostereo3d", "deinterlace", NULL};
     for (int n = 0; filters[n]; n++) {
         struct vf_instance *vf = vf_find_by_label(vo_c->vf, filters[n]);
-        if (vf) {
+        if (vf)
             vf_remove_filter(vo_c->vf, vf);
-            if (vf_reconfig(vo_c->vf, &params) < 0)
-                return;
-        }
     }
 
+    if (vo_c->vf->initialized < 1) {
+        if (vf_reconfig(vo_c->vf, &params) < 0)
+            return;
+    }
+
+    // Make sure to reset this even if runtime deint switching is used.
+    video_vf_vo_control(vo_c, VFCTRL_SET_DEINTERLACE, &(int){0});
+
     if (params.rotate && (params.rotate % 90 == 0)) {
         if (!(vo_c->vo->driver->caps & VO_CAP_ROTATE90)) {
             // Try to insert a rotation filter.
@@ -194,6 +235,42 @@ static void filter_reconfig(struct vo_chain *vo_c)
                 MP_ERR(vo_c, "Can't insert 3D conversion filter.\n");
         }
     }
+
+    if (mpctx->opts->deinterlace == 1)
+        probe_deint_filters(vo_c);
+}
+
+static void recreate_auto_filters(struct MPContext *mpctx)
+{
+    filter_reconfig(mpctx, mpctx->vo_chain);
+
+    mp_force_video_refresh(mpctx);
+
+    mp_notify(mpctx, MPV_EVENT_VIDEO_RECONFIG, NULL);
+}
+
+int get_deinterlacing(struct MPContext *mpctx)
+{
+    struct vo_chain *vo_c = mpctx->vo_chain;
+    int enabled = 0;
+    if (video_vf_vo_control(vo_c, VFCTRL_GET_DEINTERLACE, &enabled) != CONTROL_OK)
+        enabled = -1;
+    if (enabled < 0) {
+        // vf_lavfi doesn't support VFCTRL_GET_DEINTERLACE
+        if (vf_find_by_label(vo_c->vf, VF_DEINTERLACE_LABEL))
+            enabled = 1;
+    }
+    return enabled;
+}
+
+void set_deinterlacing(struct MPContext *mpctx, bool enable)
+{
+    if (enable == (get_deinterlacing(mpctx) > 0))
+        return;
+
+    mpctx->opts->deinterlace = enable;
+    recreate_auto_filters(mpctx);
+    mpctx->opts->deinterlace = get_deinterlacing(mpctx) > 0;
 }
 
 static void recreate_video_filters(struct MPContext *mpctx)
@@ -230,7 +307,7 @@ int reinit_video_filters(struct MPContext *mpctx)
     recreate_video_filters(mpctx);
 
     if (need_reconfig)
-        filter_reconfig(vo_c);
+        filter_reconfig(mpctx, vo_c);
 
     mp_force_video_refresh(mpctx);
 
@@ -314,7 +391,6 @@ void uninit_video_chain(struct MPContext *mpctx)
 
         mpctx->video_status = STATUS_EOF;
 
-        remove_deint_filter(mpctx);
         mp_notify(mpctx, MPV_EVENT_VIDEO_RECONFIG, NULL);
     }
 }
@@ -522,22 +598,6 @@ static int decode_image(struct MPContext *mpctx)
     }
 }
 
-// Called after video reinit. This can be generally used to try to insert more
-// filters using the filter chain edit functionality in command.c.
-static void init_filter_params(struct MPContext *mpctx)
-{
-    struct MPOpts *opts = mpctx->opts;
-
-    // Note that the filter chain is already initialized. This code might
-    // recreate the chain a second time, which is not very elegant, but allows
-    // us to test whether enabling deinterlacing works with the current video
-    // format and other filters.
-    if (opts->deinterlace >= 0) {
-        remove_deint_filter(mpctx);
-        set_deinterlacing(mpctx, opts->deinterlace != 0);
-    }
-}
-
 // Feed newly decoded frames to the filter, take care of format changes.
 // If eof=true, drain the filter chain, and return VD_EOF if empty.
 static int video_filter(struct MPContext *mpctx, bool eof)
@@ -564,7 +624,8 @@ static int video_filter(struct MPContext *mpctx, bool eof)
             return VD_PROGRESS;
 
         // The filter chain is drained; execute the filter format change.
-        filter_reconfig(mpctx->vo_chain);
+        vf->initialized = 0;
+        filter_reconfig(mpctx, mpctx->vo_chain);
 
         mp_notify(mpctx, MPV_EVENT_VIDEO_RECONFIG, NULL);
 
@@ -586,7 +647,6 @@ static int video_filter(struct MPContext *mpctx, bool eof)
             MP_FATAL(mpctx, "Cannot initialize video filters.\n");
             return VD_ERROR;
         }
-        init_filter_params(mpctx);
         return VD_RECONFIG;
     }
 
@@ -1360,11 +1420,13 @@ void write_video(struct MPContext *mpctx)
     };
     calculate_frame_duration(mpctx);
 
+    int req = vo_get_num_req_frames(mpctx->video_out);
+    assert(req >= 1 && req <= VO_MAX_REQ_FRAMES);
     struct vo_frame dummy = {
         .pts = pts,
         .duration = -1,
         .still = mpctx->step_frames > 0,
-        .num_frames = MPMIN(mpctx->num_next_frames, VO_MAX_REQ_FRAMES),
+        .num_frames = MPMIN(mpctx->num_next_frames, req),
         .num_vsyncs = 1,
     };
     for (int n = 0; n < dummy.num_frames; n++)
diff --git a/stream/tvi_v4l2.c b/stream/tvi_v4l2.c
index f882818..91c810a 100644
--- a/stream/tvi_v4l2.c
+++ b/stream/tvi_v4l2.c
@@ -1352,7 +1352,7 @@ static int start(priv_t *priv)
         if (priv->map[i].buf.flags & V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC)
             MP_WARN(priv, "compiled without clock_gettime() that is needed to handle monotone video timestamps from the kernel. Expect desync.\n");
 #endif
-        /* count up to make sure this is correct everytime */
+        /* count up to make sure this is correct every time */
         priv->mapcount++;
 
         if (v4l2_ioctl(priv->video_fd, VIDIOC_QBUF, &(priv->map[i].buf)) < 0) {
diff --git a/sub/ass_mp.c b/sub/ass_mp.c
index 2f9b397..84a706b 100644
--- a/sub/ass_mp.c
+++ b/sub/ass_mp.c
@@ -32,9 +32,12 @@
 #include "common/msg.h"
 #include "options/path.h"
 #include "ass_mp.h"
+#include "img_convert.h"
 #include "osd.h"
 #include "stream/stream.h"
 #include "options/options.h"
+#include "video/out/bitmap_packer.h"
+#include "video/mp_image.h"
 
 // res_y should be track->PlayResY
 // It determines scaling of font sizes and more.
@@ -97,37 +100,6 @@ void mp_ass_configure_fonts(ASS_Renderer *priv, struct osd_style_opts *opts,
     talloc_free(tmp);
 }
 
-void mp_ass_render_frame(ASS_Renderer *renderer, ASS_Track *track, double time,
-                         struct sub_bitmaps *res)
-{
-    int changed;
-    ASS_Image *imgs = ass_render_frame(renderer, track, time, &changed);
-    if (changed)
-        res->change_id++;
-    assert(res->format == 0 || res->format == SUBBITMAP_LIBASS);
-    res->format = SUBBITMAP_LIBASS;
-
-    int num_parts_alloc = MP_TALLOC_AVAIL(res->parts);
-    for (struct ass_image *img = imgs; img; img = img->next) {
-        if (img->w == 0 || img->h == 0)
-            continue;
-        if (res->num_parts >= num_parts_alloc) {
-            num_parts_alloc = MPMAX(num_parts_alloc * 2, 32);
-            res->parts = talloc_realloc(NULL, res->parts, struct sub_bitmap,
-                                        num_parts_alloc);
-        }
-        struct sub_bitmap *p = &res->parts[res->num_parts];
-        p->bitmap = img->bitmap;
-        p->stride = img->stride;
-        p->libass.color = img->color;
-        p->dw = p->w = img->w;
-        p->dh = p->h = img->h;
-        p->x = img->dst_x;
-        p->y = img->dst_y;
-        res->num_parts++;
-    }
-}
-
 static const int map_ass_level[] = {
     MSGL_ERR,           // 0 "FATAL errors"
     MSGL_WARN,
@@ -177,3 +149,228 @@ void mp_ass_flush_old_events(ASS_Track *track, long long ts)
         track->events[i] = track->events[i+n];
     }
 }
+
+static void draw_ass_rgba(unsigned char *src, int src_w, int src_h,
+                          int src_stride, unsigned char *dst, size_t dst_stride,
+                          int dst_x, int dst_y, uint32_t color)
+{
+    const unsigned int r = (color >> 24) & 0xff;
+    const unsigned int g = (color >> 16) & 0xff;
+    const unsigned int b = (color >>  8) & 0xff;
+    const unsigned int a = 0xff - (color & 0xff);
+
+    dst += dst_y * dst_stride + dst_x * 4;
+
+    for (int y = 0; y < src_h; y++, dst += dst_stride, src += src_stride) {
+        uint32_t *dstrow = (uint32_t *) dst;
+        for (int x = 0; x < src_w; x++) {
+            const unsigned int v = src[x];
+            int rr = (r * a * v);
+            int gg = (g * a * v);
+            int bb = (b * a * v);
+            int aa =      a * v;
+            uint32_t dstpix = dstrow[x];
+            unsigned int dstb =  dstpix        & 0xFF;
+            unsigned int dstg = (dstpix >>  8) & 0xFF;
+            unsigned int dstr = (dstpix >> 16) & 0xFF;
+            unsigned int dsta = (dstpix >> 24) & 0xFF;
+            dstb = (bb       + dstb * (255 * 255 - aa)) / (255 * 255);
+            dstg = (gg       + dstg * (255 * 255 - aa)) / (255 * 255);
+            dstr = (rr       + dstr * (255 * 255 - aa)) / (255 * 255);
+            dsta = (aa * 255 + dsta * (255 * 255 - aa)) / (255 * 255);
+            dstrow[x] = dstb | (dstg << 8) | (dstr << 16) | (dsta << 24);
+        }
+    }
+}
+
+struct mp_ass_packer {
+    struct sub_bitmap *cached_parts; // only for the array memory
+    struct mp_image *cached_img;
+    struct sub_bitmaps cached_subs;
+    bool cached_subs_valid;
+    struct sub_bitmap rgba_imgs[MP_SUB_BB_LIST_MAX];
+    struct bitmap_packer *packer;
+};
+
+// Free with talloc_free().
+struct mp_ass_packer *mp_ass_packer_alloc(void *ta_parent)
+{
+    struct mp_ass_packer *p = talloc_zero(ta_parent, struct mp_ass_packer);
+    p->packer = talloc_zero(p, struct bitmap_packer);
+    return p;
+}
+
+static bool pack(struct mp_ass_packer *p, struct sub_bitmaps *res, int imgfmt)
+{
+    packer_set_size(p->packer, res->num_parts);
+
+    for (int n = 0; n < res->num_parts; n++)
+        p->packer->in[n] = (struct pos){res->parts[n].w, res->parts[n].h};
+
+    if (p->packer->count == 0 || packer_pack(p->packer) < 0)
+        return false;
+
+    struct pos bb[2];
+    packer_get_bb(p->packer, bb);
+
+    res->packed_w = bb[1].x;
+    res->packed_h = bb[1].y;
+
+    if (!p->cached_img || p->cached_img->w < res->packed_w ||
+                          p->cached_img->h < res->packed_h)
+    {
+        talloc_free(p->cached_img);
+        p->cached_img = mp_image_alloc(imgfmt, p->packer->w, p->packer->h);
+        if (!p->cached_img)
+            return false;
+        talloc_steal(p, p->cached_img);
+    }
+
+    res->packed = p->cached_img;
+
+    for (int n = 0; n < res->num_parts; n++) {
+        struct sub_bitmap *b = &res->parts[n];
+        struct pos pos = p->packer->result[n];
+
+        b->src_x = pos.x;
+        b->src_y = pos.y;
+    }
+
+    return true;
+}
+
+static bool pack_libass(struct mp_ass_packer *p, struct sub_bitmaps *res)
+{
+    if (!pack(p, res, IMGFMT_Y8))
+        return false;
+
+    for (int n = 0; n < res->num_parts; n++) {
+        struct sub_bitmap *b = &res->parts[n];
+
+        int stride = res->packed->stride[0];
+        void *pdata =
+            (uint8_t *)res->packed->planes[0] + b->src_y * stride + b->src_x;
+        memcpy_pic(pdata, b->bitmap, b->w, b->h, stride, b->stride);
+
+        b->bitmap = pdata;
+        b->stride = stride;
+    }
+
+    return true;
+}
+
+static bool pack_rgba(struct mp_ass_packer *p, struct sub_bitmaps *res)
+{
+    struct mp_rect bb_list[MP_SUB_BB_LIST_MAX];
+    int num_bb = mp_get_sub_bb_list(res, bb_list, MP_SUB_BB_LIST_MAX);
+
+    struct sub_bitmaps imgs = {
+        .change_id = res->change_id,
+        .format = SUBBITMAP_RGBA,
+        .parts = p->rgba_imgs,
+        .num_parts = num_bb,
+    };
+
+    for (int n = 0; n < imgs.num_parts; n++) {
+        imgs.parts[n].w = bb_list[n].x1 - bb_list[n].x0;
+        imgs.parts[n].h = bb_list[n].y1 - bb_list[n].y0;
+    }
+
+    if (!pack(p, &imgs, IMGFMT_BGRA))
+        return false;
+
+    for (int n = 0; n < num_bb; n++) {
+        struct mp_rect bb = bb_list[n];
+        struct sub_bitmap *b = &imgs.parts[n];
+
+        b->x = bb.x0;
+        b->y = bb.y0;
+        b->w = b->dw = bb.x1 - bb.x0;
+        b->h = b->dh = bb.y1 - bb.y0;
+        b->stride = imgs.packed->stride[0];
+        b->bitmap = (uint8_t *)imgs.packed->planes[0] +
+                    b->stride * b->src_y + b->src_x * 4;
+
+        memset_pic(b->bitmap, 0, b->w * 4, b->h, b->stride);
+
+        for (int i = 0; i < res->num_parts; i++) {
+            struct sub_bitmap *s = &res->parts[i];
+
+            // Assume mp_get_sub_bb_list() never splits sub bitmaps,
+            // so we don't clip/adjust the size of the sub bitmap.
+            if (s->x > bb.x1 || s->x + s->w < bb.x0 ||
+                s->y > bb.y1 || s->y + s->h < bb.y0)
+                continue;
+
+            draw_ass_rgba(s->bitmap, s->w, s->h, s->stride,
+                          b->bitmap, b->stride,
+                          s->x - bb.x0, s->y - bb.y0,
+                          s->libass.color);
+        }
+    }
+
+    *res = imgs;
+    return true;
+}
+
+// Pack the contents of image_lists[0] to image_lists[num_image_lists-1] into
+// a single image, and make *out point to it. *out is completely overwritten.
+// If libass reported any change, image_lists_changed must be set (it then
+// repacks all images). preferred_osd_format can be set to a desired
+// sub_bitmap_format. Currently, SUBBITMAP_LIBASS and SUBBITMAP_RGBA are
+// supported.
+void mp_ass_packer_pack(struct mp_ass_packer *p, ASS_Image **image_lists,
+                        int num_image_lists, bool image_lists_changed,
+                        int preferred_osd_format, struct sub_bitmaps *out)
+{
+    int format = preferred_osd_format == SUBBITMAP_RGBA ? SUBBITMAP_RGBA
+                                                        : SUBBITMAP_LIBASS;
+
+    if (p->cached_subs_valid && !image_lists_changed &&
+        p->cached_subs.format == format)
+    {
+        *out = p->cached_subs;
+        return;
+    }
+
+    *out = (struct sub_bitmaps){.change_id = 1};
+    p->cached_subs_valid = false;
+
+    struct sub_bitmaps res = {
+        .change_id = image_lists_changed,
+        .format = SUBBITMAP_LIBASS,
+        .parts = p->cached_parts,
+    };
+
+    for (int n = 0; n < num_image_lists; n++) {
+        for (struct ass_image *img = image_lists[n]; img; img = img->next) {
+            if (img->w == 0 || img->h == 0)
+                continue;
+            MP_TARRAY_GROW(p, p->cached_parts, res.num_parts);
+            res.parts = p->cached_parts;
+            struct sub_bitmap *b = &res.parts[res.num_parts];
+            b->bitmap = img->bitmap;
+            b->stride = img->stride;
+            b->libass.color = img->color;
+            b->dw = b->w = img->w;
+            b->dh = b->h = img->h;
+            b->x = img->dst_x;
+            b->y = img->dst_y;
+            res.num_parts++;
+        }
+    }
+
+    bool r = false;
+    if (format == SUBBITMAP_RGBA) {
+        r = pack_rgba(p, &res);
+    } else {
+        r = pack_libass(p, &res);
+    }
+
+    if (!r)
+        return;
+
+    *out = res;
+    p->cached_subs = res;
+    p->cached_subs.change_id = 0;
+    p->cached_subs_valid = true;
+}
diff --git a/sub/ass_mp.h b/sub/ass_mp.h
index 20f0ebe..50397bd 100644
--- a/sub/ass_mp.h
+++ b/sub/ass_mp.h
@@ -49,9 +49,11 @@ void mp_ass_configure_fonts(ASS_Renderer *priv, struct osd_style_opts *opts,
                             struct mpv_global *global, struct mp_log *log);
 ASS_Library *mp_ass_init(struct mpv_global *global, struct mp_log *log);
 
-struct sub_bitmap;
 struct sub_bitmaps;
-void mp_ass_render_frame(ASS_Renderer *renderer, ASS_Track *track, double time,
-                         struct sub_bitmaps *res);
+struct mp_ass_packer;
+struct mp_ass_packer *mp_ass_packer_alloc(void *ta_parent);
+void mp_ass_packer_pack(struct mp_ass_packer *p, ASS_Image **image_lists,
+                        int num_image_lists, bool changed,
+                        int preferred_osd_format, struct sub_bitmaps *out);
 
 #endif                          /* MPLAYER_ASS_MP_H */
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 3b1e957..22dc332 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -255,8 +255,8 @@ bool sub_read_packets(struct dec_sub *sub, double video_pts)
 // You must call sub_lock/sub_unlock if more than one thread accesses sub.
 // The issue is that *res will contain decoder allocated data, which might
 // be deallocated on the next decoder access.
-void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,
-                     struct sub_bitmaps *res)
+void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, int format,
+                     double pts, struct sub_bitmaps *res)
 {
     struct MPOpts *opts = sub->opts;
 
@@ -267,7 +267,7 @@ void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,
         return;
 
     if (opts->sub_visibility && sub->sd->driver->get_bitmaps)
-        sub->sd->driver->get_bitmaps(sub->sd, dim, pts, res);
+        sub->sd->driver->get_bitmaps(sub->sd, dim, format, pts, res);
 }
 
 // See sub_get_bitmaps() for locking requirements.
diff --git a/sub/dec_sub.h b/sub/dec_sub.h
index 63603e2..1048bf0 100644
--- a/sub/dec_sub.h
+++ b/sub/dec_sub.h
@@ -35,8 +35,8 @@ void sub_unlock(struct dec_sub *sub);
 bool sub_can_preload(struct dec_sub *sub);
 void sub_preload(struct dec_sub *sub);
 bool sub_read_packets(struct dec_sub *sub, double video_pts);
-void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,
-                     struct sub_bitmaps *res);
+void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, int format,
+                     double pts, struct sub_bitmaps *res);
 char *sub_get_text(struct dec_sub *sub, double pts);
 void sub_reset(struct dec_sub *sub);
 void sub_select(struct dec_sub *sub, bool selected);
diff --git a/sub/draw_bmp.c b/sub/draw_bmp.c
index 5356a8f..b79810c 100644
--- a/sub/draw_bmp.c
+++ b/sub/draw_bmp.c
@@ -193,8 +193,7 @@ static void scale_sb_rgba(struct sub_bitmap *sb, struct mp_image *dst_format,
     mp_image_swscale(sbisrc2, &sbisrc, SWS_BILINEAR);
     unpremultiply_and_split_BGR32(sbisrc2, sba);
 
-    sbi->params.colorspace = dst_format->params.colorspace;
-    sbi->params.colorlevels = dst_format->params.colorlevels;
+    sbi->params.color = dst_format->params.color;
     mp_image_swscale(sbi, sbisrc2, SWS_BILINEAR);
 
     talloc_free(sbisrc2);
@@ -367,8 +366,8 @@ static struct part *get_cache(struct mp_draw_sub_cache *cache,
         if (part) {
             if (part->change_id != sbs->change_id
                 || part->imgfmt != format->imgfmt
-                || part->colorspace != format->params.colorspace
-                || part->levels != format->params.colorlevels)
+                || part->colorspace != format->params.color.space
+                || part->levels != format->params.color.levels)
             {
                 talloc_free(part);
                 part = NULL;
@@ -380,8 +379,8 @@ static struct part *get_cache(struct mp_draw_sub_cache *cache,
                 .change_id = sbs->change_id,
                 .num_imgs = sbs->num_parts,
                 .imgfmt = format->imgfmt,
-                .levels = format->params.colorlevels,
-                .colorspace = format->params.colorspace,
+                .levels = format->params.color.levels,
+                .colorspace = format->params.color.space,
             };
             part->imgs = talloc_zero_array(part, struct sub_cache,
                                            part->num_imgs);
@@ -436,10 +435,8 @@ static struct mp_image *chroma_up(struct mp_draw_sub_cache *cache, int imgfmt,
 
     // The temp image is always YUV, but src not necessarily.
     // Reduce amount of conversions in YUV case (upsampling/shifting only)
-    if (src->fmt.flags & MP_IMGFLAG_YUV) {
-        temp->params.colorspace = src->params.colorspace;
-        temp->params.colorlevels = src->params.colorlevels;
-    }
+    if (src->fmt.flags & MP_IMGFLAG_YUV)
+        temp->params.color = src->params.color;
 
     if (src->imgfmt == IMGFMT_420P) {
         assert(imgfmt == IMGFMT_444P);
diff --git a/sub/img_convert.c b/sub/img_convert.c
index 78be881..0ce5c7a 100644
--- a/sub/img_convert.c
+++ b/sub/img_convert.c
@@ -30,17 +30,6 @@
 #include "video/mp_image.h"
 #include "video/sws_utils.h"
 
-struct osd_conv_cache {
-    struct sub_bitmap part[MP_SUB_BB_LIST_MAX];
-    struct sub_bitmap *parts;
-    void *scratch;
-};
-
-struct osd_conv_cache *osd_conv_cache_new(void)
-{
-    return talloc_zero(NULL, struct osd_conv_cache);
-}
-
 void mp_blur_rgba_sub_bitmap(struct sub_bitmap *d, double gblur)
 {
     struct mp_image *tmp1 = mp_image_alloc(IMGFMT_BGRA, d->w, d->h);
@@ -58,154 +47,6 @@ void mp_blur_rgba_sub_bitmap(struct sub_bitmap *d, double gblur)
     talloc_free(tmp1);
 }
 
-// If RGBA parts need scaling, scale them.
-bool osd_scale_rgba(struct osd_conv_cache *c, struct sub_bitmaps *imgs)
-{
-    struct sub_bitmaps src = *imgs;
-    if (src.format != SUBBITMAP_RGBA)
-        return false;
-
-    bool need_scale = false;
-    for (int n = 0; n < src.num_parts; n++) {
-        struct sub_bitmap *sb = &src.parts[n];
-        if (sb->w != sb->dw || sb->h != sb->dh)
-            need_scale = true;
-    }
-    if (!need_scale)
-        return false;
-
-    talloc_free(c->parts);
-    imgs->parts = c->parts = talloc_array(c, struct sub_bitmap, src.num_parts);
-    imgs->packed = NULL;
-
-    // Note: we scale all parts, since most likely all need scaling anyway, and
-    //       to get a proper copy of all data in the imgs list.
-    for (int n = 0; n < src.num_parts; n++) {
-        struct sub_bitmap *d = &imgs->parts[n];
-        struct sub_bitmap *s = &src.parts[n];
-
-        struct mp_image src_image = {0};
-        mp_image_setfmt(&src_image, IMGFMT_BGRA);
-        mp_image_set_size(&src_image, s->w, s->h);
-        src_image.planes[0] = s->bitmap;
-        src_image.stride[0] = s->stride;
-
-        d->x = s->x;
-        d->y = s->y;
-        d->w = d->dw = s->dw;
-        d->h = d->dh = s->dh;
-        struct mp_image *image = mp_image_alloc(IMGFMT_BGRA, d->w, d->h);
-        talloc_steal(c->parts, image);
-        if (image) {
-            d->stride = image->stride[0];
-            d->bitmap = image->planes[0];
-
-            mp_image_swscale(image, &src_image, mp_sws_fast_flags);
-        } else {
-            // on OOM, skip the region; just don't scale it
-            *d = *s;
-        }
-    }
-    return true;
-}
-
-static void draw_ass_rgba(unsigned char *src, int src_w, int src_h,
-                          int src_stride, unsigned char *dst, size_t dst_stride,
-                          int dst_x, int dst_y, uint32_t color)
-{
-    const unsigned int r = (color >> 24) & 0xff;
-    const unsigned int g = (color >> 16) & 0xff;
-    const unsigned int b = (color >>  8) & 0xff;
-    const unsigned int a = 0xff - (color & 0xff);
-
-    dst += dst_y * dst_stride + dst_x * 4;
-
-    for (int y = 0; y < src_h; y++, dst += dst_stride, src += src_stride) {
-        uint32_t *dstrow = (uint32_t *) dst;
-        for (int x = 0; x < src_w; x++) {
-            const unsigned int v = src[x];
-            int rr = (r * a * v);
-            int gg = (g * a * v);
-            int bb = (b * a * v);
-            int aa =      a * v;
-            uint32_t dstpix = dstrow[x];
-            unsigned int dstb =  dstpix        & 0xFF;
-            unsigned int dstg = (dstpix >>  8) & 0xFF;
-            unsigned int dstr = (dstpix >> 16) & 0xFF;
-            unsigned int dsta = (dstpix >> 24) & 0xFF;
-            dstb = (bb       + dstb * (255 * 255 - aa)) / (255 * 255);
-            dstg = (gg       + dstg * (255 * 255 - aa)) / (255 * 255);
-            dstr = (rr       + dstr * (255 * 255 - aa)) / (255 * 255);
-            dsta = (aa * 255 + dsta * (255 * 255 - aa)) / (255 * 255);
-            dstrow[x] = dstb | (dstg << 8) | (dstr << 16) | (dsta << 24);
-        }
-    }
-}
-
-bool osd_conv_ass_to_rgba(struct osd_conv_cache *c, struct sub_bitmaps *imgs)
-{
-    struct sub_bitmaps src = *imgs;
-    if (src.format != SUBBITMAP_LIBASS)
-        return false;
-    assert(!src.scaled); // ASS is always unscaled
-
-    struct mp_rect bb_list[MP_SUB_BB_LIST_MAX];
-    int num_bb = mp_get_sub_bb_list(&src, bb_list, MP_SUB_BB_LIST_MAX);
-
-    imgs->format = SUBBITMAP_RGBA;
-    imgs->parts = c->part;
-    imgs->num_parts = num_bb;
-    imgs->packed = NULL;
-
-    size_t newsize = 0;
-    for (int n = 0; n < num_bb; n++) {
-        struct mp_rect bb = bb_list[n];
-        int w = bb.x1 - bb.x0;
-        int h = bb.y1 - bb.y0;
-        int stride = w * 4;
-        newsize += h * stride;
-    }
-
-    if (talloc_get_size(c->scratch) < newsize) {
-        talloc_free(c->scratch);
-        c->scratch = talloc_array(c, uint8_t, newsize);
-    }
-
-    uint8_t *data = c->scratch;
-
-    for (int n = 0; n < num_bb; n++) {
-        struct mp_rect bb = bb_list[n];
-        struct sub_bitmap *bmp = &c->part[n];
-
-        bmp->x = bb.x0;
-        bmp->y = bb.y0;
-        bmp->w = bmp->dw = bb.x1 - bb.x0;
-        bmp->h = bmp->dh = bb.y1 - bb.y0;
-        bmp->stride = bmp->w * 4;
-        bmp->bitmap = data;
-        data += bmp->h * bmp->stride;
-
-        memset_pic(bmp->bitmap, 0, bmp->w * 4, bmp->h, bmp->stride);
-
-        for (int p = 0; p < src.num_parts; p++) {
-            struct sub_bitmap *s = &src.parts[p];
-
-            // Assume mp_get_sub_bb_list() never splits sub bitmaps
-            // So we don't clip/adjust the size of the sub bitmap
-            if (s->x > bb.x1 || s->x + s->w < bb.x0 ||
-                s->y > bb.y1 || s->y + s->h < bb.y0)
-                continue;
-
-            draw_ass_rgba(s->bitmap, s->w, s->h, s->stride,
-                          bmp->bitmap, bmp->stride,
-                          s->x - bb.x0, s->y - bb.y0,
-                          s->libass.color);
-        }
-    }
-
-    return true;
-}
-
 bool mp_sub_bitmaps_bb(struct sub_bitmaps *imgs, struct mp_rect *out_bb)
 {
     struct mp_rect bb = {INT_MAX, INT_MAX, INT_MIN, INT_MIN};
diff --git a/sub/img_convert.h b/sub/img_convert.h
index a0020df..e03c155 100644
--- a/sub/img_convert.h
+++ b/sub/img_convert.h
@@ -3,19 +3,12 @@
 
 #include <stdbool.h>
 
-struct osd_conv_cache;
 struct sub_bitmaps;
 struct sub_bitmap;
 struct mp_rect;
 
-struct osd_conv_cache *osd_conv_cache_new(void);
-
-// These functions convert from one OSD format to another. On success, they copy
-// the converted image data into c, and change imgs to point to the data.
-bool osd_conv_ass_to_rgba(struct osd_conv_cache *c, struct sub_bitmaps *imgs);
 // Sub postprocessing
 void mp_blur_rgba_sub_bitmap(struct sub_bitmap *d, double gblur);
-bool osd_scale_rgba(struct osd_conv_cache *c, struct sub_bitmaps *imgs);
 
 bool mp_sub_bitmaps_bb(struct sub_bitmaps *imgs, struct mp_rect *out_bb);
 
diff --git a/sub/osd.c b/sub/osd.c
index a0618aa..381fa88 100644
--- a/sub/osd.c
+++ b/sub/osd.c
@@ -127,8 +127,6 @@ struct osd_state *osd_create(struct mpv_global *global)
             .text = talloc_strdup(obj, ""),
             .progbar_state = {.type = -1},
         };
-        for (int i = 0; i < OSD_CONV_CACHE_MAX; i++)
-            obj->cache[i] = talloc_steal(obj, osd_conv_cache_new());
         osd->objs[n] = obj;
     }
 
@@ -247,10 +245,9 @@ static void render_object(struct osd_state *osd, struct osd_object *obj,
 {
     struct MPOpts *opts = osd->opts;
 
-    bool formats[SUBBITMAP_COUNT];
-    memcpy(formats, sub_formats, sizeof(formats));
-    if (opts->force_rgba_osd)
-        formats[SUBBITMAP_LIBASS] = false;
+    int format = SUBBITMAP_LIBASS;
+    if (!sub_formats[format] || opts->force_rgba_osd)
+        format = SUBBITMAP_RGBA;
 
     *out_imgs = (struct sub_bitmaps) {0};
 
@@ -261,7 +258,7 @@ static void render_object(struct osd_state *osd, struct osd_object *obj,
             double sub_pts = video_pts;
             if (sub_pts != MP_NOPTS_VALUE)
                 sub_pts -= opts->sub_delay;
-            sub_get_bitmaps(obj->sub, obj->vo_res, sub_pts, out_imgs);
+            sub_get_bitmaps(obj->sub, obj->vo_res, format, sub_pts, out_imgs);
         }
     } else if (obj->type == OSDTYPE_EXTERNAL2) {
         if (obj->external2 && obj->external2->format) {
@@ -269,7 +266,7 @@ static void render_object(struct osd_state *osd, struct osd_object *obj,
             obj->external2->change_id = 0;
         }
     } else {
-        osd_object_get_bitmaps(osd, obj, out_imgs);
+        osd_object_get_bitmaps(osd, obj, format, out_imgs);
     }
 
     if (obj->force_redraw)
@@ -281,25 +278,8 @@ static void render_object(struct osd_state *osd, struct osd_object *obj,
     if (out_imgs->num_parts == 0)
         return;
 
-    if (obj->cached.change_id == obj->vo_change_id && formats[obj->cached.format])
-    {
-        *out_imgs = obj->cached;
-        return;
-    }
-
     out_imgs->render_index = obj->type;
     out_imgs->change_id = obj->vo_change_id;
-
-    if (formats[out_imgs->format])
-        return;
-
-    bool cached = false; // do we have a copy of all the image data?
-
-    if (formats[SUBBITMAP_RGBA] && out_imgs->format == SUBBITMAP_LIBASS)
-        cached |= osd_conv_ass_to_rgba(obj->cache[3], out_imgs);
-
-    if (cached)
-        obj->cached = *out_imgs;
 }
 
 // draw_flags is a bit field of OSD_DRAW_* constants
@@ -456,5 +436,4 @@ void osd_rescale_bitmaps(struct sub_bitmaps *imgs, int frame_w, int frame_h,
         bi->dw = (int)(bi->w * xscale + 0.5);
         bi->dh = (int)(bi->h * yscale + 0.5);
     }
-    imgs->scaled = xscale != 1 || yscale != 1;
 }
diff --git a/sub/osd.h b/sub/osd.h
index 7cfc695..6bfa6dd 100644
--- a/sub/osd.h
+++ b/sub/osd.h
@@ -58,10 +58,6 @@ struct sub_bitmaps {
 
     enum sub_bitmap_format format;
 
-    // If false, dw==w && dh==h.
-    // SUBBITMAP_LIBASS is never scaled.
-    bool scaled;
-
     struct sub_bitmap *parts;
     int num_parts;
 
@@ -69,6 +65,7 @@ struct sub_bitmaps {
     // parts[].bitmap pointer points into the image data here (and stride will
     // correspond to packed->stride[0]).
     //  SUBBITMAP_RGBA: IMGFMT_BGRA (exact match)
+    //  SUBBITMAP_LIBASS: IMGFMT_Y8 (not the same, but compatible layout)
     // Other formats have this set to NULL.
     struct mp_image *packed;
 
@@ -204,7 +201,7 @@ void osd_rescale_bitmaps(struct sub_bitmaps *imgs, int frame_w, int frame_h,
 
 // internal use only
 void osd_object_get_bitmaps(struct osd_state *osd, struct osd_object *obj,
-                            struct sub_bitmaps *out_imgs);
+                            int format, struct sub_bitmaps *out_imgs);
 void osd_init_backend(struct osd_state *osd);
 void osd_destroy_backend(struct osd_state *osd);
 
diff --git a/sub/osd_dummy.c b/sub/osd_dummy.c
index d9c366e..796d954 100644
--- a/sub/osd_dummy.c
+++ b/sub/osd_dummy.c
@@ -19,7 +19,7 @@ void osd_get_function_sym(char *buffer, size_t buffer_size, int osd_function)
 }
 
 void osd_object_get_bitmaps(struct osd_state *osd, struct osd_object *obj,
-                            struct sub_bitmaps *out_imgs)
+                            int format, struct sub_bitmaps *out_imgs)
 {
     *out_imgs = (struct sub_bitmaps) {0};
 }
diff --git a/sub/osd_libass.c b/sub/osd_libass.c
index 6a2efa6..a9ab7ea 100644
--- a/sub/osd_libass.c
+++ b/sub/osd_libass.c
@@ -90,7 +90,6 @@ void osd_destroy_backend(struct osd_state *osd)
     for (int n = 0; n < MAX_OSD_PARTS; n++) {
         struct osd_object *obj = osd->objs[n];
         destroy_ass_renderer(&obj->ass);
-        talloc_free(obj->parts_cache.parts);
         for (int i = 0; i < obj->num_externals; i++)
             destroy_external(&obj->externals[i]);
         obj->num_externals = 0;
@@ -501,7 +500,7 @@ void osd_set_external(struct osd_state *osd, void *id, int res_x, int res_y,
         entry->res_x = res_x;
         entry->res_y = res_y;
         update_external(osd, obj, entry);
-        obj->parts_cache.change_id = 1;
+        obj->changed = true;
         osd_changed_unlocked(osd, obj->type);
     }
 
@@ -510,28 +509,40 @@ done:
 }
 
 static void append_ass(struct ass_state *ass, struct mp_osd_res *res,
-                       struct sub_bitmaps *imgs)
+                       ASS_Image **img_list, bool *changed)
 {
-    if (!ass->render || !ass->track)
+    if (!ass->render || !ass->track) {
+        *img_list = NULL;
         return;
+    }
 
     ass_set_frame_size(ass->render, res->w, res->h);
     ass_set_aspect_ratio(ass->render, res->display_par, 1.0);
-    mp_ass_render_frame(ass->render, ass->track, 0, imgs);
+
+    int ass_changed;
+    *img_list = ass_render_frame(ass->render, ass->track, 0, &ass_changed);
+    *changed |= ass_changed;
 }
 
 void osd_object_get_bitmaps(struct osd_state *osd, struct osd_object *obj,
-                            struct sub_bitmaps *out_imgs)
+                            int format, struct sub_bitmaps *out_imgs)
 {
     if (obj->force_redraw && obj->type == OSDTYPE_OSD)
         update_osd(osd, obj);
 
-    append_ass(&obj->ass, &obj->vo_res, &obj->parts_cache);
-    for (int n = 0; n < obj->num_externals; n++)
-        append_ass(&obj->externals[n].ass, &obj->vo_res, &obj->parts_cache);
+    if (!obj->ass_packer)
+        obj->ass_packer = mp_ass_packer_alloc(obj);
+
+    MP_TARRAY_GROW(obj, obj->ass_imgs, obj->num_externals + 1);
+
+    append_ass(&obj->ass, &obj->vo_res, &obj->ass_imgs[0], &obj->changed);
+    for (int n = 0; n < obj->num_externals; n++) {
+        append_ass(&obj->externals[n].ass, &obj->vo_res, &obj->ass_imgs[n + 1],
+                   &obj->changed);
+    }
 
-    *out_imgs = obj->parts_cache;
+    mp_ass_packer_pack(obj->ass_packer, obj->ass_imgs, obj->num_externals + 1,
+                       obj->changed, format, out_imgs);
 
-    obj->parts_cache.change_id = 0;
-    obj->parts_cache.num_parts = 0;
+    obj->changed = false;
 }
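The osd_libass.c change above replaces the per-object `parts_cache` with a single `bool changed` flag: each `ass_render_frame()` call reports whether its output changed, the reports are OR-ed together, the combined bit is handed to the packer so it can skip work when nothing moved, and the flag is cleared for the next frame. A minimal stand-alone sketch of that dirty-flag pattern (`packer`, `pack`, and `get_bitmaps` are stand-ins for this sketch, not the real mpv functions):

```c
#include <assert.h>
#include <stdbool.h>

struct packer { int repacks; };

/* stand-in for mp_ass_packer_pack(): only repacks when something changed */
static void pack(struct packer *p, bool changed)
{
    if (changed)
        p->repacks++;
}

/* stand-in for one osd_object_get_bitmaps() pass over n ASS renderers:
 * OR together each renderer's change report, pack once, reset the flag */
static void get_bitmaps(struct packer *p, bool *obj_changed,
                        const bool *renderer_changed, int n)
{
    for (int i = 0; i < n; i++)
        *obj_changed |= renderer_changed[i];
    pack(p, *obj_changed);
    *obj_changed = false; // reset for the next frame
}
```

The point of OR-ing before packing is that a single changed renderer forces one repack of the whole atlas, while a fully static frame costs nothing.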
diff --git a/sub/osd_state.h b/sub/osd_state.h
index 0fff668..81bdbd6 100644
--- a/sub/osd_state.h
+++ b/sub/osd_state.h
@@ -5,8 +5,6 @@
 
 #include "osd.h"
 
-#define OSD_CONV_CACHE_MAX 4
-
 enum mp_osdtype {
     OSDTYPE_SUB,
     OSDTYPE_SUB2, // IDs must be numerically successive
@@ -48,17 +46,15 @@ struct osd_object {
     // OSDTYPE_EXTERNAL2
     struct sub_bitmaps *external2;
 
-    // caches for OSD conversion (internal to render_object())
-    struct osd_conv_cache *cache[OSD_CONV_CACHE_MAX];
-    struct sub_bitmaps cached;
-
     // VO cache state
     int vo_change_id;
     struct mp_osd_res vo_res;
 
     // Internally used by osd_libass.c
-    struct sub_bitmaps parts_cache;
+    bool changed;
     struct ass_state ass;
+    struct mp_ass_packer *ass_packer;
+    struct ass_image **ass_imgs;
 };
 
 struct osd_external {
diff --git a/sub/sd.h b/sub/sd.h
index fe64163..c8056d3 100644
--- a/sub/sd.h
+++ b/sub/sd.h
@@ -37,8 +37,8 @@ struct sd_functions {
     bool (*accepts_packet)(struct sd *sd); // implicit default if NULL: true
     int (*control)(struct sd *sd, enum sd_ctrl cmd, void *arg);
 
-    void (*get_bitmaps)(struct sd *sd, struct mp_osd_res dim, double pts,
-                        struct sub_bitmaps *res);
+    void (*get_bitmaps)(struct sd *sd, struct mp_osd_res dim, int format,
+                        double pts, struct sub_bitmaps *res);
     char *(*get_text)(struct sd *sd, double pts);
 };
 
diff --git a/sub/sd_ass.c b/sub/sd_ass.c
index 6f35053..34a49c1 100644
--- a/sub/sd_ass.c
+++ b/sub/sd_ass.c
@@ -44,7 +44,8 @@ struct sd_ass_priv {
     bool is_converted;
     struct lavc_conv *converter;
     bool on_top;
-    struct sub_bitmaps part_cache;
+    struct mp_ass_packer *packer;
+    struct sub_bitmap *bs;
     char last_text[500];
     struct mp_image_params video_params;
     struct mp_image_params last_params;
@@ -212,6 +213,8 @@ static int init(struct sd *sd)
 
     enable_output(sd, true);
 
+    ctx->packer = mp_ass_packer_alloc(ctx);
+
     return 0;
 }
 
@@ -417,8 +420,8 @@ static long long find_timestamp(struct sd *sd, double pts)
 
 #undef END
 
-static void get_bitmaps(struct sd *sd, struct mp_osd_res dim, double pts,
-                        struct sub_bitmaps *res)
+static void get_bitmaps(struct sd *sd, struct mp_osd_res dim, int format,
+                        double pts, struct sub_bitmaps *res)
 {
     struct sd_ass_priv *ctx = sd->priv;
     struct MPOpts *opts = sd->opts;
@@ -459,15 +462,18 @@ static void get_bitmaps(struct sd *sd, struct mp_osd_res dim, double pts,
     if (no_ass)
         fill_plaintext(sd, pts);
 
-    ctx->part_cache.change_id = 0;
-    ctx->part_cache.num_parts = 0;
-    mp_ass_render_frame(renderer, track, ts, &ctx->part_cache);
-    talloc_steal(ctx, ctx->part_cache.parts);
+    int changed;
+    ASS_Image *imgs = ass_render_frame(renderer, track, ts, &changed);
+    mp_ass_packer_pack(ctx->packer, &imgs, 1, changed, format, res);
 
-    if (!converted)
-        mangle_colors(sd, &ctx->part_cache);
+    if (!converted && res->num_parts > 0) {
+        // mangle_colors() modifies the color field, so work on a copy.
+        MP_TARRAY_GROW(ctx, ctx->bs, res->num_parts);
+        memcpy(ctx->bs, res->parts, sizeof(ctx->bs[0]) * res->num_parts);
+        res->parts = ctx->bs;
 
-    *res = ctx->part_cache;
+        mangle_colors(sd, res);
+    }
 }
 
 struct buf {
@@ -727,15 +733,17 @@ static void mangle_colors(struct sd *sd, struct sub_bitmaps *parts)
     struct mp_image_params params = ctx->video_params;
 
     if (force_601) {
-        params.colorspace = MP_CSP_BT_709;
-        params.colorlevels = MP_CSP_LEVELS_TV;
+        params.color = (struct mp_colorspace){
+            .space = MP_CSP_BT_709,
+            .levels = MP_CSP_LEVELS_TV,
+        };
     }
 
-    if (csp == params.colorspace && levels == params.colorlevels)
+    if (csp == params.color.space && levels == params.color.levels)
         return;
 
-    bool basic_conv = params.colorspace == MP_CSP_BT_709 &&
-                      params.colorlevels == MP_CSP_LEVELS_TV &&
+    bool basic_conv = params.color.space == MP_CSP_BT_709 &&
+                      params.color.levels == MP_CSP_LEVELS_TV &&
                       csp == MP_CSP_BT_601 &&
                       levels == MP_CSP_LEVELS_TV;
 
@@ -743,8 +751,8 @@ static void mangle_colors(struct sd *sd, struct sub_bitmaps *parts)
     if (opts->ass_vsfilter_color_compat == 1 && !basic_conv)
         return;
 
-    if (params.colorspace != ctx->last_params.colorspace ||
-        params.colorlevels != ctx->last_params.colorlevels)
+    if (params.color.space != ctx->last_params.color.space ||
+        params.color.levels != ctx->last_params.color.levels)
     {
         int msgl = basic_conv ? MSGL_V : MSGL_WARN;
         ctx->last_params = params;
@@ -752,22 +760,21 @@ static void mangle_colors(struct sd *sd, struct sub_bitmaps *parts)
                "RGB -> %s %s -> %s %s -> RGB\n",
                m_opt_choice_str(mp_csp_names, csp),
                m_opt_choice_str(mp_csp_levels_names, levels),
-               m_opt_choice_str(mp_csp_names, params.colorspace),
-               m_opt_choice_str(mp_csp_names, params.colorlevels));
+               m_opt_choice_str(mp_csp_names, params.color.space),
+               m_opt_choice_str(mp_csp_names, params.color.levels));
     }
 
     // Conversion that VSFilter would use
     struct mp_csp_params vs_params = MP_CSP_PARAMS_DEFAULTS;
-    vs_params.colorspace = csp;
-    vs_params.levels_in = levels;
+    vs_params.color.space = csp;
+    vs_params.color.levels = levels;
     struct mp_cmat vs_yuv2rgb, vs_rgb2yuv;
     mp_get_csp_matrix(&vs_params, &vs_yuv2rgb);
     mp_invert_cmat(&vs_rgb2yuv, &vs_yuv2rgb);
 
     // Proper conversion to RGB
     struct mp_csp_params rgb_params = MP_CSP_PARAMS_DEFAULTS;
-    rgb_params.colorspace = params.colorspace;
-    rgb_params.levels_in = params.colorlevels;
+    rgb_params.color = params.color;
     struct mp_cmat vs2rgb;
     mp_get_csp_matrix(&rgb_params, &vs2rgb);
 
diff --git a/sub/sd_lavc.c b/sub/sd_lavc.c
index 7f68a22..fca4374 100644
--- a/sub/sd_lavc.c
+++ b/sub/sd_lavc.c
@@ -422,8 +422,8 @@ static void decode(struct sd *sd, struct demux_packet *packet)
     }
 }
 
-static void get_bitmaps(struct sd *sd, struct mp_osd_res d, double pts,
-                        struct sub_bitmaps *res)
+static void get_bitmaps(struct sd *sd, struct mp_osd_res d, int format,
+                        double pts, struct sub_bitmaps *res)
 {
     struct sd_lavc_priv *priv = sd->priv;
     struct MPOpts *opts = sd->opts;
diff --git a/video/csputils.c b/video/csputils.c
index ffa1f82..ea55d4d 100644
--- a/video/csputils.c
+++ b/video/csputils.c
@@ -65,6 +65,7 @@ const struct m_opt_choice_alternatives mp_csp_prim_names[] = {
     {"prophoto",    MP_CSP_PRIM_PRO_PHOTO},
     {"cie1931",     MP_CSP_PRIM_CIE_1931},
     {"dci-p3",      MP_CSP_PRIM_DCI_P3},
+    {"v-gamut",     MP_CSP_PRIM_V_GAMUT},
     {0}
 };
 
@@ -78,6 +79,8 @@ const struct m_opt_choice_alternatives mp_csp_trc_names[] = {
     {"gamma2.8",    MP_CSP_TRC_GAMMA28},
     {"prophoto",    MP_CSP_TRC_PRO_PHOTO},
     {"st2084",      MP_CSP_TRC_SMPTE_ST2084},
+    {"std-b67",     MP_CSP_TRC_ARIB_STD_B67},
+    {"v-log",       MP_CSP_TRC_V_LOG},
     {0}
 };
 
@@ -171,8 +174,9 @@ enum mp_csp_trc avcol_trc_to_mp_csp_trc(int avtrc)
     case AVCOL_TRC_LINEAR:       return MP_CSP_TRC_LINEAR;
     case AVCOL_TRC_GAMMA22:      return MP_CSP_TRC_GAMMA22;
     case AVCOL_TRC_GAMMA28:      return MP_CSP_TRC_GAMMA28;
-#if HAVE_AVUTIL_ST2084
+#if HAVE_AVUTIL_HDR
     case AVCOL_TRC_SMPTEST2084:  return MP_CSP_TRC_SMPTE_ST2084;
+    case AVCOL_TRC_ARIB_STD_B67: return MP_CSP_TRC_ARIB_STD_B67;
 #endif
     default:                     return MP_CSP_TRC_AUTO;
     }
@@ -222,8 +226,9 @@ int mp_csp_trc_to_avcol_trc(enum mp_csp_trc trc)
     case MP_CSP_TRC_LINEAR:       return AVCOL_TRC_LINEAR;
     case MP_CSP_TRC_GAMMA22:      return AVCOL_TRC_GAMMA22;
     case MP_CSP_TRC_GAMMA28:      return AVCOL_TRC_GAMMA28;
-#if HAVE_AVUTIL_ST2084
+#if HAVE_AVUTIL_HDR
     case MP_CSP_TRC_SMPTE_ST2084: return AVCOL_TRC_SMPTEST2084;
+    case MP_CSP_TRC_ARIB_STD_B67: return AVCOL_TRC_ARIB_STD_B67;
 #endif
     default:                      return AVCOL_TRC_UNSPECIFIED;
     }
@@ -419,11 +424,45 @@ struct mp_csp_primaries mp_get_csp_primaries(enum mp_csp_prim spc)
             .blue  = {0.150, 0.060},
             .white = d65
         };
+    // From Panasonic VARICAM reference manual
+    case MP_CSP_PRIM_V_GAMUT:
+        return (struct mp_csp_primaries) {
+            .red   = {0.730, 0.280},
+            .green = {0.165, 0.840},
+            .blue  = {0.100, -0.03},
+            .white = d65
+        };
     default:
         return (struct mp_csp_primaries) {{0}};
     }
 }
 
+// Get the nominal peak for a given colorspace, based on a known reference
+// peak (i.e. the displayed brightness of a reference white illuminant; this
+// may or may not be the actual signal peak).
+float mp_csp_trc_nom_peak(enum mp_csp_trc trc, float ref_peak)
+{
+    switch (trc) {
+    case MP_CSP_TRC_SMPTE_ST2084: return 10000; // fixed peak
+    case MP_CSP_TRC_ARIB_STD_B67: return 12.0 * ref_peak;
+    case MP_CSP_TRC_V_LOG:        return 46.0855 * ref_peak;
+    }
+
+    return ref_peak;
+}
+
+bool mp_trc_is_hdr(enum mp_csp_trc trc)
+{
+    switch (trc) {
+    case MP_CSP_TRC_SMPTE_ST2084:
+    case MP_CSP_TRC_ARIB_STD_B67:
+    case MP_CSP_TRC_V_LOG:
+        return true;
+    }
+
+    return false;
+}
+
 // Compute the RGB/XYZ matrix as described here:
 // http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
 static void mp_get_rgb2xyz_matrix(struct mp_csp_primaries space, float m[3][3])
@@ -506,7 +545,7 @@ static void mp_apply_chromatic_adaptation(struct mp_csp_col_xy src,
     mp_mul_matrix3x3(m, tmp);
 }
 
-// get the coefficients of the source -> bt2020 cms matrix
+// get the coefficients of the source -> dest cms matrix
 void mp_get_cms_matrix(struct mp_csp_primaries src, struct mp_csp_primaries dest,
                        enum mp_render_intent intent, float m[3][3])
 {
@@ -543,7 +582,7 @@ void mp_get_cms_matrix(struct mp_csp_primaries src, struct mp_csp_primaries dest
 static void mp_get_xyz2rgb_coeffs(struct mp_csp_params *params,
                                   enum mp_render_intent intent, struct mp_cmat *m)
 {
-    struct mp_csp_primaries prim = mp_get_csp_primaries(params->primaries);
+    struct mp_csp_primaries prim = mp_get_csp_primaries(params->color.primaries);
     float brightness = params->brightness;
     mp_get_rgb2xyz_matrix(prim, m->m);
     mp_invert_matrix3x3(m->m);
@@ -620,10 +659,10 @@ static void luma_coeffs(struct mp_cmat *mat, float lr, float lg, float lb)
 // get the coefficients of the yuv -> rgb conversion matrix
 void mp_get_csp_matrix(struct mp_csp_params *params, struct mp_cmat *m)
 {
-    int colorspace = params->colorspace;
+    enum mp_csp colorspace = params->color.space;
     if (colorspace <= MP_CSP_AUTO || colorspace >= MP_CSP_COUNT)
         colorspace = MP_CSP_BT_601;
-    int levels_in = params->levels_in;
+    enum mp_csp_levels levels_in = params->color.levels;
     if (levels_in <= MP_CSP_LEVELS_AUTO || levels_in >= MP_CSP_LEVELS_COUNT)
         levels_in = MP_CSP_LEVELS_TV;
 
@@ -682,6 +721,10 @@ void mp_get_csp_matrix(struct mp_csp_params *params, struct mp_cmat *m)
     // The values below are written in 0-255 scale - thus bring s into range.
     double s =
         mp_get_csp_mul(colorspace, params->input_bits, params->texture_bits) / 255;
+    // NOTE: The yuvfull ranges as presented here are arguably ambiguous,
+    // and conflict with at least the full-range YCbCr/ICtCp values as defined
+    // by ITU-R BT.2100. If somebody ever complains about full-range YUV looking
+    // different from their reference display, this comment is probably why.
     struct yuvlevels { double ymin, ymax, cmin, cmid; }
         yuvlim =  { 16*s, 235*s, 16*s, 128*s },
         yuvfull = {  0*s, 255*s,  1*s, 128*s },  // '1' for symmetry around 128
@@ -734,9 +777,17 @@ void mp_csp_set_image_params(struct mp_csp_params *params,
 {
     struct mp_image_params p = *imgparams;
     mp_image_params_guess_csp(&p); // ensure consistency
-    params->colorspace = p.colorspace;
-    params->levels_in = p.colorlevels;
-    params->primaries = p.primaries;
+    params->color = p.color;
+}
+
+bool mp_colorspace_equal(struct mp_colorspace c1, struct mp_colorspace c2)
+{
+    return c1.space == c2.space &&
+           c1.levels == c2.levels &&
+           c1.primaries == c2.primaries &&
+           c1.gamma == c2.gamma &&
+           c1.sig_peak == c2.sig_peak &&
+           c1.nom_peak == c2.nom_peak;
 }
 
 // Copy settings from eq into params.
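The two new helpers in csputils.c are small enough to sanity-check in isolation. A standalone mirror of their logic and constants (the enum here is a reduced stand-in; only the transfer functions relevant to the diff are included):

```c
#include <assert.h>
#include <stdbool.h>

enum mp_csp_trc { MP_CSP_TRC_GAMMA22, MP_CSP_TRC_SMPTE_ST2084,
                  MP_CSP_TRC_ARIB_STD_B67, MP_CSP_TRC_V_LOG };

/* mirrors mp_csp_trc_nom_peak(): scale a reference-white peak up to the
 * nominal peak of the transfer curve; SDR curves pass it through */
static float trc_nom_peak(enum mp_csp_trc trc, float ref_peak)
{
    switch (trc) {
    case MP_CSP_TRC_SMPTE_ST2084: return 10000;            // fixed peak
    case MP_CSP_TRC_ARIB_STD_B67: return 12.0 * ref_peak;
    case MP_CSP_TRC_V_LOG:        return 46.0855 * ref_peak;
    default:                      return ref_peak;
    }
}

/* mirrors mp_trc_is_hdr(): exactly the curves with a boosted nominal peak */
static bool trc_is_hdr(enum mp_csp_trc trc)
{
    return trc == MP_CSP_TRC_SMPTE_ST2084 ||
           trc == MP_CSP_TRC_ARIB_STD_B67 ||
           trc == MP_CSP_TRC_V_LOG;
}
```

Note the invariant the two functions share: a curve is "HDR" precisely when `trc_nom_peak()` can return something other than `ref_peak`.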
diff --git a/video/csputils.h b/video/csputils.h
index 19dd88f..0406ddf 100644
--- a/video/csputils.h
+++ b/video/csputils.h
@@ -64,6 +64,7 @@ enum mp_csp_prim {
     MP_CSP_PRIM_PRO_PHOTO,
     MP_CSP_PRIM_CIE_1931,
     MP_CSP_PRIM_DCI_P3,
+    MP_CSP_PRIM_V_GAMUT,
     MP_CSP_PRIM_COUNT
 };
 
@@ -79,6 +80,8 @@ enum mp_csp_trc {
     MP_CSP_TRC_GAMMA28,
     MP_CSP_TRC_PRO_PHOTO,
     MP_CSP_TRC_SMPTE_ST2084,
+    MP_CSP_TRC_ARIB_STD_B67,
+    MP_CSP_TRC_V_LOG,
     MP_CSP_TRC_COUNT
 };
 
@@ -113,11 +116,18 @@ extern const struct m_opt_choice_alternatives mp_stereo3d_names[];
 #define MP_STEREO3D_NAME_DEF(x, def) \
     (MP_STEREO3D_NAME(x) ? MP_STEREO3D_NAME(x) : (def))
 
-struct mp_csp_params {
-    enum mp_csp colorspace;
-    enum mp_csp_levels levels_in;      // encoded video
-    enum mp_csp_levels levels_out;     // output device
+struct mp_colorspace {
+    enum mp_csp space;
+    enum mp_csp_levels levels;
     enum mp_csp_prim primaries;
+    enum mp_csp_trc gamma;
+    float nom_peak; // nominal (absolute) peak. 0 = auto/unknown
+    float sig_peak; // signal peak, highest value that occurs in the source
+};
+
+struct mp_csp_params {
+    struct mp_colorspace color; // input colorspace
+    enum mp_csp_levels levels_out; // output device
     float brightness;
     float contrast;
     float hue;
@@ -131,9 +141,8 @@ struct mp_csp_params {
 };
 
 #define MP_CSP_PARAMS_DEFAULTS {                                \
-    .colorspace = MP_CSP_BT_601,                                \
-    .levels_in = MP_CSP_LEVELS_TV,                              \
-    .primaries = MP_CSP_PRIM_AUTO,                              \
+    .color = { .space = MP_CSP_BT_601,                          \
+               .levels = MP_CSP_LEVELS_TV },                    \
     .levels_out = MP_CSP_LEVELS_PC,                             \
     .brightness = 0, .contrast = 1, .hue = 0, .saturation = 1,  \
     .gamma = 1, .texture_bits = 8, .input_bits = 8}
@@ -142,6 +151,8 @@ struct mp_image_params;
 void mp_csp_set_image_params(struct mp_csp_params *params,
                              const struct mp_image_params *imgparams);
 
+bool mp_colorspace_equal(struct mp_colorspace c1, struct mp_colorspace c2);
+
 enum mp_chroma_location {
     MP_CHROMA_AUTO,
     MP_CHROMA_LEFT,     // mpeg2/4, h264
@@ -193,27 +204,19 @@ struct mp_csp_primaries {
 
 void mp_csp_copy_equalizer_values(struct mp_csp_params *params,
                                   const struct mp_csp_equalizer *eq);
-
 int mp_csp_equalizer_set(struct mp_csp_equalizer *eq, const char *property,
                          int value);
-
 int mp_csp_equalizer_get(struct mp_csp_equalizer *eq, const char *property,
                          int *out_value);
 
 enum mp_csp avcol_spc_to_mp_csp(int avcolorspace);
-
 enum mp_csp_levels avcol_range_to_mp_csp_levels(int avrange);
-
 enum mp_csp_prim avcol_pri_to_mp_csp_prim(int avpri);
-
 enum mp_csp_trc avcol_trc_to_mp_csp_trc(int avtrc);
 
 int mp_csp_to_avcol_spc(enum mp_csp colorspace);
-
 int mp_csp_levels_to_avcol_range(enum mp_csp_levels range);
-
 int mp_csp_prim_to_avcol_pri(enum mp_csp_prim prim);
-
 int mp_csp_trc_to_avcol_trc(enum mp_csp_trc trc);
 
 enum mp_csp mp_csp_guess_colorspace(int width, int height);
@@ -221,10 +224,11 @@ enum mp_csp_prim mp_csp_guess_primaries(int width, int height);
 
 enum mp_chroma_location avchroma_location_to_mp(int avloc);
 int mp_chroma_location_to_av(enum mp_chroma_location mploc);
-
 void mp_get_chroma_location(enum mp_chroma_location loc, int *x, int *y);
 
 struct mp_csp_primaries mp_get_csp_primaries(enum mp_csp_prim csp);
+float mp_csp_trc_nom_peak(enum mp_csp_trc trc, float ref_peak);
+bool mp_trc_is_hdr(enum mp_csp_trc trc);
 
 /* Color conversion matrix: RGB = m * YUV + c
  * m is in row-major matrix, with m[row][col], e.g.:
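With the colorspace fields now grouped into `struct mp_colorspace`, callers compare whole colorspaces via the new `mp_colorspace_equal()`. A stripped-down sketch of the struct and the field-by-field comparison (enums stubbed as `int` for this sketch; the field list matches the csputils.h additions):

```c
#include <assert.h>
#include <stdbool.h>

struct colorspace {
    int space, levels, primaries, gamma; // enum stand-ins
    float nom_peak, sig_peak;
};

/* mirrors mp_colorspace_equal(): every field must match, including the
 * two float peaks added for HDR */
static bool colorspace_equal(struct colorspace a, struct colorspace b)
{
    return a.space == b.space && a.levels == b.levels &&
           a.primaries == b.primaries && a.gamma == b.gamma &&
           a.nom_peak == b.nom_peak && a.sig_peak == b.sig_peak;
}
```

Including `sig_peak`/`nom_peak` in the comparison means a mid-stream change in HDR metadata counts as a colorspace change, which is what forces downstream reconfiguration.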
diff --git a/video/decode/d3d.c b/video/decode/d3d.c
index b978472..b50f3a0 100644
--- a/video/decode/d3d.c
+++ b/video/decode/d3d.c
@@ -24,6 +24,7 @@
 #include "common/av_common.h"
 #include "video/fmt-conversion.h"
 #include "video/mp_image.h"
+#include "video/mp_image_pool.h"
 #include "osdep/windows_utils.h"
 
 #include "d3d.h"
@@ -68,7 +69,7 @@ struct d3dva_mode {
 
 #define MODE2(id) &MP_CONCAT(DXVA2_Mode, id), # id
 #define  MODE(id) &MP_CONCAT(DXVA_,      id), # id
-// Prefered modes must come first
+// Preferred modes must come first
 static const struct d3dva_mode d3dva_modes[] = {
     // MPEG-1/2
     {MODE2(MPEG2_VLD),        AV_CODEC_ID_MPEG2VIDEO, PROF_MPEG2_MAIN},
@@ -266,3 +267,95 @@ void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
     buf.stride[1] = src_pitch;
     mp_image_copy_gpu(dest, &buf);
 }
+
+// Test if Direct3D11 can be used by us. Basically, this prevents trying to use
+// D3D11 on Win7, and then failing somewhere in the process.
+bool d3d11_check_decoding(ID3D11Device *dev)
+{
+    HRESULT hr;
+    // We assume that NV12 is always supported if hw decoding is supported
+    // at all.
+    UINT supported = 0;
+    hr = ID3D11Device_CheckFormatSupport(dev, DXGI_FORMAT_NV12, &supported);
+    return !FAILED(hr) && (supported & D3D11_BIND_DECODER);
+}
+
+static int get_dxgi_mpfmt(DWORD dxgi_fmt)
+{
+    switch (dxgi_fmt) {
+    case DXGI_FORMAT_NV12: return IMGFMT_NV12;
+    case DXGI_FORMAT_P010: return IMGFMT_P010;
+    case DXGI_FORMAT_P016: return IMGFMT_P010;
+    }
+    return 0;
+}
+
+struct mp_image *d3d11_download_image(struct mp_hwdec_ctx *ctx,
+                                      struct mp_image *mpi,
+                                      struct mp_image_pool *swpool)
+{
+    HRESULT hr;
+    ID3D11Device *device = ctx->ctx;
+
+    if (mpi->imgfmt != IMGFMT_D3D11VA && mpi->imgfmt != IMGFMT_D3D11NV12)
+        return NULL;
+
+    ID3D11Texture2D *texture = (void *)mpi->planes[1];
+    int subindex = (intptr_t)mpi->planes[2];
+    if (!texture)
+        return NULL;
+
+    D3D11_TEXTURE2D_DESC tex_desc;
+    ID3D11Texture2D_GetDesc(texture, &tex_desc);
+    int mpfmt = get_dxgi_mpfmt(tex_desc.Format);
+    if (!mpfmt)
+        return NULL;
+
+    // create staging texture shared with the CPU with mostly the same
+    // parameters as the source texture
+    tex_desc.MipLevels      = 1;
+    tex_desc.MiscFlags      = 0;
+    tex_desc.ArraySize      = 1;
+    tex_desc.Usage          = D3D11_USAGE_STAGING;
+    tex_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+    tex_desc.BindFlags      = 0;
+    ID3D11Texture2D *staging = NULL;
+    hr = ID3D11Device_CreateTexture2D(device, &tex_desc, NULL, &staging);
+    if (FAILED(hr))
+        return NULL;
+
+    bool ok = false;
+    struct mp_image *sw_img = NULL;
+    ID3D11DeviceContext *device_ctx = NULL;
+    ID3D11Device_GetImmediateContext(device, &device_ctx);
+
+    // copy to the staging texture
+    ID3D11DeviceContext_CopySubresourceRegion(
+        device_ctx,
+        (ID3D11Resource *)staging, 0, 0, 0, 0,
+        (ID3D11Resource *)texture, subindex, NULL);
+
+    sw_img = mp_image_pool_get(swpool, mpfmt, tex_desc.Width, tex_desc.Height);
+    if (!sw_img)
+        goto done;
+
+    // copy staging texture to the cpu mp_image
+    D3D11_MAPPED_SUBRESOURCE lock;
+    hr = ID3D11DeviceContext_Map(device_ctx, (ID3D11Resource *)staging,
+                                 0, D3D11_MAP_READ, 0, &lock);
+    if (FAILED(hr))
+        goto done;
+    copy_nv12(sw_img, lock.pData, lock.RowPitch, tex_desc.Height);
+    ID3D11DeviceContext_Unmap(device_ctx, (ID3D11Resource *)staging, 0);
+
+    mp_image_set_size(sw_img, mpi->w, mpi->h);
+    mp_image_copy_attributes(sw_img, mpi);
+    ok = true;
+
+done:
+    ID3D11Texture2D_Release(staging);
+    ID3D11DeviceContext_Release(device_ctx);
+    if (!ok)
+        mp_image_unrefp(&sw_img);
+    return sw_img;
+}
diff --git a/video/decode/d3d.h b/video/decode/d3d.h
index 15c423a..6caeb2d 100644
--- a/video/decode/d3d.h
+++ b/video/decode/d3d.h
@@ -19,6 +19,9 @@
 #define MPV_DECODE_D3D_H
 
 #include <windows.h>
+#include <d3d11.h>
+
+#include <stdbool.h>
 #include <inttypes.h>
 
 struct mp_image;
@@ -62,4 +65,10 @@ BOOL is_clearvideo(const GUID *mode_guid);
 void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
                unsigned src_pitch, unsigned surf_height);
 
+bool d3d11_check_decoding(ID3D11Device *dev);
+
+struct mp_image *d3d11_download_image(struct mp_hwdec_ctx *ctx,
+                                      struct mp_image *mpi,
+                                      struct mp_image_pool *swpool);
+
 #endif
diff --git a/video/decode/lavc.h b/video/decode/lavc.h
index 689222d..993c3ec 100644
--- a/video/decode/lavc.h
+++ b/video/decode/lavc.h
@@ -25,6 +25,9 @@ typedef struct lavc_ctx {
     bool hwdec_failed;
     bool hwdec_notified;
 
+    // For HDR side-data caching
+    double cached_hdr_peak;
+
     struct mp_image **delay_queue;
     int num_delay_queue;
     int max_delay_queue;
diff --git a/video/decode/vd_lavc.c b/video/decode/vd_lavc.c
index fbb04d1..5962f88 100644
--- a/video/decode/vd_lavc.c
+++ b/video/decode/vd_lavc.c
@@ -47,6 +47,10 @@
 #include "video/csputils.h"
 #include "video/sws_utils.h"
 
+#if HAVE_AVUTIL_MASTERING_METADATA
+#include <libavutil/mastering_display_metadata.h>
+#endif
+
 #include "lavc.h"
 
 #if AVPALETTE_SIZE != MP_PALETTE_SIZE
@@ -129,17 +133,21 @@ extern const struct vd_lavc_hwdec mp_vd_lavc_dxva2_copy;
 extern const struct vd_lavc_hwdec mp_vd_lavc_d3d11va;
 extern const struct vd_lavc_hwdec mp_vd_lavc_d3d11va_copy;
 
+#if HAVE_RPI
 static const struct vd_lavc_hwdec mp_vd_lavc_rpi = {
     .type = HWDEC_RPI,
     .lavc_suffix = "_mmal",
     .image_format = IMGFMT_MMAL,
 };
+#endif
 
+#if HAVE_ANDROID
 static const struct vd_lavc_hwdec mp_vd_lavc_mediacodec = {
     .type = HWDEC_MEDIACODEC,
     .lavc_suffix = "_mediacodec",
     .copying = true,
 };
+#endif
 
 static const struct vd_lavc_hwdec *const hwdec_list[] = {
 #if HAVE_RPI
@@ -568,16 +576,39 @@ static void update_image_params(struct dec_video *vd, AVFrame *frame,
     vd_ffmpeg_ctx *ctx = vd->priv;
     struct MPOpts *opts = ctx->opts;
 
+#if HAVE_AVUTIL_MASTERING_METADATA
+    // Get the reference peak (for HDR) if available. This is cached into ctx
+    // when it's found, since it's not available on every frame (and seems to
+    // be only available for keyframes)
+    AVFrameSideData *sd = av_frame_get_side_data(frame,
+                          AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
+    if (sd) {
+        AVMasteringDisplayMetadata *mdm = (AVMasteringDisplayMetadata *)sd->data;
+        if (mdm->has_luminance) {
+            double peak = av_q2d(mdm->max_luminance);
+            if (!isnormal(peak) || peak < 10 || peak > 100000) {
+                // Invalid data, ignore it. Sadly necessary
+                MP_WARN(vd, "Invalid HDR reference peak in stream: %f\n", peak);
+            } else {
+                ctx->cached_hdr_peak = peak;
+            }
+        }
+    }
+#endif
+
     *out_params = (struct mp_image_params) {
         .imgfmt = pixfmt2imgfmt(frame->format),
         .w = frame->width,
         .h = frame->height,
         .p_w = frame->sample_aspect_ratio.num,
         .p_h = frame->sample_aspect_ratio.den,
-        .colorspace = avcol_spc_to_mp_csp(ctx->avctx->colorspace),
-        .colorlevels = avcol_range_to_mp_csp_levels(ctx->avctx->color_range),
-        .primaries = avcol_pri_to_mp_csp_prim(ctx->avctx->color_primaries),
-        .gamma = avcol_trc_to_mp_csp_trc(ctx->avctx->color_trc),
+        .color = {
+            .space = avcol_spc_to_mp_csp(ctx->avctx->colorspace),
+            .levels = avcol_range_to_mp_csp_levels(ctx->avctx->color_range),
+            .primaries = avcol_pri_to_mp_csp_prim(ctx->avctx->color_primaries),
+            .gamma = avcol_trc_to_mp_csp_trc(ctx->avctx->color_trc),
+            .sig_peak = ctx->cached_hdr_peak,
+        },
         .chroma_location =
             avchroma_location_to_mp(ctx->avctx->chroma_sample_location),
         .rotate = vd->codec->rotate,
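The mastering-display block in `update_image_params()` above only caches a luminance value after rejecting implausible ones. The predicate, extracted on its own (the 10..100000 cd/m² bounds are taken from the diff; `valid_hdr_peak` is a name invented for this sketch):

```c
#include <assert.h>
#include <math.h>
#include <stdbool.h>

/* A peak is cached only if it is a normal finite number within a
 * plausible mastering-luminance range; everything else is logged and
 * ignored, matching the !isnormal(peak) || peak < 10 || peak > 100000
 * rejection in the diff. */
static bool valid_hdr_peak(double peak)
{
    return isnormal(peak) && peak >= 10 && peak <= 100000;
}
```

Caching the last valid peak (rather than reading it per frame) matters because, as the comment in the diff notes, the side data tends to appear only on keyframes.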
diff --git a/video/filter/vf.c b/video/filter/vf.c
index 176ac95..274ca94 100644
--- a/video/filter/vf.c
+++ b/video/filter/vf.c
@@ -227,6 +227,8 @@ void vf_print_filter_chain(struct vf_chain *c, int msglevel,
     for (vf_instance_t *f = c->first; f; f = f->next) {
         char b[128] = {0};
         mp_snprintf_cat(b, sizeof(b), "  [%s] ", f->info->name);
+        if (f->label)
+            mp_snprintf_cat(b, sizeof(b), "\"%s\" ", f->label);
         mp_snprintf_cat(b, sizeof(b), "%s", mp_image_params_to_str(&f->fmt_out));
         if (f->autoinserted)
             mp_snprintf_cat(b, sizeof(b), " [a]");
@@ -298,6 +300,7 @@ void vf_remove_filter(struct vf_chain *c, struct vf_instance *vf)
     assert(prev); // not inserted
     prev->next = vf->next;
     vf_uninit_filter(vf);
+    c->initialized = 0;
 }
 
 struct vf_instance *vf_append_filter(struct vf_chain *c, const char *name,
@@ -312,6 +315,7 @@ struct vf_instance *vf_append_filter(struct vf_chain *c, const char *name,
             pprev = &(*pprev)->next;
         vf->next = *pprev ? *pprev : NULL;
         *pprev = vf;
+        c->initialized = 0;
     }
     return vf;
 }
@@ -652,7 +656,7 @@ int vf_reconfig(struct vf_chain *c, const struct mp_image_params *params)
     mp_msg(c->log, loglevel, "Video filter chain:\n");
     vf_print_filter_chain(c, loglevel, failing);
     if (r < 0)
-        c->input_params = c->output_params = (struct mp_image_params){0};
+        c->output_params = (struct mp_image_params){0};
     return r;
 }
 
diff --git a/video/filter/vf_d3d11vpp.c b/video/filter/vf_d3d11vpp.c
index a0aa0ed..6faf712 100644
--- a/video/filter/vf_d3d11vpp.c
+++ b/video/filter/vf_d3d11vpp.c
@@ -211,21 +211,21 @@ static int recreate_video_proc(struct vf_instance *vf)
                                                          FALSE, 0);
 
     D3D11_VIDEO_PROCESSOR_COLOR_SPACE csp = {
-        .YCbCr_Matrix = p->params.colorspace != MP_CSP_BT_601,
-        .Nominal_Range = p->params.colorlevels == MP_CSP_LEVELS_TV ? 1 : 2,
+        .YCbCr_Matrix = p->params.color.space != MP_CSP_BT_601,
+        .Nominal_Range = p->params.color.levels == MP_CSP_LEVELS_TV ? 1 : 2,
     };
     ID3D11VideoContext_VideoProcessorSetStreamColorSpace(p->video_ctx,
                                                          p->video_proc,
                                                          0, &csp);
     if (p->out_rgb) {
-        if (p->params.colorspace != MP_CSP_BT_601 &&
-            p->params.colorspace != MP_CSP_BT_709)
+        if (p->params.color.space != MP_CSP_BT_601 &&
+            p->params.color.space != MP_CSP_BT_709)
         {
             MP_WARN(vf, "Unsupported video colorspace (%s/%s). Consider "
                     "disabling hardware decoding, or using "
                     "--hwdec=d3d11va-copy to get correct output.\n",
-                    m_opt_choice_str(mp_csp_names, p->params.colorspace),
-                    m_opt_choice_str(mp_csp_levels_names, p->params.colorlevels));
+                    m_opt_choice_str(mp_csp_names, p->params.color.space),
+                    m_opt_choice_str(mp_csp_levels_names, p->params.color.levels));
         }
     } else {
         ID3D11VideoContext_VideoProcessorSetOutputColorSpace(p->video_ctx,
@@ -331,11 +331,6 @@ static int render(struct vf_instance *vf)
         goto cleanup;
     }
 
-    // Make sure the texture is updated correctly on the shared context.
-    // (I'm not sure if this is correct, though it won't harm.)
-    if (p->out_shared)
-        ID3D11DeviceContext_Flush(p->device_ctx);
-
     res = 0;
 cleanup:
     if (in_view)
@@ -360,8 +355,11 @@ static int filter_out(struct vf_instance *vf)
 
     // no filtering
     if (!mp_refqueue_should_deint(p->queue) && !p->require_filtering) {
-        struct mp_image *in = mp_refqueue_get(p->queue, 0);
-        vf_add_output_frame(vf, mp_image_new_ref(in));
+        struct mp_image *in = mp_image_new_ref(mp_refqueue_get(p->queue, 0));
+        if (!in)
+            return -1;
+        mp_image_set_params(in, &p->out_params);
+        vf_add_output_frame(vf, in);
         mp_refqueue_next(p->queue);
         return 0;
     }
diff --git a/video/filter/vf_format.c b/video/filter/vf_format.c
index 109fda4..d406d98 100644
--- a/video/filter/vf_format.c
+++ b/video/filter/vf_format.c
@@ -88,15 +88,15 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
     if (p->outfmt)
         out->imgfmt = p->outfmt;
     if (p->colormatrix)
-        out->colorspace = p->colormatrix;
+        out->color.space = p->colormatrix;
     if (p->colorlevels)
-        out->colorlevels = p->colorlevels;
+        out->color.levels = p->colorlevels;
     if (p->primaries)
-        out->primaries = p->primaries;
+        out->color.primaries = p->primaries;
     if (p->gamma)
-        out->gamma = p->gamma;
+        out->color.gamma = p->gamma;
     if (p->peak)
-        out->peak = p->peak;
+        out->color.sig_peak = p->peak;
     if (p->chroma_location)
         out->chroma_location = p->chroma_location;
     if (p->stereo_in)
diff --git a/video/filter/vf_scale.c b/video/filter/vf_scale.c
index 518ff41..0b233e7 100644
--- a/video/filter/vf_scale.c
+++ b/video/filter/vf_scale.c
@@ -166,8 +166,8 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
     struct mp_imgfmt_desc d_fmt = mp_imgfmt_get_desc(out->imgfmt);
     // keep colorspace settings if the data stays in yuv
     if (!(s_fmt.flags & MP_IMGFLAG_YUV) || !(d_fmt.flags & MP_IMGFLAG_YUV)) {
-        out->colorspace = MP_CSP_AUTO;
-        out->colorlevels = MP_CSP_LEVELS_AUTO;
+        out->color.space = MP_CSP_AUTO;
+        out->color.levels = MP_CSP_LEVELS_AUTO;
     }
     mp_image_params_guess_csp(out);
 
diff --git a/video/filter/vf_vapoursynth.c b/video/filter/vf_vapoursynth.c
index 5592e03..625d539 100644
--- a/video/filter/vf_vapoursynth.c
+++ b/video/filter/vf_vapoursynth.c
@@ -143,13 +143,13 @@ static void copy_mp_to_vs_frame_props_map(struct vf_priv_s *p, VSMap *map,
     struct mp_image_params *params = &img->params;
     p->vsapi->propSetInt(map, "_SARNum", params->p_w, 0);
     p->vsapi->propSetInt(map, "_SARDen", params->p_h, 0);
-    if (params->colorlevels) {
+    if (params->color.levels) {
         p->vsapi->propSetInt(map, "_ColorRange",
-                params->colorlevels == MP_CSP_LEVELS_TV, 0);
+                params->color.levels == MP_CSP_LEVELS_TV, 0);
     }
     // The docs explicitly say it uses libavcodec values.
     p->vsapi->propSetInt(map, "_ColorSpace",
-            mp_csp_to_avcol_spc(params->colorspace), 0);
+            mp_csp_to_avcol_spc(params->color.space), 0);
     if (params->chroma_location) {
         p->vsapi->propSetInt(map, "_ChromaLocation",
                 params->chroma_location == MP_CHROMA_CENTER, 0);
diff --git a/video/filter/vf_vavpp.c b/video/filter/vf_vavpp.c
index 0365b55..b24f886 100644
--- a/video/filter/vf_vavpp.c
+++ b/video/filter/vf_vavpp.c
@@ -168,7 +168,7 @@ static struct mp_image *render(struct vf_instance *vf)
     mp_image_set_size(img, in->w, in->h);
     mp_image_copy_attributes(img, in);
 
-    unsigned int flags = va_get_colorspace_flag(p->params.colorspace);
+    unsigned int flags = va_get_colorspace_flag(p->params.color.space);
     if (!mp_refqueue_is_interlaced(p->queue)) {
         flags |= VA_FRAME_PICTURE;
     } else if (mp_refqueue_is_top_field(p->queue)) {
diff --git a/video/gpu_memcpy.c b/video/gpu_memcpy.c
index 355da0e..542fbc8 100644
--- a/video/gpu_memcpy.c
+++ b/video/gpu_memcpy.c
@@ -83,7 +83,7 @@ void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
         xmm15 = _mm_stream_load_si128(pSrc + 15);
 #endif
         pSrc += regsInLoop;
-        // _mm_store_si128 emit the SSE2 intruction MOVDQA (aligned store)
+        // _mm_store_si128 emit the SSE2 instruction MOVDQA (aligned store)
         _mm_store_si128(pTrg     , xmm0);
         _mm_store_si128(pTrg +  1, xmm1);
         _mm_store_si128(pTrg +  2, xmm2);
diff --git a/video/image_writer.c b/video/image_writer.c
index 5ba89c8..6b2f0f9 100644
--- a/video/image_writer.c
+++ b/video/image_writer.c
@@ -133,8 +133,8 @@ static bool write_lavc(struct image_writer_ctx *ctx, mp_image_t *image, FILE *fp
     pic->width = avctx->width;
     pic->height = avctx->height;
     if (ctx->opts->tag_csp) {
-        pic->color_primaries = mp_csp_prim_to_avcol_pri(image->params.primaries);
-        pic->color_trc = mp_csp_trc_to_avcol_trc(image->params.gamma);
+        pic->color_primaries = mp_csp_prim_to_avcol_pri(image->params.color.primaries);
+        pic->color_trc = mp_csp_trc_to_avcol_trc(image->params.color.gamma);
     }
 
 #if HAVE_AVCODEC_NEW_CODEC_API
diff --git a/video/mp_image.c b/video/mp_image.c
index d5b9748..a4ce6d1 100644
--- a/video/mp_image.c
+++ b/video/mp_image.c
@@ -393,11 +393,13 @@ void mp_image_copy_attributes(struct mp_image *dst, struct mp_image *src)
         dst->params.p_w = src->params.p_w;
         dst->params.p_h = src->params.p_h;
     }
-    dst->params.primaries = src->params.primaries;
-    dst->params.gamma = src->params.gamma;
+    dst->params.color.primaries = src->params.color.primaries;
+    dst->params.color.gamma = src->params.color.gamma;
+    dst->params.color.nom_peak = src->params.color.nom_peak;
+    dst->params.color.sig_peak = src->params.color.sig_peak;
     if ((dst->fmt.flags & MP_IMGFLAG_YUV) == (src->fmt.flags & MP_IMGFLAG_YUV)) {
-        dst->params.colorspace = src->params.colorspace;
-        dst->params.colorlevels = src->params.colorlevels;
+        dst->params.color.space = src->params.color.space;
+        dst->params.color.levels = src->params.color.levels;
         dst->params.chroma_location = src->params.chroma_location;
     }
     mp_image_params_guess_csp(&dst->params); // ensure colorspace consistency
@@ -512,8 +514,8 @@ char *mp_image_params_to_str_buf(char *b, size_t bs,
         if (p->hw_subfmt)
             mp_snprintf_cat(b, bs, "[%llu]", (unsigned long long)(p->hw_subfmt));
         mp_snprintf_cat(b, bs, " %s/%s",
-                        m_opt_choice_str(mp_csp_names, p->colorspace),
-                        m_opt_choice_str(mp_csp_levels_names, p->colorlevels));
+                        m_opt_choice_str(mp_csp_names, p->color.space),
+                        m_opt_choice_str(mp_csp_levels_names, p->color.levels));
         mp_snprintf_cat(b, bs, " CL=%s",
                         m_opt_choice_str(mp_chroma_names, p->chroma_location));
         if (p->rotate)
@@ -564,11 +566,7 @@ bool mp_image_params_equal(const struct mp_image_params *p1,
            p1->hw_subfmt == p2->hw_subfmt &&
            p1->w == p2->w && p1->h == p2->h &&
            p1->p_w == p2->p_w && p1->p_h == p2->p_h &&
-           p1->colorspace == p2->colorspace &&
-           p1->colorlevels == p2->colorlevels &&
-           p1->primaries == p2->primaries &&
-           p1->gamma == p2->gamma &&
-           p1->peak == p2->peak &&
+           mp_colorspace_equal(p1->color, p2->color) &&
            p1->chroma_location == p2->chroma_location &&
            p1->rotate == p2->rotate &&
            p1->stereo_in == p2->stereo_in &&
@@ -598,51 +596,56 @@ void mp_image_params_guess_csp(struct mp_image_params *params)
     if (!fmt.id)
         return;
     if (fmt.flags & MP_IMGFLAG_YUV) {
-        if (params->colorspace != MP_CSP_BT_601 &&
-            params->colorspace != MP_CSP_BT_709 &&
-            params->colorspace != MP_CSP_BT_2020_NC &&
-            params->colorspace != MP_CSP_BT_2020_C &&
-            params->colorspace != MP_CSP_SMPTE_240M &&
-            params->colorspace != MP_CSP_YCGCO)
+        if (params->color.space != MP_CSP_BT_601 &&
+            params->color.space != MP_CSP_BT_709 &&
+            params->color.space != MP_CSP_BT_2020_NC &&
+            params->color.space != MP_CSP_BT_2020_C &&
+            params->color.space != MP_CSP_SMPTE_240M &&
+            params->color.space != MP_CSP_YCGCO)
         {
             // Makes no sense, so guess instead
             // YCGCO should be separate, but libavcodec disagrees
-            params->colorspace = MP_CSP_AUTO;
+            params->color.space = MP_CSP_AUTO;
         }
-        if (params->colorspace == MP_CSP_AUTO)
-            params->colorspace = mp_csp_guess_colorspace(params->w, params->h);
-        if (params->colorlevels == MP_CSP_LEVELS_AUTO)
-            params->colorlevels = MP_CSP_LEVELS_TV;
-        if (params->primaries == MP_CSP_PRIM_AUTO) {
+        if (params->color.space == MP_CSP_AUTO)
+            params->color.space = mp_csp_guess_colorspace(params->w, params->h);
+        if (params->color.levels == MP_CSP_LEVELS_AUTO) {
+            if (params->color.gamma == MP_CSP_TRC_V_LOG) {
+                params->color.levels = MP_CSP_LEVELS_PC;
+            } else {
+                params->color.levels = MP_CSP_LEVELS_TV;
+            }
+        }
+        if (params->color.primaries == MP_CSP_PRIM_AUTO) {
             // Guess based on the colormatrix as a first priority
-            if (params->colorspace == MP_CSP_BT_2020_NC ||
-                params->colorspace == MP_CSP_BT_2020_C) {
-                params->primaries = MP_CSP_PRIM_BT_2020;
-            } else if (params->colorspace == MP_CSP_BT_709) {
-                params->primaries = MP_CSP_PRIM_BT_709;
+            if (params->color.space == MP_CSP_BT_2020_NC ||
+                params->color.space == MP_CSP_BT_2020_C) {
+                params->color.primaries = MP_CSP_PRIM_BT_2020;
+            } else if (params->color.space == MP_CSP_BT_709) {
+                params->color.primaries = MP_CSP_PRIM_BT_709;
             } else {
                 // Ambiguous colormatrix for BT.601, guess based on res
-                params->primaries = mp_csp_guess_primaries(params->w, params->h);
+                params->color.primaries = mp_csp_guess_primaries(params->w, params->h);
             }
         }
-        if (params->gamma == MP_CSP_TRC_AUTO)
-            params->gamma = MP_CSP_TRC_BT_1886;
+        if (params->color.gamma == MP_CSP_TRC_AUTO)
+            params->color.gamma = MP_CSP_TRC_BT_1886;
     } else if (fmt.flags & MP_IMGFLAG_RGB) {
-        params->colorspace = MP_CSP_RGB;
-        params->colorlevels = MP_CSP_LEVELS_PC;
+        params->color.space = MP_CSP_RGB;
+        params->color.levels = MP_CSP_LEVELS_PC;
 
         // The majority of RGB content is either sRGB or (rarely) some other
         // color space which we don't even handle, like AdobeRGB or
         // ProPhotoRGB. The only reasonable thing we can do is assume it's
         // sRGB and hope for the best, which should usually just work out fine.
         // Note: sRGB primaries = BT.709 primaries
-        if (params->primaries == MP_CSP_PRIM_AUTO)
-            params->primaries = MP_CSP_PRIM_BT_709;
-        if (params->gamma == MP_CSP_TRC_AUTO)
-            params->gamma = MP_CSP_TRC_SRGB;
+        if (params->color.primaries == MP_CSP_PRIM_AUTO)
+            params->color.primaries = MP_CSP_PRIM_BT_709;
+        if (params->color.gamma == MP_CSP_TRC_AUTO)
+            params->color.gamma = MP_CSP_TRC_SRGB;
     } else if (fmt.flags & MP_IMGFLAG_XYZ) {
-        params->colorspace = MP_CSP_XYZ;
-        params->colorlevels = MP_CSP_LEVELS_PC;
+        params->color.space = MP_CSP_XYZ;
+        params->color.levels = MP_CSP_LEVELS_PC;
 
         // The default XYZ matrix converts it to BT.709 color space
         // since that's the most likely scenario. Proper VOs should ignore
@@ -652,22 +655,22 @@ void mp_image_params_guess_csp(struct mp_image_params *params)
         // gamut for VOs which *do* use the specialized XYZ matrix but don't
         // know any better output gamut other than whatever the source is
         // tagged with.
-        if (params->primaries == MP_CSP_PRIM_AUTO)
-            params->primaries = MP_CSP_PRIM_BT_709;
-        if (params->gamma == MP_CSP_TRC_AUTO)
-            params->gamma = MP_CSP_TRC_LINEAR;
+        if (params->color.primaries == MP_CSP_PRIM_AUTO)
+            params->color.primaries = MP_CSP_PRIM_BT_709;
+        if (params->color.gamma == MP_CSP_TRC_AUTO)
+            params->color.gamma = MP_CSP_TRC_LINEAR;
     } else {
         // We have no clue.
-        params->colorspace = MP_CSP_AUTO;
-        params->colorlevels = MP_CSP_LEVELS_AUTO;
-        params->primaries = MP_CSP_PRIM_AUTO;
-        params->gamma = MP_CSP_TRC_AUTO;
+        params->color.space = MP_CSP_AUTO;
+        params->color.levels = MP_CSP_LEVELS_AUTO;
+        params->color.primaries = MP_CSP_PRIM_AUTO;
+        params->color.gamma = MP_CSP_TRC_AUTO;
     }
 
-    // Guess the reference peak (independent of the colorspace)
-    if (params->gamma == MP_CSP_TRC_SMPTE_ST2084) {
-        if (!params->peak)
-            params->peak = 10000; // As per the spec
+    // Guess the nominal peak (independent of the colorspace)
+    if (params->color.gamma == MP_CSP_TRC_SMPTE_ST2084) {
+        if (!params->color.nom_peak)
+            params->color.nom_peak = 10000; // As per the spec
     }
 }
 
@@ -724,8 +727,8 @@ static void mp_image_copy_fields_to_av_frame(struct AVFrame *dst,
     if (src->fields & MP_IMGFIELD_REPEAT_FIRST)
         dst->repeat_pict = 1;
 
-    dst->colorspace = mp_csp_to_avcol_spc(src->params.colorspace);
-    dst->color_range = mp_csp_levels_to_avcol_range(src->params.colorlevels);
+    dst->colorspace = mp_csp_to_avcol_spc(src->params.color.space);
+    dst->color_range = mp_csp_levels_to_avcol_range(src->params.color.levels);
 }
 
 // Create a new mp_image reference to av_frame.
diff --git a/video/mp_image.h b/video/mp_image.h
index 18d2596..dfbe4ee 100644
--- a/video/mp_image.h
+++ b/video/mp_image.h
@@ -43,11 +43,7 @@ struct mp_image_params {
                                 // (will use the HW API's format identifiers)
     int w, h;                   // image dimensions
     int p_w, p_h;               // define pixel aspect ratio (undefined: 0/0)
-    enum mp_csp colorspace;
-    enum mp_csp_levels colorlevels;
-    enum mp_csp_prim primaries;
-    enum mp_csp_trc gamma;
-    float peak; // 0 = auto/unknown
+    struct mp_colorspace color;
     enum mp_chroma_location chroma_location;
     // The image should be rotated clockwise (0-359 degrees).
     int rotate;
diff --git a/video/out/bitmap_packer.c b/video/out/bitmap_packer.c
index 3f75a72..5169357 100644
--- a/video/out/bitmap_packer.c
+++ b/video/out/bitmap_packer.c
@@ -29,8 +29,6 @@
 #include "mpv_talloc.h"
 #include "bitmap_packer.h"
 #include "common/common.h"
-#include "sub/dec_sub.h"
-#include "video/mp_image.h"
 
 #define IS_POWER_OF_2(x) (((x) > 0) && !(((x) - 1) & (x)))
 
@@ -199,34 +197,3 @@ void packer_set_size(struct bitmap_packer *packer, int size)
     packer->scratch = talloc_array_ptrtype(packer, packer->scratch,
                                            packer->asize + 16);
 }
-
-int packer_pack_from_subbitmaps(struct bitmap_packer *packer,
-                                struct sub_bitmaps *b)
-{
-    packer->count = 0;
-    if (b->format == SUBBITMAP_EMPTY)
-        return 0;
-    packer_set_size(packer, b->num_parts);
-    for (int i = 0; i < b->num_parts; i++)
-        packer->in[i] = (struct pos){b->parts[i].w, b->parts[i].h};
-    return packer_pack(packer);
-}
-
-void packer_copy_subbitmaps(struct bitmap_packer *packer, struct sub_bitmaps *b,
-                            void *data, int pixel_stride, int stride)
-{
-    assert(packer->count == b->num_parts);
-    if (packer->padding) {
-        struct pos bb[2];
-        packer_get_bb(packer, bb);
-        memset_pic(data, 0, bb[1].x * pixel_stride, bb[1].y, stride);
-    }
-    for (int n = 0; n < packer->count; n++) {
-        struct sub_bitmap *s = &b->parts[n];
-        struct pos p = packer->result[n];
-
-        void *pdata = (uint8_t *)data + p.y * stride + p.x * pixel_stride;
-        memcpy_pic(pdata, s->bitmap, s->w * pixel_stride, s->h,
-                   stride, s->stride);
-    }
-}
diff --git a/video/out/bitmap_packer.h b/video/out/bitmap_packer.h
index 8fd2fce..97bf88f 100644
--- a/video/out/bitmap_packer.h
+++ b/video/out/bitmap_packer.h
@@ -48,20 +48,4 @@ void packer_set_size(struct bitmap_packer *packer, int size);
  */
 int packer_pack(struct bitmap_packer *packer);
 
-/* Like above, but packer->count will be automatically set and
- * packer->in will be reallocated if needed and filled from the
- * given image list.
- */
-int packer_pack_from_subbitmaps(struct bitmap_packer *packer,
-                                struct sub_bitmaps *b);
-
-// Copy the (already packed) sub-bitmaps from b to the image in data.
-// data must point to an image that is at least (packer->w, packer->h) big.
-// The image has the given stride (bytes between (x, y) to (x, y + 1)), and the
-// pixel format used by both the sub-bitmaps and the image uses pixel_stride
-// bytes per pixel (bytes between (x, y) to (x + 1, y)).
-// If packer->padding is set, the padding borders are cleared with 0.
-void packer_copy_subbitmaps(struct bitmap_packer *packer, struct sub_bitmaps *b,
-                            void *data, int pixel_stride, int stride);
-
 #endif
diff --git a/video/out/opengl/angle_common.c b/video/out/opengl/angle_common.c
deleted file mode 100644
index 21cc924..0000000
--- a/video/out/opengl/angle_common.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "angle_common.h"
-
-// Test if Direct3D11 can be used by us. Basically, this prevents trying to use
-// D3D11 on Win7, and then failing somewhere in the process.
-bool d3d11_check_decoding(ID3D11Device *dev)
-{
-    HRESULT hr;
-    // We assume that NV12 is always supported, if hw decoding is supported at
-    // all.
-    UINT supported = 0;
-    hr = ID3D11Device_CheckFormatSupport(dev, DXGI_FORMAT_NV12, &supported);
-    return !FAILED(hr) && (supported & D3D11_BIND_DECODER);
-}
diff --git a/video/out/opengl/angle_common.h b/video/out/opengl/angle_common.h
deleted file mode 100644
index 14ecd6a..0000000
--- a/video/out/opengl/angle_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef MP_ANGLE_COMMON_H
-#define MP_ANGLE_COMMON_H
-
-#include <initguid.h>
-#include <assert.h>
-#include <windows.h>
-#include <d3d11.h>
-
-#include <stdbool.h>
-
-bool d3d11_check_decoding(ID3D11Device *dev);
-
-#endif
\ No newline at end of file
diff --git a/video/out/opengl/context_angle.c b/video/out/opengl/context_angle.c
index cc14fc3..28515f4 100644
--- a/video/out/opengl/context_angle.c
+++ b/video/out/opengl/context_angle.c
@@ -38,6 +38,7 @@ struct priv {
     EGLContext egl_context;
     EGLSurface egl_surface;
     bool use_es2;
+    PFNEGLPOSTSUBBUFFERNVPROC eglPostSubBufferNV;
 };
 
 static void angle_uninit(MPGLContext *ctx)
@@ -288,6 +289,11 @@ static int angle_init(struct MPGLContext *ctx, int flags)
     // Configure the underlying Direct3D device
     d3d_init(ctx);
 
+    if (strstr(exts, "EGL_NV_post_sub_buffer")) {
+        p->eglPostSubBufferNV =
+            (PFNEGLPOSTSUBBUFFERNVPROC)eglGetProcAddress("eglPostSubBufferNV");
+    }
+
     mpgl_load_functions(ctx->gl, get_proc_address, NULL, vo->log);
     return 0;
 
@@ -315,7 +321,16 @@ static int angle_reconfig(struct MPGLContext *ctx)
 
 static int angle_control(MPGLContext *ctx, int *events, int request, void *arg)
 {
-    return vo_w32_control(ctx->vo, events, request, arg);
+    struct priv *p = ctx->priv;
+    int r = vo_w32_control(ctx->vo, events, request, arg);
+
+    // Calling eglPostSubBufferNV with a 0-sized region doesn't present a frame
+    // or block, but it does update the swapchain to match the window size
+    // See: https://groups.google.com/d/msg/angleproject/RvyVkjRCQGU/gfKfT64IAgAJ
+    if ((*events & VO_EVENT_RESIZE) && p->eglPostSubBufferNV)
+        p->eglPostSubBufferNV(p->egl_display, p->egl_surface, 0, 0, 0, 0);
+
+    return r;
 }
 
 static void angle_swap_buffers(MPGLContext *ctx)
diff --git a/video/out/opengl/hwdec_d3d11egl.c b/video/out/opengl/hwdec_d3d11egl.c
index 549d3f5..07333c3 100644
--- a/video/out/opengl/hwdec_d3d11egl.c
+++ b/video/out/opengl/hwdec_d3d11egl.c
@@ -23,7 +23,6 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-#include "angle_common.h"
 #include "angle_dynamic.h"
 
 #include "common/common.h"
@@ -31,6 +30,7 @@
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
+#include "video/decode/d3d.h"
 
 #ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE
 #define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB
@@ -195,6 +195,7 @@ static int create(struct gl_hwdec *hw)
         .type = HWDEC_D3D11VA,
         .driver_name = hw->driver->name,
         .ctx = p->d3d11_device,
+        .download_image = d3d11_download_image,
     };
     hwdec_devices_add(hw->devs, &p->hwctx);
 
diff --git a/video/out/opengl/hwdec_d3d11eglrgb.c b/video/out/opengl/hwdec_d3d11eglrgb.c
index 2e61189..be8057c 100644
--- a/video/out/opengl/hwdec_d3d11eglrgb.c
+++ b/video/out/opengl/hwdec_d3d11eglrgb.c
@@ -23,7 +23,6 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-#include "angle_common.h"
 #include "angle_dynamic.h"
 
 #include "common/common.h"
@@ -31,6 +30,7 @@
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
+#include "video/decode/d3d.h"
 
 #ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE
 #define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB
@@ -87,6 +87,8 @@ static int create(struct gl_hwdec *hw)
     if (!angle_load())
         return -1;
 
+    d3d_load_dlls();
+
     EGLDisplay egl_display = eglGetCurrentDisplay();
     if (!egl_display)
         return -1;
@@ -104,7 +106,6 @@ static int create(struct gl_hwdec *hw)
 
     p->egl_display = egl_display;
 
-    HANDLE d3d11_dll = GetModuleHandleW(L"d3d11.dll");
     if (!d3d11_dll) {
         if (!hw->probing)
             MP_ERR(hw, "Failed to load D3D11 library\n");
diff --git a/video/out/opengl/hwdec_dxva2egl.c b/video/out/opengl/hwdec_dxva2egl.c
index d67a85b..f206b96 100644
--- a/video/out/opengl/hwdec_dxva2egl.c
+++ b/video/out/opengl/hwdec_dxva2egl.c
@@ -29,11 +29,11 @@
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
+#include "video/decode/d3d.h"
 
 struct priv {
     struct mp_hwdec_ctx hwctx;
 
-    HMODULE             d3d9_dll;
     IDirect3D9Ex       *d3d9ex;
     IDirect3DDevice9Ex *device9ex;
     IDirect3DQuery9    *query9;
@@ -89,9 +89,6 @@ static void destroy(struct gl_hwdec *hw)
 
     if (p->d3d9ex)
         IDirect3D9Ex_Release(p->d3d9ex);
-
-    if (p->d3d9_dll)
-        FreeLibrary(p->d3d9_dll);
 }
 
 static int create(struct gl_hwdec *hw)
@@ -99,6 +96,8 @@ static int create(struct gl_hwdec *hw)
     if (!angle_load())
         return -1;
 
+    d3d_load_dlls();
+
     EGLDisplay egl_display = eglGetCurrentDisplay();
     if (!egl_display)
         return -1;
@@ -118,15 +117,14 @@ static int create(struct gl_hwdec *hw)
 
     p->egl_display = egl_display;
 
-    p->d3d9_dll = LoadLibraryW(L"d3d9.dll");
-    if (!p->d3d9_dll) {
+    if (!d3d9_dll) {
         MP_FATAL(hw, "Failed to load \"d3d9.dll\": %s\n",
                  mp_LastError_to_str());
         goto fail;
     }
 
     HRESULT (WINAPI *Direct3DCreate9Ex)(UINT SDKVersion, IDirect3D9Ex **ppD3D);
-    Direct3DCreate9Ex = (void *)GetProcAddress(p->d3d9_dll, "Direct3DCreate9Ex");
+    Direct3DCreate9Ex = (void *)GetProcAddress(d3d9_dll, "Direct3DCreate9Ex");
     if (!Direct3DCreate9Ex) {
         MP_FATAL(hw, "Direct3D 9Ex not supported\n");
         goto fail;
diff --git a/video/out/opengl/hwdec_vaglx.c b/video/out/opengl/hwdec_vaglx.c
index 2e3017c..0400604 100644
--- a/video/out/opengl/hwdec_vaglx.c
+++ b/video/out/opengl/hwdec_vaglx.c
@@ -185,7 +185,7 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
                           0, 0, hw_image->w, hw_image->h,
                           0, 0, hw_image->w, hw_image->h,
                           NULL, 0,
-                          va_get_colorspace_flag(hw_image->params.colorspace));
+                          va_get_colorspace_flag(hw_image->params.color.space));
     CHECK_VA_STATUS(p, "vaPutSurface()");
     va_unlock(p->ctx);
 
diff --git a/video/out/opengl/osd.c b/video/out/opengl/osd.c
index 7b1ec16..5df5bb1 100644
--- a/video/out/opengl/osd.c
+++ b/video/out/opengl/osd.c
@@ -21,8 +21,6 @@
 
 #include <libavutil/common.h>
 
-#include "video/out/bitmap_packer.h"
-
 #include "formats.h"
 #include "utils.h"
 #include "osd.h"
@@ -53,20 +51,17 @@ struct mpgl_osd_part {
     int change_id;
     GLuint texture;
     int w, h;
-    GLuint buffer;
+    struct gl_pbo_upload pbo;
     int num_subparts;
     int prev_num_subparts;
     struct sub_bitmap *subparts;
     struct vertex *vertices;
-    struct bitmap_packer *packer;
-    void *upload;
 };
 
 struct mpgl_osd {
     struct mp_log *log;
     struct osd_state *osd;
     GL *gl;
-    GLint max_tex_wh;
     bool use_pbo;
     struct mpgl_osd_part *parts[MAX_OSD_PARTS];
     const struct gl_format *fmt_table[SUBBITMAP_COUNT];
@@ -89,21 +84,11 @@ struct mpgl_osd *mpgl_osd_init(GL *gl, struct mp_log *log, struct osd_state *osd
         .scratch = talloc_zero_size(ctx, 1),
     };
 
-    gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &ctx->max_tex_wh);
-
     ctx->fmt_table[SUBBITMAP_LIBASS] = gl_find_unorm_format(gl, 1, 1);
     ctx->fmt_table[SUBBITMAP_RGBA]   = gl_find_unorm_format(gl, 1, 4);
 
-    for (int n = 0; n < MAX_OSD_PARTS; n++) {
-        struct mpgl_osd_part *p = talloc_ptrtype(ctx, p);
-        *p = (struct mpgl_osd_part) {
-            .packer = talloc_struct(p, struct bitmap_packer, {
-                .w_max = ctx->max_tex_wh,
-                .h_max = ctx->max_tex_wh,
-            }),
-        };
-        ctx->parts[n] = p;
-    }
+    for (int n = 0; n < MAX_OSD_PARTS; n++)
+        ctx->parts[n] = talloc_zero(ctx, struct mpgl_osd_part);
 
     for (int n = 0; n < SUBBITMAP_COUNT; n++)
         ctx->formats[n] = !!ctx->fmt_table[n];
@@ -125,9 +110,7 @@ void mpgl_osd_destroy(struct mpgl_osd *ctx)
     for (int n = 0; n < MAX_OSD_PARTS; n++) {
         struct mpgl_osd_part *p = ctx->parts[n];
         gl->DeleteTextures(1, &p->texture);
-        if (gl->DeleteBuffers)
-            gl->DeleteBuffers(1, &p->buffer);
-        talloc_free(p->upload);
+        gl_pbo_upload_uninit(&p->pbo);
     }
     talloc_free(ctx);
 }
@@ -137,87 +120,6 @@ void mpgl_osd_set_options(struct mpgl_osd *ctx, bool pbo)
     ctx->use_pbo = pbo;
 }
 
-static bool upload(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
-                   struct sub_bitmaps *imgs, bool pbo)
-{
-    GL *gl = ctx->gl;
-    bool success = true;
-    const struct gl_format *fmt = ctx->fmt_table[imgs->format];
-    size_t pix_stride = gl_bytes_per_pixel(fmt->format, fmt->type);
-    size_t buffer_size = pix_stride * osd->h * osd->w;
-
-    char *data = NULL;
-    void *texdata = NULL;
-
-    if (pbo) {
-        if (!osd->buffer) {
-            gl->GenBuffers(1, &osd->buffer);
-            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
-            gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size, NULL,
-                           GL_DYNAMIC_COPY);
-        }
-
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
-        data = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, buffer_size,
-                                  GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
-        if (!data) {
-            success = false;
-            goto done;
-        }
-    } else {
-        if (!imgs->packed) {
-            if (!osd->upload)
-                osd->upload = talloc_size(NULL, buffer_size);
-            data = osd->upload;
-            texdata = data;
-        }
-    }
-
-    int copy_w = 0;
-    int copy_h = 0;
-    size_t stride = 0;
-    if (imgs->packed) {
-        copy_w = imgs->packed_w;
-        copy_h = imgs->packed_h;
-        stride = imgs->packed->stride[0];
-        texdata = imgs->packed->planes[0];
-        if (pbo) {
-            memcpy_pic(data, texdata, pix_stride * copy_w,  copy_h,
-                       osd->w * pix_stride, stride);
-            stride = osd->w * pix_stride;
-            texdata = NULL;
-        }
-    } else {
-        struct pos bb[2];
-        packer_get_bb(osd->packer, bb);
-        copy_w = bb[1].x;
-        copy_h = bb[1].y;
-        stride = osd->w * pix_stride;
-        packer_copy_subbitmaps(osd->packer, imgs, data, pix_stride, stride);
-    }
-
-    if (pbo) {
-        if (!gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER)) {
-            success = false;
-            goto done;
-        }
-    }
-
-    gl_upload_tex(gl, GL_TEXTURE_2D, fmt->format, fmt->type, texdata, stride,
-                  0, 0, copy_w, copy_h);
-
-    if (pbo)
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-done:
-    if (!success) {
-        MP_FATAL(ctx, "Error: can't upload subtitles! "
-                 "Remove the 'pbo' suboption.\n");
-    }
-
-    return success;
-}
-
 static int next_pow2(int v)
 {
     for (int x = 0; x < 30; x++) {
@@ -231,31 +133,12 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
                        struct sub_bitmaps *imgs)
 {
     GL *gl = ctx->gl;
+    bool ok = false;
 
-    int req_w = 0;
-    int req_h = 0;
-
-    if (imgs->packed) {
-        req_w = next_pow2(imgs->packed_w);
-        req_h = next_pow2(imgs->packed_h);
-    } else {
-        // assume 2x2 filter on scaling
-        osd->packer->padding = imgs->scaled;
-        int r = packer_pack_from_subbitmaps(osd->packer, imgs);
-        if (r < 0) {
-            MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
-                "supported size %dx%d.\n", osd->packer->w_max, osd->packer->h_max);
-            return false;
-        }
-        req_w = osd->packer->w;
-        req_h = osd->packer->h;
-    }
+    assert(imgs->packed);
 
-    if (req_w > ctx->max_tex_wh || req_h > ctx->max_tex_wh) {
-        MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
-                "supported size %dx%d.\n", ctx->max_tex_wh, ctx->max_tex_wh);
-        return false;
-    }
+    int req_w = next_pow2(imgs->packed_w);
+    int req_h = next_pow2(imgs->packed_h);
 
     const struct gl_format *fmt = ctx->fmt_table[imgs->format];
     assert(fmt);
@@ -270,6 +153,17 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
         osd->w = FFMAX(32, req_w);
         osd->h = FFMAX(32, req_h);
 
+        MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h);
+
+        GLint max_wh;
+        gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &max_wh);
+
+        if (osd->w > max_wh || osd->h > max_wh) {
+            MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
+                   "supported size %dx%d.\n", max_wh, max_wh);
+            goto done;
+        }
+
         gl->TexImage2D(GL_TEXTURE_2D, 0, fmt->internal_format, osd->w, osd->h,
                        0, fmt->format, fmt->type, NULL);
 
@@ -277,24 +171,17 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-
-        if (gl->DeleteBuffers)
-            gl->DeleteBuffers(1, &osd->buffer);
-        osd->buffer = 0;
-
-        talloc_free(osd->upload);
-        osd->upload = NULL;
     }
 
-    bool uploaded = false;
-    if (ctx->use_pbo)
-        uploaded = upload(ctx, osd, imgs, true);
-    if (!uploaded)
-        upload(ctx, osd, imgs, false);
+    gl_pbo_upload_tex(&osd->pbo, gl, ctx->use_pbo, GL_TEXTURE_2D, fmt->format,
+                      fmt->type, osd->w, osd->h, imgs->packed->planes[0],
+                      imgs->packed->stride[0], 0, 0,
+                      imgs->packed_w, imgs->packed_h);
+    ok = true;
 
+done:
     gl->BindTexture(GL_TEXTURE_2D, 0);
-
-    return true;
+    return ok;
 }
 
 static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
@@ -319,13 +206,6 @@ static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
     MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts);
     memcpy(osd->subparts, imgs->parts,
            osd->num_subparts * sizeof(osd->subparts[0]));
-
-    if (!imgs->packed) {
-        for (int n = 0; n < osd->num_subparts; n++) {
-            osd->subparts[n].src_x = osd->packer->result[n].x;
-            osd->subparts[n].src_y = osd->packer->result[n].y;
-        }
-    }
 }
 
 static void write_quad(struct vertex *va, struct gl_transform t,
diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c
index 8f915a5..112012f 100644
--- a/video/out/opengl/user_shaders.c
+++ b/video/out/opengl/user_shaders.c
@@ -16,6 +16,7 @@
  */
 
 #include <ctype.h>
+#include <assert.h>
 
 #include "user_shaders.h"
 
@@ -69,6 +70,94 @@ static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE])
     return true;
 }
 
+// Returns whether successful. 'result' is left untouched on failure
+bool eval_szexpr(struct mp_log *log, void *priv,
+                 bool (*lookup)(void *priv, struct bstr var, float size[2]),
+                 struct szexp expr[MAX_SZEXP_SIZE], float *result)
+{
+    float stack[MAX_SZEXP_SIZE] = {0};
+    int idx = 0; // points to next element to push
+
+    for (int i = 0; i < MAX_SZEXP_SIZE; i++) {
+        switch (expr[i].tag) {
+        case SZEXP_END:
+            goto done;
+
+        case SZEXP_CONST:
+            // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be
+            // impossible to overflow the stack
+            assert(idx < MAX_SZEXP_SIZE);
+            stack[idx++] = expr[i].val.cval;
+            continue;
+
+        case SZEXP_OP1:
+            if (idx < 1) {
+                mp_warn(log, "Stack underflow in RPN expression!\n");
+                return false;
+            }
+
+            switch (expr[i].val.op) {
+            case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
+            default: abort();
+            }
+            continue;
+
+        case SZEXP_OP2:
+            if (idx < 2) {
+                mp_warn(log, "Stack underflow in RPN expression!\n");
+                return false;
+            }
+
+            // Pop the operands in reverse order
+            float op2 = stack[--idx];
+            float op1 = stack[--idx];
+            float res = 0.0;
+            switch (expr[i].val.op) {
+            case SZEXP_OP_ADD: res = op1 + op2; break;
+            case SZEXP_OP_SUB: res = op1 - op2; break;
+            case SZEXP_OP_MUL: res = op1 * op2; break;
+            case SZEXP_OP_DIV: res = op1 / op2; break;
+            case SZEXP_OP_GT:  res = op1 > op2; break;
+            case SZEXP_OP_LT:  res = op1 < op2; break;
+            default: abort();
+            }
+
+            if (!isfinite(res)) {
+                mp_warn(log, "Illegal operation in RPN expression!\n");
+                return false;
+            }
+
+            stack[idx++] = res;
+            continue;
+
+        case SZEXP_VAR_W:
+        case SZEXP_VAR_H: {
+            struct bstr name = expr[i].val.varname;
+            float size[2];
+
+            if (!lookup(priv, name, size)) {
+                mp_warn(log, "Variable %.*s not found in RPN expression!\n",
+                        BSTR_P(name));
+                return false;
+            }
+
+            stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? size[0] : size[1];
+            continue;
+            }
+        }
+    }
+
+done:
+    // Return the single stack element
+    if (idx != 1) {
+        mp_warn(log, "Malformed stack after RPN expression!\n");
+        return false;
+    }
+
+    *result = stack[0];
+    return true;
+}
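The stack-machine loop above can be illustrated with a minimal standalone sketch. The token and operator names here are hypothetical, not mpv's; it only demonstrates the same push/pop discipline and the underflow and single-result checks that eval_szexpr() performs:

```c
#include <assert.h>
#include <stdbool.h>

/* Minimal sketch of the RPN stack machine used by eval_szexpr():
 * constants are pushed; binary ops pop two operands and push the result.
 * Token names are hypothetical, not mpv's. */
enum tok_tag { TOK_END, TOK_CONST, TOK_ADD, TOK_MUL };

struct tok { enum tok_tag tag; float val; };

#define MAX_TOKS 16

static bool rpn_eval(const struct tok toks[MAX_TOKS], float *result)
{
    float stack[MAX_TOKS];
    int idx = 0; // points to next element to push

    for (int i = 0; i < MAX_TOKS; i++) {
        switch (toks[i].tag) {
        case TOK_END:
            goto done;
        case TOK_CONST:
            stack[idx++] = toks[i].val;
            break;
        case TOK_ADD:
        case TOK_MUL: {
            if (idx < 2)
                return false; // stack underflow
            float b = stack[--idx], a = stack[--idx];
            stack[idx++] = toks[i].tag == TOK_ADD ? a + b : a * b;
            break;
        }
        }
    }
done:
    // A well-formed expression leaves exactly one value on the stack
    if (idx != 1)
        return false;
    *result = stack[0];
    return true;
}
```

Because the array is bounded by MAX_TOKS and every push follows either a constant or two pops, the stack cannot overflow, which is the same invariant the assert in eval_szexpr() relies on.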
+
 // Returns false if no more shaders could be parsed
 bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
                             struct gl_user_shader *out)
diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h
index b8c287b..7527eb3 100644
--- a/video/out/opengl/user_shaders.h
+++ b/video/out/opengl/user_shaders.h
@@ -71,4 +71,9 @@ struct gl_user_shader {
 bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
                             struct gl_user_shader *out);
 
+// Evaluate a szexp, given a lookup function for named textures
+bool eval_szexpr(struct mp_log *log, void *priv,
+                 bool (*lookup)(void *priv, struct bstr var, float size[2]),
+                 struct szexp expr[MAX_SZEXP_SIZE], float *result);
+
 #endif
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 73b411e..72a748a 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -109,8 +109,10 @@ mp_image_t *gl_read_window_contents(GL *gl)
     mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, vp[2], vp[3]);
     if (!image)
         return NULL;
+    gl->BindFramebuffer(GL_FRAMEBUFFER, gl->main_fb);
+    GLenum obj = gl->main_fb ? GL_COLOR_ATTACHMENT0 : GL_FRONT;
     gl->PixelStorei(GL_PACK_ALIGNMENT, 1);
-    gl->ReadBuffer(GL_FRONT);
+    gl->ReadBuffer(obj);
     //flip image while reading (and also avoid stride-related trouble)
     for (int y = 0; y < vp[3]; y++) {
         gl->ReadPixels(vp[0], vp[1] + vp[3] - y - 1, vp[2], 1,
@@ -118,6 +120,7 @@ mp_image_t *gl_read_window_contents(GL *gl)
                        image->planes[0] + y * image->stride[0]);
     }
     gl->PixelStorei(GL_PACK_ALIGNMENT, 4);
+    gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
     return image;
 }
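The read loop above flips the image vertically while copying one row at a time, which also avoids stride-related trouble. A small sketch of that row-reversal pattern (function name hypothetical, not part of mpv):

```c
#include <string.h>

/* Sketch of the row-by-row vertical flip done in gl_read_window_contents():
 * source row y lands in destination row (h - 1 - y). Copying per row also
 * tolerates differing source and destination strides. */
static void flip_rows(unsigned char *dst, int dst_stride,
                      const unsigned char *src, int src_stride,
                      int row_bytes, int h)
{
    for (int y = 0; y < h; y++)
        memcpy(dst + (h - 1 - y) * dst_stride,
               src + y * src_stride, row_bytes);
}
```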
 
@@ -1121,3 +1124,73 @@ void gl_timer_stop(struct gl_timer *timer)
     if (gl->EndQuery)
         gl->EndQuery(GL_TIME_ELAPSED);
 }
+
+// Upload a texture, going through a PBO. PBO supposedly can facilitate
+// asynchronous copy from CPU to GPU, so this is an optimization. Note that
+// changing format/type/tex_w/tex_h or reusing the PBO in the same frame can
+// ruin performance.
+// This call is like gl_upload_tex(), plus PBO management/use.
+// target, format, type, dataptr, stride, x, y, w, h: texture upload params
+//                                                    (see gl_upload_tex())
+// tex_w, tex_h: maximum size of the used texture
+// use_pbo: for convenience, if false redirects the call to gl_upload_tex
+void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo,
+                       GLenum target, GLenum format, GLenum type,
+                       int tex_w, int tex_h, const void *dataptr, int stride,
+                       int x, int y, int w, int h)
+{
+    assert(x >= 0 && y >= 0 && w >= 0 && h >= 0);
+    assert(x + w <= tex_w && y + h <= tex_h);
+
+    if (!use_pbo || !gl->MapBufferRange)
+        goto no_pbo;
+
+    size_t pix_stride = gl_bytes_per_pixel(format, type);
+    size_t buffer_size = pix_stride * tex_w * tex_h;
+    size_t needed_size = pix_stride * w * h;
+
+    if (buffer_size != pbo->buffer_size)
+        gl_pbo_upload_uninit(pbo);
+
+    if (!pbo->buffers[0]) {
+        pbo->gl = gl;
+        pbo->buffer_size = buffer_size;
+        gl->GenBuffers(2, &pbo->buffers[0]);
+        for (int n = 0; n < 2; n++) {
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffers[n]);
+            gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size, NULL,
+                           GL_DYNAMIC_COPY);
+        }
+    }
+
+    pbo->index = (pbo->index + 1) % 2;
+
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffers[pbo->index]);
+    void *data = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, needed_size,
+                                    GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
+    if (!data)
+        goto no_pbo;
+
+    memcpy_pic(data, dataptr, pix_stride * w, h, pix_stride * w, stride);
+
+    if (!gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER)) {
+        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+        goto no_pbo;
+    }
+
+    gl_upload_tex(gl, target, format, type, NULL, pix_stride * w, x, y, w, h);
+
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+    return;
+
+no_pbo:
+    gl_upload_tex(gl, target, format, type, dataptr, stride, x, y, w, h);
+}
+
+void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo)
+{
+    if (pbo->gl)
+        pbo->gl->DeleteBuffers(2, &pbo->buffers[0]);
+    *pbo = (struct gl_pbo_upload){0};
+}
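The two PBOs above are rotated in ping-pong fashion: while the driver may still be reading the previously submitted buffer asynchronously, the next upload maps and fills the other one. The alternation itself is just a modulo-2 counter, sketched here with a hypothetical struct name:

```c
/* Sketch of the double-buffer rotation in gl_pbo_upload_tex(): each call
 * advances to the other buffer, so CPU writes and GPU reads can overlap. */
struct ping_pong { int index; };

static int ping_pong_next(struct ping_pong *pp)
{
    pp->index = (pp->index + 1) % 2;
    return pp->index;
}
```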
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 9b4fd84..ec54d19 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -182,4 +182,17 @@ uint64_t gl_timer_last_us(struct gl_timer *timer);
 uint64_t gl_timer_avg_us(struct gl_timer *timer);
 uint64_t gl_timer_peak_us(struct gl_timer *timer);
 
+struct gl_pbo_upload {
+    GL *gl;
+    int index;
+    GLuint buffers[2];
+    size_t buffer_size;
+};
+
+void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo,
                       GLenum target, GLenum format, GLenum type,
+                       int tex_w, int tex_h, const void *dataptr, int stride,
+                       int x, int y, int w, int h);
+void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo);
+
 #endif
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index f46fdc1..468bee9 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -41,7 +41,6 @@
 #include "user_shaders.h"
 #include "video/out/filter_kernels.h"
 #include "video/out/aspect.h"
-#include "video/out/bitmap_packer.h"
 #include "video/out/dither.h"
 #include "video/out/vo.h"
 
@@ -97,13 +96,13 @@ struct texplane {
     GLenum gl_format;
     GLenum gl_type;
     GLuint gl_texture;
-    int gl_buffer;
     char swizzle[5];
+    bool flipped;
+    struct gl_pbo_upload pbo;
 };
 
 struct video_image {
     struct texplane planes[4];
-    bool image_flipped;
     struct mp_image *mpi;       // original input image
     bool hwdec_mapped;
 };
@@ -676,7 +675,7 @@ static int pass_bind(struct gl_video *p, struct img_tex tex)
 }
 
 // Rotation by 90° and flipping.
-static void get_plane_source_transform(struct gl_video *p, int w, int h,
+static void get_plane_source_transform(struct gl_video *p, struct texplane *t,
                                        struct gl_transform *out_tr)
 {
     struct gl_transform tr = identity_trans;
@@ -689,11 +688,11 @@ static void get_plane_source_transform(struct gl_video *p, int w, int h,
     // basically, recenter to keep the whole image in view
     float b[2] = {1, 1};
     gl_transform_vec(rot, &b[0], &b[1]);
-    tr.t[0] += b[0] < 0 ? w : 0;
-    tr.t[1] += b[1] < 0 ? h : 0;
+    tr.t[0] += b[0] < 0 ? t->w : 0;
+    tr.t[1] += b[1] < 0 ? t->h : 0;
 
-    if (p->image.image_flipped) {
-        struct gl_transform flip = {{{1, 0}, {0, -1}}, {0, h}};
+    if (t->flipped) {
+        struct gl_transform flip = {{{1, 0}, {0, -1}}, {0, t->h}};
         gl_transform_trans(flip, &tr);
     }
 
@@ -730,7 +729,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
 
     // The existing code assumes we just have a single tex multiplier for
     // all of the planes. This may change in the future
-    float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.colorspace,
+    float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.color.space,
                                          p->image_desc.component_bits,
                                          p->image_desc.component_full_bits);
 
@@ -764,7 +763,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
             .components = p->image_desc.components[n],
         };
         snprintf(tex[n].swizzle, sizeof(tex[n].swizzle), "%s", t->swizzle);
-        get_plane_source_transform(p, t->w, t->h, &tex[n].transform);
+        get_plane_source_transform(p, t, &tex[n].transform);
         if (p->image_params.rotate % 180 == 90)
             MPSWAP(int, tex[n].w, tex[n].h);
 
@@ -794,7 +793,7 @@ static void init_video(struct gl_video *p)
     mp_image_params_guess_csp(&p->image_params);
 
     int eq_caps = MP_CSP_EQ_CAPS_GAMMA;
-    if (p->image_params.colorspace != MP_CSP_BT_2020_C)
+    if (p->image_params.color.space != MP_CSP_BT_2020_C)
         eq_caps |= MP_CSP_EQ_CAPS_COLORMATRIX;
     if (p->image_desc.flags & MP_IMGFLAG_XYZ)
         eq_caps |= MP_CSP_EQ_CAPS_BRIGHTNESS;
@@ -879,7 +878,7 @@ static void uninit_video(struct gl_video *p)
         struct texplane *plane = &vimg->planes[n];
 
         gl->DeleteTextures(1, &plane->gl_texture);
-        gl->DeleteBuffers(1, &plane->gl_buffer);
+        gl_pbo_upload_uninit(&plane->pbo);
     }
     *vimg = (struct video_image){0};
 
@@ -1239,6 +1238,9 @@ static void load_shader(struct gl_video *p, struct bstr body)
     gl_sc_uniform_f(p->sc, "frame", p->frames_uploaded);
     gl_sc_uniform_vec2(p->sc, "image_size", (GLfloat[]){p->image_params.w,
                                                         p->image_params.h});
+    gl_sc_uniform_vec2(p->sc, "target_size",
+                       (GLfloat[]){p->dst_rect.x1 - p->dst_rect.x0,
+                                   p->dst_rect.y1 - p->dst_rect.y0});
 }
 
 static const char *get_custom_shader_fn(struct gl_video *p, const char *body)
@@ -1542,112 +1544,40 @@ static void user_hook_old(struct gl_video *p, struct img_tex tex,
     GLSLF("color = %s(HOOKED_raw, HOOKED_pos, HOOKED_size);\n", fn_name);
 }
 
-// Returns whether successful. 'result' is left untouched on failure
-static bool eval_szexpr(struct gl_video *p, struct img_tex tex,
-                        struct szexp expr[MAX_SZEXP_SIZE],
-                        float *result)
-{
-    float stack[MAX_SZEXP_SIZE] = {0};
-    int idx = 0; // points to next element to push
-
-    for (int i = 0; i < MAX_SZEXP_SIZE; i++) {
-        switch (expr[i].tag) {
-        case SZEXP_END:
-            goto done;
-
-        case SZEXP_CONST:
-            // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be
-            // impossible to overflow the stack
-            assert(idx < MAX_SZEXP_SIZE);
-            stack[idx++] = expr[i].val.cval;
-            continue;
-
-        case SZEXP_OP1:
-            if (idx < 1) {
-                MP_WARN(p, "Stack underflow in RPN expression!\n");
-                return false;
-            }
-
-            switch (expr[i].val.op) {
-            case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
-            default: abort();
-            }
-            continue;
-
-        case SZEXP_OP2:
-            if (idx < 2) {
-                MP_WARN(p, "Stack underflow in RPN expression!\n");
-                return false;
-            }
-
-            // Pop the operands in reverse order
-            float op2 = stack[--idx];
-            float op1 = stack[--idx];
-            float res = 0.0;
-            switch (expr[i].val.op) {
-            case SZEXP_OP_ADD: res = op1 + op2; break;
-            case SZEXP_OP_SUB: res = op1 - op2; break;
-            case SZEXP_OP_MUL: res = op1 * op2; break;
-            case SZEXP_OP_DIV: res = op1 / op2; break;
-            case SZEXP_OP_GT:  res = op1 > op2; break;
-            case SZEXP_OP_LT:  res = op1 < op2; break;
-            default: abort();
-            }
-
-            if (!isfinite(res)) {
-                MP_WARN(p, "Illegal operation in RPN expression!\n");
-                return false;
-            }
-
-            stack[idx++] = res;
-            continue;
-
-        case SZEXP_VAR_W:
-        case SZEXP_VAR_H: {
-            struct bstr name = expr[i].val.varname;
-            struct img_tex var_tex;
-
-            // The size of OUTPUT is determined. It could be useful for certain
-            // user shaders to skip passes.
-            if (bstr_equals0(name, "OUTPUT")) {
-                int vp_w = p->dst_rect.x1 - p->dst_rect.x0;
-                int vp_h = p->dst_rect.y1 - p->dst_rect.y0;
-                stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? vp_w : vp_h;
-                continue;
-            }
-
-            // HOOKED is a special case
-            if (bstr_equals0(name, "HOOKED")) {
-                var_tex = tex;
-                goto found_tex;
-            }
+struct szexp_ctx {
+    struct gl_video *p;
+    struct img_tex tex;
+};
 
-            for (int o = 0; o < p->saved_tex_num; o++) {
-                if (bstr_equals0(name, p->saved_tex[o].name)) {
-                    var_tex = p->saved_tex[o].tex;
-                    goto found_tex;
-                }
-            }
+static bool szexp_lookup(void *priv, struct bstr var, float size[2])
+{
+    struct szexp_ctx *ctx = priv;
+    struct gl_video *p = ctx->p;
 
-            MP_WARN(p, "Texture %.*s not found in RPN expression!\n", BSTR_P(name));
-            return false;
+    // The size of OUTPUT is always defined. It can be useful for certain
+    // user shaders that want to skip passes based on it.
+    if (bstr_equals0(var, "OUTPUT")) {
+        size[0] = p->dst_rect.x1 - p->dst_rect.x0;
+        size[1] = p->dst_rect.y1 - p->dst_rect.y0;
+        return true;
+    }
 
-found_tex:
-            stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? var_tex.w : var_tex.h;
-            continue;
-            }
-        }
+    // HOOKED is a special case
+    if (bstr_equals0(var, "HOOKED")) {
+        size[0] = ctx->tex.w;
+        size[1] = ctx->tex.h;
+        return true;
     }
 
-done:
-    // Return the single stack element
-    if (idx != 1) {
-        MP_WARN(p, "Malformed stack after RPN expression!\n");
-        return false;
+    for (int o = 0; o < p->saved_tex_num; o++) {
+        if (bstr_equals0(var, p->saved_tex[o].name)) {
+            size[0] = p->saved_tex[o].tex.w;
+            size[1] = p->saved_tex[o].tex.h;
+            return true;
+        }
     }
 
-    *result = stack[0];
-    return true;
+    return false;
 }
 
 static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv)
@@ -1656,7 +1586,7 @@ static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv)
     assert(shader);
 
     float res = false;
-    eval_szexpr(p, tex, shader->cond, &res);
+    eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->cond, &res);
     return res;
 }
 
@@ -1674,8 +1604,8 @@ static void user_hook(struct gl_video *p, struct img_tex tex,
     // to do this and display an error message than just crash OpenGL
     float w = 1.0, h = 1.0;
 
-    eval_szexpr(p, tex, shader->width, &w);
-    eval_szexpr(p, tex, shader->height, &h);
+    eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->width, &w);
+    eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->height, &h);
 
     *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}};
     gl_transform_trans(shader->offset, trans);
@@ -1983,7 +1913,7 @@ static void pass_convert_yuv(struct gl_video *p)
         GLSLF("color = color.%s;\n", p->color_swizzle);
 
     // Pre-colormatrix input gamma correction
-    if (cparams.colorspace == MP_CSP_XYZ)
+    if (cparams.color.space == MP_CSP_XYZ)
         GLSL(color.rgb = pow(color.rgb, vec3(2.6));) // linear light
 
     // We always explicitly normalize the range in pass_read_video
@@ -1998,7 +1928,7 @@ static void pass_convert_yuv(struct gl_video *p)
 
     GLSL(color.rgb = mat3(colormatrix) * color.rgb + colormatrix_c;)
 
-    if (p->image_params.colorspace == MP_CSP_BT_2020_C) {
+    if (p->image_params.color.space == MP_CSP_BT_2020_C) {
         // Conversion for C'rcY'cC'bc via the BT.2020 CL system:
         // C'bc = (B'-Y'c) / 1.9404  | C'bc <= 0
         //      = (B'-Y'c) / 1.5816  | C'bc >  0
@@ -2109,7 +2039,7 @@ static void pass_scale_main(struct gl_video *p)
     // Pre-conversion, like linear light/sigmoidization
     GLSLF("// scaler pre-conversion\n");
     if (p->use_linear) {
-        pass_linearize(p->sc, p->image_params.gamma);
+        pass_linearize(p->sc, p->image_params.color.gamma);
         pass_opt_hook_point(p, "LINEAR", NULL);
     }
 
@@ -2156,107 +2086,104 @@ static void pass_scale_main(struct gl_video *p)
     }
 }
 
-// Adapts the colors from the given color space to the display device's native
-// gamut.
-static void pass_colormanage(struct gl_video *p, float peak_src,
-                             enum mp_csp_prim prim_src,
-                             enum mp_csp_trc trc_src)
+// Adapts the colors to the right output color space. (Final pass during
+// rendering)
+// If OSD is true, ignore any changes that may have been made to the video
+// by previous passes (i.e. linear scaling)
+static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool osd)
 {
-    GLSLF("// color management\n");
-    enum mp_csp_trc trc_dst = p->opts.target_trc;
-    enum mp_csp_prim prim_dst = p->opts.target_prim;
-    float peak_dst = p->opts.target_brightness;
+    struct mp_colorspace ref = src;
+
+    if (p->use_linear && !osd)
+        src.gamma = MP_CSP_TRC_LINEAR;
+
+    // Figure out the target color space from the options, or auto-guess if
+    // none were set
+    struct mp_colorspace dst = {
+        .gamma = p->opts.target_trc,
+        .primaries = p->opts.target_prim,
+        .nom_peak = mp_csp_trc_nom_peak(p->opts.target_trc, p->opts.target_brightness),
+    };
 
     if (p->use_lut_3d) {
-        // The 3DLUT is always generated against the original source space
-        enum mp_csp_prim prim_orig = p->image_params.primaries;
-        enum mp_csp_trc trc_orig = p->image_params.gamma;
-
-        // One exception: SMPTE ST.2084 is not implemented by LittleCMS
-        // for technical limitation reasons, so we use a gamma 2.2 input curve
-        // here instead. We could pick any value we want here, the difference
-        // is just coding efficiency.
-        if (trc_orig == MP_CSP_TRC_SMPTE_ST2084)
+        // The 3DLUT is always generated against the video's original source
+        // space, *not* the reference space. (To avoid having to regenerate
+        // the 3DLUT for the OSD on every frame)
+        enum mp_csp_prim prim_orig = p->image_params.color.primaries;
+        enum mp_csp_trc trc_orig = p->image_params.color.gamma;
+
+        // One exception: HDR is not implemented by LittleCMS for technical
+        // limitation reasons, so we use a gamma 2.2 input curve here instead.
+        // We could pick any value we want here, the difference is just coding
+        // efficiency.
+        if (trc_orig == MP_CSP_TRC_SMPTE_ST2084 ||
+            trc_orig == MP_CSP_TRC_ARIB_STD_B67 ||
+            trc_orig == MP_CSP_TRC_V_LOG)
+        {
             trc_orig = MP_CSP_TRC_GAMMA22;
+        }
 
         if (gl_video_get_lut3d(p, prim_orig, trc_orig)) {
-            prim_dst = prim_orig;
-            trc_dst = trc_orig;
+            dst.primaries = prim_orig;
+            dst.gamma = trc_orig;
         }
     }
 
-    // When auto-guessing the output color params, just pick the source color
-    // params to preserve the authentic "look and feel" of wrong/naive players.
-    // Some exceptions apply to source spaces that even hardcore technoluddites
-    // would probably not enjoy viewing unaltered
-    if (prim_dst == MP_CSP_PRIM_AUTO) {
-        prim_dst = prim_src;
+    if (dst.primaries == MP_CSP_PRIM_AUTO) {
+        // The vast majority of people are on sRGB or BT.709 displays, so pick
+        // this as the default output color space.
+        dst.primaries = MP_CSP_PRIM_BT_709;
 
-        // Avoid outputting very wide gamut content automatically, since the
-        // majority target audience has standard gamut displays
-        if (prim_dst == MP_CSP_PRIM_BT_2020 || prim_dst == MP_CSP_PRIM_PRO_PHOTO)
-            prim_dst = MP_CSP_PRIM_BT_709;
+        if (ref.primaries == MP_CSP_PRIM_BT_601_525 ||
+            ref.primaries == MP_CSP_PRIM_BT_601_625)
+        {
+            // Since we auto-pick BT.601 and BT.709 based on the dimensions,
+            // combined with the fact that they're very similar to begin with,
+            // and to avoid confusing the average user, just don't adapt BT.601
+            // content automatically at all.
+            dst.primaries = ref.primaries;
+        }
     }
 
-    if (trc_dst == MP_CSP_TRC_AUTO) {
-        trc_dst = trc_src;
-        // Avoid outputting linear light at all costs. First try
-        // falling back to the image gamma (e.g. in the case that the input
-        // was linear light due to linear-scaling)
-        if (trc_dst == MP_CSP_TRC_LINEAR)
-            trc_dst = p->image_params.gamma;
-
-        // Failing that, pick gamma 2.2 as a reasonable default. This is also
-        // picked as a default for outputting HDR content
-        if (trc_dst == MP_CSP_TRC_LINEAR || trc_dst == MP_CSP_TRC_SMPTE_ST2084)
-            trc_dst = MP_CSP_TRC_GAMMA22;
-    }
+    if (dst.gamma == MP_CSP_TRC_AUTO) {
+        // Most people seem to complain when the image is darker or brighter
+        // than what they're "used to", so just avoid changing the gamma
+        // altogether by default. The only exceptions to this rule apply to
+        // very unusual TRCs, which even hardcore technoluddites would probably
+        // not enjoy viewing unaltered.
+        dst.gamma = ref.gamma;
 
-    if (!peak_src) {
-        // If the source has no information known, it's display-referred
-        // (and should be treated relative to the specified desired peak_dst)
-        peak_src = peak_dst;
+        // Avoid outputting linear light or HDR content "by default". For these
+        // just pick gamma 2.2 as a default, since it's a good estimate for
+        // the response of typical displays
+        if (dst.gamma == MP_CSP_TRC_LINEAR || mp_trc_is_hdr(dst.gamma))
+            dst.gamma = MP_CSP_TRC_GAMMA22;
     }
 
-    // All operations from here on require linear light as a starting point,
-    // so we linearize even if trc_src == trc_dst when one of the other
-    // operations needs it
-    bool need_gamma = trc_src != trc_dst || prim_src != prim_dst ||
-                      peak_src != peak_dst;
-    if (need_gamma)
-        pass_linearize(p->sc, trc_src);
-
-    // Adapt and tone map for a different reference peak brightness
-    if (peak_src != peak_dst)
-    {
-        GLSLF("// HDR tone mapping\n");
-        float rel_peak = peak_src / peak_dst;
-        // Normalize such that 1 is the target brightness (and values above
-        // 1 are out of range)
-        GLSLF("color.rgb *= vec3(%f);\n", rel_peak);
-        // Tone map back down to the range [0,1]
-        pass_tone_map(p->sc, rel_peak, p->opts.hdr_tone_mapping,
-                      p->opts.tone_mapping_param);
+    // For the src peaks, the correct brightness metadata may be present for
+    // sig_peak, nom_peak, both, or neither. To handle everything in a generic
+    // way, it's important to never automatically infer a sig_peak that is
+    // below the nom_peak (since we don't know what bits the image contains,
+    // doing so would potentially badly clip). The only time in which this
+    // may be the case is when the mastering metadata explicitly says so, i.e.
+    // the sig_peak was already set. So to simplify the logic as much as
+    // possible, make sure the nom_peak is present and correct first, and just
+    // set sig_peak = nom_peak if missing.
+    if (!src.nom_peak) {
+        // For display-referred colorspaces, we treat it as relative to
+        // target_brightness
+        src.nom_peak = mp_csp_trc_nom_peak(src.gamma, p->opts.target_brightness);
     }
 
-    // Adapt to the right colorspace if necessary
-    if (prim_src != prim_dst) {
-        struct mp_csp_primaries csp_src = mp_get_csp_primaries(prim_src),
-                                csp_dst = mp_get_csp_primaries(prim_dst);
-        float m[3][3] = {{0}};
-        mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m);
-        gl_sc_uniform_mat3(p->sc, "cms_matrix", true, &m[0][0]);
-        GLSL(color.rgb = cms_matrix * color.rgb;)
-    }
+    if (!src.sig_peak)
+        src.sig_peak = src.nom_peak;
 
-    if (need_gamma) {
-        // If the target encoding function has a fixed peak, we need to
-        // un-normalize back to the encoding signal range
-        if (trc_dst == MP_CSP_TRC_SMPTE_ST2084)
-            GLSLF("color.rgb *= vec3(%f);\n", peak_dst / 10000);
+    MP_DBG(p, "HDR src nom: %f sig: %f, dst: %f\n",
+           src.nom_peak, src.sig_peak, dst.nom_peak);
 
-        pass_delinearize(p->sc, trc_dst);
-    }
+    // Adapt from src to dst as necessary
+    pass_color_map(p->sc, src, dst, p->opts.hdr_tone_mapping,
+                   p->opts.tone_mapping_param);
 
     if (p->use_lut_3d) {
         gl_sc_uniform_sampler(p->sc, "lut_3d", GL_TEXTURE_3D, TEXUNIT_3DLUT);
@@ -2397,11 +2324,15 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts,
         default:
             abort();
         }
-        // Subtitle color management, they're assumed to be display-referred
-        // sRGB by default
+        // When subtitles need to be color managed, assume they're in sRGB
+        // (for lack of anything saner to do)
         if (cms) {
-            pass_colormanage(p, p->opts.target_brightness,
-                             MP_CSP_PRIM_BT_709, MP_CSP_TRC_SRGB);
+            static const struct mp_colorspace csp_srgb = {
+                .primaries = MP_CSP_PRIM_BT_709,
+                .gamma = MP_CSP_TRC_SRGB,
+            };
+
+            pass_colormanage(p, csp_srgb, true);
         }
         gl_sc_set_vao(p->sc, mpgl_osd_get_vao(p->osd));
         gl_sc_gen_shader_and_reset(p->sc);
@@ -2503,7 +2434,7 @@ static void pass_render_frame(struct gl_video *p)
         rect.mt *= scale[1]; rect.mb *= scale[1];
         // We should always blend subtitles in non-linear light
         if (p->use_linear) {
-            pass_delinearize(p->sc, p->image_params.gamma);
+            pass_delinearize(p->sc, p->image_params.color.gamma);
             p->use_linear = false;
         }
         finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h,
@@ -2532,8 +2463,7 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
         GLSL(color.rgb = pow(color.rgb, vec3(user_gamma));)
     }
 
-    pass_colormanage(p, p->image_params.peak, p->image_params.primaries,
-                     p->use_linear ? MP_CSP_TRC_LINEAR : p->image_params.gamma);
+    pass_colormanage(p, p->image_params.color, false);
 
     // Draw checkerboard pattern to indicate transparency
     if (p->has_alpha && p->opts.alpha_mode == ALPHA_BLEND_TILES) {
@@ -2564,7 +2494,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     if (t->still)
         gl_video_reset_surfaces(p);
 
-    // First of all, figure out if we have a frame availble at all, and draw
+    // First of all, figure out if we have a frame available at all, and draw
     // it manually + reset the queue if not
     if (p->surfaces[p->surface_now].pts == MP_NOPTS_VALUE) {
         if (!gl_video_upload_image(p, t->current))
@@ -2577,7 +2507,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     }
 
     // Find the right frame for this instant
-    if (t->current&& t->current->pts != MP_NOPTS_VALUE) {
+    if (t->current && t->current->pts != MP_NOPTS_VALUE) {
         int next = fbosurface_wrap(p->surface_now + 1);
         while (p->surfaces[next].pts != MP_NOPTS_VALUE &&
                p->surfaces[next].pts > p->surfaces[p->surface_now].pts &&
@@ -2615,7 +2545,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     // this should be done before the step where we find the right frame, but
     // it only barely matters at the very beginning of playback, and this way
     // makes the code much more linear.
-    int surface_dst = fbosurface_wrap(p->surface_idx+1);
+    int surface_dst = fbosurface_wrap(p->surface_idx + 1);
     for (int i = 0; i < t->num_frames; i++) {
         // Avoid overwriting data we might still need
         if (surface_dst == surface_bse - 1)
@@ -2634,7 +2564,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
                             vp_w, vp_h, FBOTEX_FUZZY);
             p->surfaces[surface_dst].pts = f->pts;
             p->surface_idx = surface_dst;
-            surface_dst = fbosurface_wrap(surface_dst+1);
+            surface_dst = fbosurface_wrap(surface_dst + 1);
         }
     }
 
@@ -2645,7 +2575,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     // end of playback or start of playback.
     bool valid = true;
     for (int i = surface_bse, ii; valid && i != surface_end; i = ii) {
-        ii = fbosurface_wrap(i+1);
+        ii = fbosurface_wrap(i + 1);
         if (p->surfaces[i].pts == MP_NOPTS_VALUE ||
             p->surfaces[ii].pts == MP_NOPTS_VALUE)
         {
@@ -2737,6 +2667,11 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
     GL *gl = p->gl;
     struct video_image *vimg = &p->image;
 
+    if (fbo && !(gl->mpgl_caps & MPGL_CAP_FB)) {
+        MP_FATAL(p, "Rendering to FBO requested, but no FBO extension found!\n");
+        return;
+    }
+
     p->broken_frame = false;
 
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
@@ -2773,7 +2708,7 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
                     goto done;
                 pass_render_frame(p);
 
-                // For the non-interplation case, we draw to a single "cache"
+                // For the non-interpolation case, we draw to a single "cache"
                 // FBO to speed up subsequent re-draws (if any exist)
                 int dest_fbo = fbo;
                 if (frame->num_vsyncs > 1 && frame->display_synced &&
@@ -2781,7 +2716,7 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
                 {
                     fbotex_change(&p->output_fbo, p->gl, p->log,
                                   p->vp_w, abs(p->vp_h),
-                                  p->opts.fbo_format, 0);
+                                  p->opts.fbo_format, FBOTEX_FUZZY);
                     dest_fbo = p->output_fbo.fbo;
                     p->output_fbo_valid = true;
                 }
@@ -2880,54 +2815,6 @@ struct voctrl_performance_data gl_video_perfdata(struct gl_video *p)
     };
 }
 
-static bool unmap_image(struct gl_video *p, struct mp_image *mpi)
-{
-    GL *gl = p->gl;
-    bool ok = true;
-    struct video_image *vimg = &p->image;
-    for (int n = 0; n < p->plane_count; n++) {
-        struct texplane *plane = &vimg->planes[n];
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
-        ok = gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER) && ok;
-        mpi->planes[n] = NULL; // PBO offset 0
-    }
-    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-    return ok;
-}
-
-static bool map_image(struct gl_video *p, struct mp_image *mpi)
-{
-    GL *gl = p->gl;
-
-    if (!p->opts.pbo)
-        return false;
-
-    struct video_image *vimg = &p->image;
-
-    for (int n = 0; n < p->plane_count; n++) {
-        struct texplane *plane = &vimg->planes[n];
-        mpi->stride[n] = mp_image_plane_w(mpi, n) * p->image_desc.bytes[n];
-        size_t buffer_size = mp_image_plane_h(mpi, n) * mpi->stride[n];
-        if (!plane->gl_buffer) {
-            gl->GenBuffers(1, &plane->gl_buffer);
-            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
-            gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size,
-                           NULL, GL_DYNAMIC_DRAW);
-        }
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
-        mpi->planes[n] = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0,
-                                            buffer_size, GL_MAP_WRITE_BIT |
-                                                GL_MAP_INVALIDATE_BUFFER_BIT);
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-        if (!mpi->planes[n]) {
-            unmap_image(p, mpi);
-            return false;
-        }
-    }
-    memset(mpi->bufs, 0, sizeof(mpi->bufs));
-    return true;
-}
-
 // This assumes nv12, with textures set to GL_NEAREST filtering.
 static void reinterleave_vdpau(struct gl_video *p, struct gl_hwdec_frame *frame)
 {
@@ -3024,32 +2911,19 @@ static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi)
 
     gl_timer_start(p->upload_timer);
 
-    mp_image_t pbo_mpi = *mpi;
-    bool pbo = map_image(p, &pbo_mpi);
-    if (pbo) {
-        mp_image_copy(&pbo_mpi, mpi);
-        if (unmap_image(p, &pbo_mpi)) {
-            mpi = &pbo_mpi;
-        } else {
-            MP_FATAL(p, "Video PBO upload failed. Disabling PBOs.\n");
-            pbo = false;
-            p->opts.pbo = 0;
-        }
-    }
 
-    vimg->image_flipped = mpi->stride[0] < 0;
     for (int n = 0; n < p->plane_count; n++) {
         struct texplane *plane = &vimg->planes[n];
-        if (pbo)
-            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
-        gl->ActiveTexture(GL_TEXTURE0 + n);
+
+        plane->flipped = mpi->stride[0] < 0;
+
         gl->BindTexture(plane->gl_target, plane->gl_texture);
-        gl_upload_tex(gl, plane->gl_target, plane->gl_format, plane->gl_type,
-                      mpi->planes[n], mpi->stride[n], 0, 0, plane->w, plane->h);
+        gl_pbo_upload_tex(&plane->pbo, gl, p->opts.pbo, plane->gl_target,
+                          plane->gl_format, plane->gl_type, plane->w, plane->h,
+                          mpi->planes[n], mpi->stride[n],
+                          0, 0, plane->w, plane->h);
+        gl->BindTexture(plane->gl_target, 0);
     }
-    gl->ActiveTexture(GL_TEXTURE0);
-    if (pbo)
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
     gl_timer_stop(p->upload_timer);
 
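The upload paths touched above (both the old map/unmap PBO code and the new `gl_pbo_upload_tex` call) copy image planes row by row, where source and destination rows can be padded to different strides. A minimal sketch of such a strided copy (the helper name `copy_pic` is hypothetical, loosely modeled on mpv's `memcpy_pic`):

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Copy `height` rows of `bytes_per_row` bytes each, where source and
 * destination rows may be padded out to different strides. */
static void copy_pic(unsigned char *dst, ptrdiff_t dst_stride,
                     const unsigned char *src, ptrdiff_t src_stride,
                     size_t bytes_per_row, int height)
{
    for (int y = 0; y < height; y++) {
        memcpy(dst, src, bytes_per_row);
        dst += dst_stride;
        src += src_stride;
    }
}
```

A negative stride (walking the destination bottom-up) is how flipped images, as flagged by `plane->flipped` above, can be handled by the same loop.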
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index 1f37f4f..ff87b99 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -227,13 +227,31 @@ static const float HDR_M1 = 2610./4096 * 1./4,
                    HDR_C2 = 2413./4096 * 32,
                    HDR_C3 = 2392./4096 * 32;
 
-// Linearize (expand), given a TRC as input
+// Common constants for ARIB STD-B67 (Hybrid Log-gamma)
+static const float B67_A = 0.17883277,
+                   B67_B = 0.28466892,
+                   B67_C = 0.55991073;
+
+// Common constants for Panasonic V-Log
+static const float VLOG_B = 0.00873,
+                   VLOG_C = 0.241514,
+                   VLOG_D = 0.598206,
+                   VLOG_R = 46.085527; // nominal peak
+
+// Linearize (expand), given a TRC as input. This corresponds to the EOTF
+// in ITU-R terminology.
 void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 {
     if (trc == MP_CSP_TRC_LINEAR)
         return;
 
+    // Note that this clamp may technically violate the definition of
+    // ITU-R BT.2100, which allows sub-blacks and super-whites to be
+    // displayed on displays capable of showing them. That said, the
+    // problem is that not all gamma curves are well-defined for values
+    // outside this range, so we ignore it and just clip anyway for sanity.
     GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
+
     switch (trc) {
     case MP_CSP_TRC_SRGB:
         GLSL(color.rgb = mix(color.rgb / vec3(12.92),
@@ -265,12 +283,34 @@ void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
               HDR_C1, HDR_C2, HDR_C3);
         GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", HDR_M1);
         break;
+    case MP_CSP_TRC_ARIB_STD_B67:
+        GLSLF("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,\n"
+              "                exp((color.rgb - vec3(%f)) / vec3(%f)) + vec3(%f),\n"
+              "                lessThan(vec3(0.5), color.rgb));\n",
+              B67_C, B67_A, B67_B);
+        // Since the ARIB function's signal value of 1.0 corresponds to
+        // a peak of 12.0, we need to renormalize to prevent GL textures
+        // from clipping. (In general, mpv's internal conversions always
+        // assume 1.0 is the maximum brightness, not the reference peak)
+        GLSL(color.rgb /= vec3(12.0);)
+        break;
+    case MP_CSP_TRC_V_LOG:
+        GLSLF("color.rgb = mix((color.rgb - vec3(0.125)) / vec3(5.6), \n"
+              "    pow(vec3(10.0), (color.rgb - vec3(%f)) / vec3(%f)) \n"
+              "              - vec3(%f),                              \n"
+              "    lessThanEqual(vec3(0.181), color.rgb));            \n",
+              VLOG_D, VLOG_C, VLOG_B);
+        // Same deal as with the B67 function, renormalize to texture range
+        GLSLF("color.rgb /= vec3(%f);\n", VLOG_R);
+        GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
+        break;
     default:
         abort();
     }
 }
 
-// Delinearize (compress), given a TRC as output
+// Delinearize (compress), given a TRC as output. This corresponds to the
+// inverse EOTF (not the OETF) in ITU-R terminology.
 void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 {
     if (trc == MP_CSP_TRC_LINEAR)
@@ -308,15 +348,32 @@ void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
               HDR_C1, HDR_C2, HDR_C3);
         GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", HDR_M2);
         break;
+    case MP_CSP_TRC_ARIB_STD_B67:
+        GLSL(color.rgb *= vec3(12.0);)
+        GLSLF("color.rgb = mix(vec3(0.5) * sqrt(color.rgb),\n"
+              "                vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),\n"
+              "                lessThan(vec3(1.0), color.rgb));\n",
+              B67_A, B67_B, B67_C);
+        break;
+    case MP_CSP_TRC_V_LOG:
+        GLSLF("color.rgb *= vec3(%f);\n", VLOG_R);
+        GLSLF("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125),   \n"
+              "                vec3(%f) * log(color.rgb + vec3(%f))   \n"
+              "                    + vec3(%f),                        \n"
+              "                lessThanEqual(vec3(0.01), color.rgb)); \n",
+              VLOG_C / M_LN10, VLOG_B, VLOG_D);
+        break;
     default:
         abort();
     }
 }
 
 // Tone map from a known peak brightness to the range [0,1]
-void pass_tone_map(struct gl_shader_cache *sc, float peak,
-                   enum tone_mapping algo, float param)
+static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
+                          enum tone_mapping algo, float param)
 {
+    GLSLF("// HDR tone mapping\n");
+
     switch (algo) {
     case TONE_MAPPING_CLIP:
         GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
@@ -326,7 +383,7 @@ void pass_tone_map(struct gl_shader_cache *sc, float peak,
         float contrast = isnan(param) ? 0.5 : param,
               offset = (1.0 - contrast) / contrast;
         GLSLF("color.rgb = color.rgb / (color.rgb + vec3(%f));\n", offset);
-        GLSLF("color.rgb *= vec3(%f);\n", (peak + offset) / peak);
+        GLSLF("color.rgb *= vec3(%f);\n", (ref_peak + offset) / ref_peak);
         break;
     }
 
@@ -337,20 +394,20 @@ void pass_tone_map(struct gl_shader_cache *sc, float peak,
                A, C*B, D*E, A, B, D*F, E/F);
         GLSLHF("}\n");
 
-        GLSLF("color.rgb = hable(color.rgb) / hable(vec3(%f));\n", peak);
+        GLSLF("color.rgb = hable(color.rgb) / hable(vec3(%f));\n", ref_peak);
         break;
     }
 
     case TONE_MAPPING_GAMMA: {
         float gamma = isnan(param) ? 1.8 : param;
         GLSLF("color.rgb = pow(color.rgb / vec3(%f), vec3(%f));\n",
-              peak, 1.0/gamma);
+              ref_peak, 1.0/gamma);
         break;
     }
 
     case TONE_MAPPING_LINEAR: {
         float coeff = isnan(param) ? 1.0 : param;
-        GLSLF("color.rgb = vec3(%f) * color.rgb;\n", coeff / peak);
+        GLSLF("color.rgb = vec3(%f) * color.rgb;\n", coeff / ref_peak);
         break;
     }
 
@@ -359,6 +416,61 @@ void pass_tone_map(struct gl_shader_cache *sc, float peak,
     }
 }
 
+// Map colors from one colorspace to another. Both colorspaces
+// must be fully specified (i.e. not MP_CSP_*_AUTO), as this function won't perform
+// any auto-guessing.
+void pass_color_map(struct gl_shader_cache *sc,
+                    struct mp_colorspace src, struct mp_colorspace dst,
+                    enum tone_mapping algo, float tone_mapping_param)
+{
+    GLSLF("// color mapping\n");
+
+    // All operations from here on require linear light as a starting point,
+    // so we linearize even if src.gamma == dst.gamma when one of the other
+    // operations needs it
+    bool need_gamma = src.gamma != dst.gamma ||
+                      src.primaries != dst.primaries ||
+                      src.nom_peak != dst.nom_peak ||
+                      src.sig_peak > dst.nom_peak;
+
+    if (need_gamma)
+        pass_linearize(sc, src.gamma);
+
+    // NOTE: When src.gamma = MP_CSP_TRC_ARIB_STD_B67, we would technically
+    // need to apply the reference OOTF as part of the EOTF (which is what we
+    // implement with pass_linearize), since HLG considers OOTF to be part of
+    // the display's EOTF (as opposed to the camera's OETF). But since this is
+    // stupid, complicated, arbitrary, and more importantly depends on the
+    // target display's signal peak (which is != the nom_peak in the case of
+    // HDR displays, and mpv already has enough target-specific display
+    // options), we just ignore its implementation entirely. (Plus, it doesn't
+    // even really make sense with tone mapping to begin with.) But just in
+    // case somebody ends up complaining about HLG looking different from a
+    // reference HLG display, this comment might be why.
+
+    // Stretch the signal value to renormalize to the dst nominal peak
+    if (src.nom_peak != dst.nom_peak)
+        GLSLF("color.rgb *= vec3(%f);\n", src.nom_peak / dst.nom_peak);
+
+    // Tone map to prevent clipping when the source signal peak exceeds the
+    // encodable range.
+    if (src.sig_peak > dst.nom_peak)
+        pass_tone_map(sc, src.sig_peak / dst.nom_peak, algo, tone_mapping_param);
+
+    // Adapt to the right colorspace if necessary
+    if (src.primaries != dst.primaries) {
+        struct mp_csp_primaries csp_src = mp_get_csp_primaries(src.primaries),
+                                csp_dst = mp_get_csp_primaries(dst.primaries);
+        float m[3][3] = {{0}};
+        mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m);
+        gl_sc_uniform_mat3(sc, "cms_matrix", true, &m[0][0]);
+        GLSL(color.rgb = cms_matrix * color.rgb;)
+    }
+
+    if (need_gamma)
+        pass_delinearize(sc, dst.gamma);
+}
+
 // Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post.
 // Obtain random numbers by calling rand(h), followed by h = permute(h) to
 // update the state. Assumes the texture was hooked.
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
index 0ee3d81..3bc2f21 100644
--- a/video/out/opengl/video_shaders.h
+++ b/video/out/opengl/video_shaders.h
@@ -38,8 +38,9 @@ void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
 void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
 void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
 
-void pass_tone_map(struct gl_shader_cache *sc, float peak,
-                   enum tone_mapping algo, float param);
+void pass_color_map(struct gl_shader_cache *sc,
+                    struct mp_colorspace src, struct mp_colorspace dst,
+                    enum tone_mapping algo, float tone_mapping_param);
 
 void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts,
                         AVLFG *lfg);
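The `pass_tone_map` logic folded into `pass_color_map` above compresses a known reference peak down to [0,1]. The Reinhard case can be sketched in scalar C (a hedged illustration of the shader arithmetic, using the same default contrast of 0.5; the function name is for illustration only):

```c
#include <assert.h>
#include <math.h>

/* Reinhard-style curve as in the TONE_MAPPING_REINHARD case: compress
 * [0, ref_peak] into [0, 1]. The offset is derived from the contrast
 * parameter, and the final scale factor guarantees that an input of
 * exactly ref_peak maps to 1.0. */
static double tone_map_reinhard(double x, double ref_peak, double contrast)
{
    double offset = (1.0 - contrast) / contrast;
    return x / (x + offset) * ((ref_peak + offset) / ref_peak);
}
```

Unlike a plain clamp, this keeps highlight detail: values near the peak are compressed smoothly instead of being clipped.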
diff --git a/video/out/vo_direct3d.c b/video/out/vo_direct3d.c
index 5190095..74ddb23 100644
--- a/video/out/vo_direct3d.c
+++ b/video/out/vo_direct3d.c
@@ -38,7 +38,6 @@
 #include "common/common.h"
 #include "w32_common.h"
 #include "sub/osd.h"
-#include "bitmap_packer.h"
 
 // shaders generated by fxc.exe from d3d_shader_yuv.hlsl
 #include "d3d_shader_420p.h"
@@ -105,7 +104,6 @@ struct osdpart {
     struct d3dtex texture;
     int num_vertices;
     vertex_osd *vertices;
-    struct bitmap_packer *packer;
 };
 
 enum shaders {
@@ -822,7 +820,7 @@ static bool resize_d3d(d3d_priv *priv)
     MP_VERBOSE(priv, "resize_d3d %dx%d called.\n",
                priv->vo->dwidth, priv->vo->dheight);
 
-    /* Make sure that backbuffer is large enough to accomodate the new
+    /* Make sure that backbuffer is large enough to accommodate the new
        viewport dimensions. Grow it if necessary. */
 
     bool backbuf_resize = priv->vo->dwidth > priv->cur_backbuf_width ||
@@ -1215,13 +1213,8 @@ static int preinit(struct vo *vo)
     priv->vo = vo;
     priv->log = vo->log;
 
-    for (int n = 0; n < MAX_OSD_PARTS; n++) {
-        struct osdpart *osd = talloc_ptrtype(priv, osd);
-        *osd = (struct osdpart) {
-            .packer = talloc_zero(osd, struct bitmap_packer),
-        };
-        priv->osd[n] = osd;
-    }
+    for (int n = 0; n < MAX_OSD_PARTS; n++)
+        priv->osd[n] = talloc_zero(priv, struct osdpart);
 
     priv->d3d9_dll = LoadLibraryA("d3d9.dll");
     if (!priv->d3d9_dll) {
@@ -1536,34 +1529,45 @@ error_exit:
     return NULL;
 }
 
+static D3DCOLOR ass_to_d3d_color(uint32_t color)
+{
+    uint32_t r = (color >> 24) & 0xff;
+    uint32_t g = (color >> 16) & 0xff;
+    uint32_t b = (color >> 8) & 0xff;
+    uint32_t a = 0xff - (color & 0xff);
+    return D3DCOLOR_ARGB(a, r, g, b);
+}
+
+static int next_pow2(int v)
+{
+    for (int x = 0; x < 30; x++) {
+        if ((1 << x) >= v)
+            return 1 << x;
+    }
+    return INT_MAX;
+}
+
 static bool upload_osd(d3d_priv *priv, struct osdpart *osd,
                        struct sub_bitmaps *imgs)
 {
     D3DFORMAT fmt = priv->osd_fmt_table[imgs->format];
 
-    osd->packer->w_max = priv->max_texture_width;
-    osd->packer->h_max = priv->max_texture_height;
+    assert(imgs->packed);
 
-    osd->packer->padding = imgs->scaled; // assume 2x2 filter on scaling
-    int r = packer_pack_from_subbitmaps(osd->packer, imgs);
-    if (r < 0) {
-        MP_ERR(priv, "OSD bitmaps do not fit on "
-            "a surface with the maximum supported size %dx%d.\n",
-            osd->packer->w_max, osd->packer->h_max);
-        return false;
-    }
+    osd->change_id = imgs->change_id;
+    osd->num_vertices = 0;
 
-    if (osd->packer->w > osd->texture.tex_w
-        || osd->packer->h > osd->texture.tex_h
+    if (imgs->packed_w > osd->texture.tex_w
+        || imgs->packed_h > osd->texture.tex_h
         || osd->format != imgs->format)
     {
         osd->format = imgs->format;
 
-        int new_w = osd->packer->w;
-        int new_h = osd->packer->h;
+        int new_w = next_pow2(imgs->packed_w);
+        int new_h = next_pow2(imgs->packed_h);
         d3d_fix_texture_size(priv, &new_w, &new_h);
 
-        MP_DBG(priv, "reallocate OSD surface.\n");
+        MP_DBG(priv, "reallocate OSD surface to %dx%d.\n", new_w, new_h);
 
         d3dtex_release(priv, &osd->texture);
         d3dtex_allocate(priv, &osd->texture, fmt, new_w, new_h);
@@ -1572,9 +1576,7 @@ static bool upload_osd(d3d_priv *priv, struct osdpart *osd,
             return false; // failed to allocate
     }
 
-    struct pos bb[2];
-    packer_get_bb(osd->packer, bb);
-    RECT dirty_rc = { bb[0].x, bb[0].y, bb[1].x, bb[1].y };
+    RECT dirty_rc = { 0, 0, imgs->packed_w, imgs->packed_h };
 
     D3DLOCKED_RECT locked_rect;
 
@@ -1586,15 +1588,50 @@ static bool upload_osd(d3d_priv *priv, struct osdpart *osd,
     }
 
     int ps = fmt == D3DFMT_A8 ? 1 : 4;
-    packer_copy_subbitmaps(osd->packer, imgs, locked_rect.pBits, ps,
-                           locked_rect.Pitch);
+    memcpy_pic(locked_rect.pBits, imgs->packed->planes[0], ps * imgs->packed_w,
+               imgs->packed_h, locked_rect.Pitch, imgs->packed->stride[0]);
 
     if (FAILED(IDirect3DTexture9_UnlockRect(osd->texture.system, 0))) {
         MP_ERR(priv, "OSD texture unlock failed.\n");
         return false;
     }
 
-    return d3dtex_update(priv, &osd->texture);
+    if (!d3dtex_update(priv, &osd->texture))
+        return false;
+
+    // We need 2 primitives per quad which makes 6 vertices.
+    osd->num_vertices = imgs->num_parts * 6;
+    MP_TARRAY_GROW(osd, osd->vertices, osd->num_vertices);
+
+    float tex_w = osd->texture.tex_w;
+    float tex_h = osd->texture.tex_h;
+
+    for (int n = 0; n < imgs->num_parts; n++) {
+        struct sub_bitmap *b = &imgs->parts[n];
+
+        D3DCOLOR color = imgs->format == SUBBITMAP_LIBASS
+                        ? ass_to_d3d_color(b->libass.color)
+                        : D3DCOLOR_ARGB(255, 255, 255, 255);
+
+        float x0 = b->x;
+        float y0 = b->y;
+        float x1 = b->x + b->dw;
+        float y1 = b->y + b->dh;
+        float tx0 = b->src_x / tex_w;
+        float ty0 = b->src_y / tex_h;
+        float tx1 = (b->src_x + b->w) / tex_w;
+        float ty1 = (b->src_y + b->h) / tex_h;
+
+        vertex_osd *v = &osd->vertices[n * 6];
+        v[0] = (vertex_osd) { x0, y0, 0, color, tx0, ty0 };
+        v[1] = (vertex_osd) { x1, y0, 0, color, tx1, ty0 };
+        v[2] = (vertex_osd) { x0, y1, 0, color, tx0, ty1 };
+        v[3] = (vertex_osd) { x1, y1, 0, color, tx1, ty1 };
+        v[4] = v[2];
+        v[5] = v[1];
+    }
+
+    return true;
 }
 
 static struct osdpart *generate_osd(d3d_priv *priv, struct sub_bitmaps *imgs)
@@ -1604,24 +1641,10 @@ static struct osdpart *generate_osd(d3d_priv *priv, struct sub_bitmaps *imgs)
 
     struct osdpart *osd = priv->osd[imgs->render_index];
 
-    if (imgs->change_id != osd->change_id) {
-        if (!upload_osd(priv, osd, imgs))
-            osd->packer->count = 0;
+    if (imgs->change_id != osd->change_id)
+        upload_osd(priv, osd, imgs);
 
-        osd->change_id = imgs->change_id;
-        osd->num_vertices = 0;
-    }
-
-    return osd->packer->count ? osd : NULL;
-}
-
-static D3DCOLOR ass_to_d3d_color(uint32_t color)
-{
-    uint32_t r = (color >> 24) & 0xff;
-    uint32_t g = (color >> 16) & 0xff;
-    uint32_t b = (color >> 8) & 0xff;
-    uint32_t a = 0xff - (color & 0xff);
-    return D3DCOLOR_ARGB(a, r, g, b);
+    return osd->num_vertices ? osd : NULL;
 }
 
 static void draw_osd_cb(void *ctx, struct sub_bitmaps *imgs)
@@ -1632,44 +1655,6 @@ static void draw_osd_cb(void *ctx, struct sub_bitmaps *imgs)
     if (!osd)
         return;
 
-    if (osd->packer->count && !osd->num_vertices) {
-        // We need 2 primitives per quad which makes 6 vertices (we could reduce
-        // the number of vertices by using an indexed vertex array, but it's
-        // probably not worth doing)
-        osd->num_vertices = osd->packer->count * 6;
-        osd->vertices = talloc_realloc(osd, osd->vertices, vertex_osd,
-                                       osd->num_vertices);
-
-        float tex_w = osd->texture.tex_w;
-        float tex_h = osd->texture.tex_h;
-
-        for (int n = 0; n < osd->packer->count; n++) {
-            struct sub_bitmap *b = &imgs->parts[n];
-            struct pos p = osd->packer->result[n];
-
-            D3DCOLOR color = imgs->format == SUBBITMAP_LIBASS
-                             ? ass_to_d3d_color(b->libass.color)
-                             : D3DCOLOR_ARGB(255, 255, 255, 255);
-
-            float x0 = b->x;
-            float y0 = b->y;
-            float x1 = b->x + b->dw;
-            float y1 = b->y + b->dh;
-            float tx0 = p.x / tex_w;
-            float ty0 = p.y / tex_h;
-            float tx1 = (p.x + b->w) / tex_w;
-            float ty1 = (p.y + b->h) / tex_h;
-
-            vertex_osd *v = &osd->vertices[n * 6];
-            v[0] = (vertex_osd) { x0, y0, 0, color, tx0, ty0 };
-            v[1] = (vertex_osd) { x1, y0, 0, color, tx1, ty0 };
-            v[2] = (vertex_osd) { x0, y1, 0, color, tx0, ty1 };
-            v[3] = (vertex_osd) { x1, y1, 0, color, tx1, ty1 };
-            v[4] = v[2];
-            v[5] = v[1];
-        }
-    }
-
     d3d_begin_scene(priv);
 
     IDirect3DDevice9_SetRenderState(priv->d3d_device,
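The two helpers moved/added in this file are self-contained and easy to sanity-check outside the VO. Below, `D3DCOLOR_ARGB` is a stand-in macro (matching the usual d3d9types.h packing) so the snippet compiles without the Direct3D headers:

```c
#include <assert.h>
#include <limits.h>
#include <stdint.h>

/* Stand-in for the d3d9types.h macro: pack 8-bit ARGB into a uint32_t. */
#define D3DCOLOR_ARGB(a, r, g, b) \
    (((uint32_t)(a) << 24) | ((uint32_t)(r) << 16) | \
     ((uint32_t)(g) << 8) | (uint32_t)(b))

/* Smallest power of two >= v; d3d9 textures often need pow2 dimensions. */
static int next_pow2(int v)
{
    for (int x = 0; x < 30; x++) {
        if ((1 << x) >= v)
            return 1 << x;
    }
    return INT_MAX;
}

/* libass packs colors as RGBA with alpha 0 = opaque; Direct3D expects
 * ARGB with alpha 255 = opaque, hence the 0xff - alpha inversion. */
static uint32_t ass_to_d3d_color(uint32_t color)
{
    uint32_t r = (color >> 24) & 0xff;
    uint32_t g = (color >> 16) & 0xff;
    uint32_t b = (color >> 8) & 0xff;
    uint32_t a = 0xff - (color & 0xff);
    return D3DCOLOR_ARGB(a, r, g, b);
}
```

For example, an opaque red in libass RGBA (0xFF000000) becomes 0xFFFF0000 in ARGB.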
diff --git a/video/out/vo_lavc.c b/video/out/vo_lavc.c
index 188a575..1721136 100644
--- a/video/out/vo_lavc.c
+++ b/video/out/vo_lavc.c
@@ -22,6 +22,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+
+#include "config.h"
 #include "common/common.h"
 #include "options/options.h"
 #include "video/fmt-conversion.h"
@@ -34,8 +36,6 @@
 #include "sub/osd.h"
 
 struct priv {
-    uint8_t *buffer;
-    size_t buffer_size;
     AVStream *stream;
     AVCodecContext *codec;
     int have_first_packet;
@@ -155,20 +155,12 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     vc->codec->height = height;
     vc->codec->pix_fmt = pix_fmt;
 
-    encode_lavc_set_csp(vo->encode_lavc_ctx, vc->codec, params->colorspace);
-    encode_lavc_set_csp_levels(vo->encode_lavc_ctx, vc->codec, params->colorlevels);
+    encode_lavc_set_csp(vo->encode_lavc_ctx, vc->codec, params->color.space);
+    encode_lavc_set_csp_levels(vo->encode_lavc_ctx, vc->codec, params->color.levels);
 
     if (encode_lavc_open_codec(vo->encode_lavc_ctx, vc->codec) < 0)
         goto error;
 
-    vc->buffer_size = 6 * width * height + 200;
-    if (vc->buffer_size < FF_MIN_BUFFER_SIZE)
-        vc->buffer_size = FF_MIN_BUFFER_SIZE;
-    if (vc->buffer_size < sizeof(AVPicture))
-        vc->buffer_size = sizeof(AVPicture);
-
-    vc->buffer = talloc_size(vc, vc->buffer_size);
-
 done:
     pthread_mutex_unlock(&vo->encode_lavc_ctx->lock);
     return 0;
@@ -194,82 +186,120 @@ static int query_format(struct vo *vo, int format)
     return flags;
 }
 
-static void write_packet(struct vo *vo, int size, AVPacket *packet)
+static void write_packet(struct vo *vo, AVPacket *packet)
 {
     struct priv *vc = vo->priv;
 
-    if (size < 0) {
-        MP_ERR(vo, "error encoding\n");
-        return;
+    packet->stream_index = vc->stream->index;
+    if (packet->pts != AV_NOPTS_VALUE) {
+        packet->pts = av_rescale_q(packet->pts,
+                                   vc->codec->time_base,
+                                   vc->stream->time_base);
+    } else {
+        MP_VERBOSE(vo, "codec did not provide pts\n");
+        packet->pts = av_rescale_q(vc->lastipts,
+                                   vc->worst_time_base,
+                                   vc->stream->time_base);
+    }
+    if (packet->dts != AV_NOPTS_VALUE) {
+        packet->dts = av_rescale_q(packet->dts,
+                                   vc->codec->time_base,
+                                   vc->stream->time_base);
+    }
+    if (packet->duration > 0) {
+        packet->duration = av_rescale_q(packet->duration,
+                                        vc->codec->time_base,
+                                        vc->stream->time_base);
+    } else {
+        // HACK: libavformat calculates dts wrong if the initial packet
+        // duration is not set, but ONLY if the time base is "high" and if we
+        // have b-frames!
+        if (!packet->duration)
+            if (!vc->have_first_packet)
+                if (vc->codec->has_b_frames
+                        || vc->codec->max_b_frames)
+                    if (vc->stream->time_base.num * 1000LL <=
+                            vc->stream->time_base.den)
+                        packet->duration = FFMAX(1, av_rescale_q(1,
+                             vc->codec->time_base, vc->stream->time_base));
     }
 
-    if (size > 0) {
-        packet->stream_index = vc->stream->index;
-        if (packet->pts != AV_NOPTS_VALUE) {
-            packet->pts = av_rescale_q(packet->pts,
-                                       vc->codec->time_base,
-                                       vc->stream->time_base);
-        } else {
-            MP_VERBOSE(vo, "codec did not provide pts\n");
-            packet->pts = av_rescale_q(vc->lastipts, vc->worst_time_base,
-                                       vc->stream->time_base);
-        }
-        if (packet->dts != AV_NOPTS_VALUE) {
-            packet->dts = av_rescale_q(packet->dts,
-                                       vc->codec->time_base,
-                                       vc->stream->time_base);
-        }
-        if (packet->duration > 0) {
-            packet->duration = av_rescale_q(packet->duration,
-                                       vc->codec->time_base,
-                                       vc->stream->time_base);
-        } else {
-            // HACK: libavformat calculates dts wrong if the initial packet
-            // duration is not set, but ONLY if the time base is "high" and if we
-            // have b-frames!
-            if (!packet->duration)
-                if (!vc->have_first_packet)
-                    if (vc->codec->has_b_frames
-                            || vc->codec->max_b_frames)
-                        if (vc->stream->time_base.num * 1000LL <=
-                                vc->stream->time_base.den)
-                            packet->duration = FFMAX(1, av_rescale_q(1,
-                                 vc->codec->time_base, vc->stream->time_base));
-        }
-
-        if (encode_lavc_write_frame(vo->encode_lavc_ctx,
-                                    vc->stream, packet) < 0) {
-            MP_ERR(vo, "error writing\n");
-            return;
-        }
-
-        vc->have_first_packet = 1;
+    if (encode_lavc_write_frame(vo->encode_lavc_ctx,
+                                vc->stream, packet) < 0) {
+        MP_ERR(vo, "error writing at %d %d/%d\n",
+               (int) packet->pts,
+               vc->stream->time_base.num,
+               vc->stream->time_base.den);
+        return;
     }
+
+    vc->have_first_packet = 1;
 }
 
-static int encode_video(struct vo *vo, AVFrame *frame, AVPacket *packet)
+static void encode_video_and_write(struct vo *vo, AVFrame *frame)
 {
     struct priv *vc = vo->priv;
-    int got_packet = 0;
-    int status = avcodec_encode_video2(vc->codec, packet,
-                                        frame, &got_packet);
-    int size = (status < 0) ? status : got_packet ? packet->size : 0;
-
-    if (frame)
-        MP_DBG(vo, "got pts %f; out size: %d\n",
-               frame->pts * (double) vc->codec->time_base.num /
-               (double) vc->codec->time_base.den, size);
-
-    if (got_packet)
+    AVPacket packet = {0};
+
+#if HAVE_AVCODEC_NEW_CODEC_API
+    int status = avcodec_send_frame(vc->codec, frame);
+    if (status < 0) {
+        MP_ERR(vo, "error encoding at %d %d/%d\n",
+               frame ? (int) frame->pts : -1,
+               vc->codec->time_base.num,
+               vc->codec->time_base.den);
+        return;
+    }
+    for (;;) {
+        av_init_packet(&packet);
+        status = avcodec_receive_packet(vc->codec, &packet);
+        if (status == AVERROR(EAGAIN)) { // No more packets for now.
+            if (frame == NULL) {
+                MP_ERR(vo, "sent flush frame, got EAGAIN\n");
+            }
+            break;
+        }
+        if (status == AVERROR_EOF) { // No more packets, ever.
+            if (frame != NULL) {
+                MP_ERR(vo, "sent image frame, got EOF\n");
+            }
+            break;
+        }
+        if (status < 0) {
+            MP_ERR(vo, "error encoding at %d %d/%d\n",
+                   frame ? (int) frame->pts : -1,
+                   vc->codec->time_base.num,
+                   vc->codec->time_base.den);
+            break;
+        }
         encode_lavc_write_stats(vo->encode_lavc_ctx, vc->codec);
-    return size;
+        write_packet(vo, &packet);
+        av_packet_unref(&packet);
+    }
+#else
+    av_init_packet(&packet);
+    int got_packet = 0;
+    int status = avcodec_encode_video2(vc->codec, &packet, frame, &got_packet);
+    if (status < 0) {
+        MP_ERR(vo, "error encoding at %d %d/%d\n",
+               frame ? (int) frame->pts : -1,
+               vc->codec->time_base.num,
+               vc->codec->time_base.den);
+        return;
+    }
+    if (!got_packet) {
+        return;
+    }
+    encode_lavc_write_stats(vo->encode_lavc_ctx, vc->codec);
+    write_packet(vo, &packet);
+    av_packet_unref(&packet);
+#endif
 }
 
 static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
 {
     struct priv *vc = vo->priv;
     struct encode_lavc_context *ectx = vo->encode_lavc_ctx;
-    int size;
     AVCodecContext *avc;
     int64_t frameipts;
     double nextpts;
@@ -398,7 +428,6 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
         // we have a valid image in lastimg
         while (vc->lastimg && vc->lastipts < frameipts) {
             int64_t thisduration = vc->harddup ? 1 : (frameipts - vc->lastipts);
-            AVPacket packet;
 
             // we will ONLY encode this frame if it can be encoded at at least
             // vc->mindeltapts after the last encoded frame!
@@ -417,20 +446,13 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
                 // this is a nop, unless the worst time base is the STREAM time base
                 frame->pts = av_rescale_q(vc->lastipts + skipframes,
                                           vc->worst_time_base, avc->time_base);
-
                 frame->pict_type = 0; // keep this at unknown/undefined
-
                 frame->quality = avc->global_quality;
+                encode_video_and_write(vo, frame);
+                av_frame_free(&frame);
 
-                av_init_packet(&packet);
-                packet.data = vc->buffer;
-                packet.size = vc->buffer_size;
-                size = encode_video(vo, frame, &packet);
-                write_packet(vo, size, &packet);
                 ++vc->lastdisplaycount;
                 vc->lastencodedipts = vc->lastipts + skipframes;
-
-                av_frame_free(&frame);
             }
 
             vc->lastipts += thisduration;
@@ -439,14 +461,7 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
 
     if (!mpi) {
         // finish encoding
-        do {
-            AVPacket packet;
-            av_init_packet(&packet);
-            packet.data = vc->buffer;
-            packet.size = vc->buffer_size;
-            size = encode_video(vo, NULL, &packet);
-            write_packet(vo, size, &packet);
-        } while (size > 0);
+        encode_video_and_write(vo, NULL);
     } else {
         if (frameipts >= vc->lastframeipts) {
             if (vc->lastframeipts != AV_NOPTS_VALUE && vc->lastdisplaycount != 1)
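The reworked `write_packet` above leans heavily on `av_rescale_q` to move timestamps between the codec and stream time bases. A hedged, simplified scalar version shows what that conversion does (the real FFmpeg function additionally rounds and guards against 64-bit overflow; the `rational` type stands in for `AVRational`):

```c
#include <assert.h>
#include <stdint.h>

typedef struct { int num, den; } rational; /* stand-in for AVRational */

/* Simplified av_rescale_q: compute a * bq / cq with truncating integer
 * math, i.e. convert a timestamp counted in bq ticks into cq ticks. */
static int64_t rescale_q(int64_t a, rational bq, rational cq)
{
    return a * (int64_t)bq.num * cq.den / ((int64_t)bq.den * cq.num);
}
```

For instance, a pts of 90000 in a 1/90000 time base is one second, which is 1000 ticks in a 1/1000 (millisecond) stream time base.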
diff --git a/video/out/vo_opengl.c b/video/out/vo_opengl.c
index 08b9b11..095308f 100644
--- a/video/out/vo_opengl.c
+++ b/video/out/vo_opengl.c
@@ -305,8 +305,10 @@ static int control(struct vo *vo, uint32_t request, void *data)
         struct mp_image *screen = gl_read_window_contents(p->gl);
         // set image parameters according to the display, if possible
         if (screen) {
-            screen->params.primaries = p->renderer_opts->target_prim;
-            screen->params.gamma = p->renderer_opts->target_trc;
+            screen->params.color = (struct mp_colorspace) {
+                .primaries = p->renderer_opts->target_prim,
+                .gamma = p->renderer_opts->target_trc,
+            };
             if (p->glctx->flip_v)
                 mp_image_vflip(screen);
         }
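The vo_opengl hunk replaces two field assignments with one compound-literal assignment to `screen->params.color`. The point of that style in C99 is that any member not named in the initializer is zeroed, so the whole struct is reset in one statement. A small sketch with a hypothetical stand-in struct (the real `struct mp_colorspace` has more members):

```c
// Demonstrates compound-literal assignment: members not named in the
// initializer (here .levels) are implicitly zeroed, so assigning the
// literal resets the entire struct, not just the two named fields.
struct colorspace_demo {
    int primaries;
    int gamma;
    int levels; // deliberately not set in make_csp()
};

static struct colorspace_demo make_csp(int prim, int trc)
{
    return (struct colorspace_demo){
        .primaries = prim,
        .gamma = trc,
        // .levels is implicitly 0
    };
}
```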
diff --git a/video/out/vo_rpi.c b/video/out/vo_rpi.c
index cd37362..947e630 100644
--- a/video/out/vo_rpi.c
+++ b/video/out/vo_rpi.c
@@ -529,7 +529,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     input->format->es->video.height = MP_ALIGN_UP(params->h, ALIGN_H);
     input->format->es->video.crop = (MMAL_RECT_T){0, 0, params->w, params->h};
     input->format->es->video.par = (MMAL_RATIONAL_T){params->p_w, params->p_h};
-    input->format->es->video.color_space = map_csp(params->colorspace);
+    input->format->es->video.color_space = map_csp(params->color.space);
 
     if (mmal_port_format_commit(input))
         return -1;
diff --git a/video/out/vo_vaapi.c b/video/out/vo_vaapi.c
index dc8aaac..11bb469 100644
--- a/video/out/vo_vaapi.c
+++ b/video/out/vo_vaapi.c
@@ -32,8 +32,9 @@
 #include "common/msg.h"
 #include "video/out/vo.h"
 #include "video/mp_image_pool.h"
-#include "sub/osd.h"
+#include "video/sws_utils.h"
 #include "sub/img_convert.h"
+#include "sub/osd.h"
 #include "x11_common.h"
 
 #include "video/mp_image.h"
@@ -58,7 +59,6 @@ struct vaapi_osd_part {
     int change_id;
     struct vaapi_osd_image image;
     struct vaapi_subpic subpic;
-    struct osd_conv_cache *conv_cache;
 };
 
 #define MAX_OUTPUT_SURFACES 2
@@ -225,7 +225,7 @@ static bool render_to_screen(struct priv *p, struct mp_image *mpi)
         }
     }
 
-    int flags = va_get_colorspace_flag(p->image_params.colorspace) |
+    int flags = va_get_colorspace_flag(p->image_params.color.space) |
                 p->scaling | VA_FRAME_PICTURE;
     status = vaPutSurface(p->display,
                           surface,
@@ -336,8 +336,6 @@ static void draw_osd_cb(void *pctx, struct sub_bitmaps *imgs)
     if (imgs->change_id != part->change_id) {
         part->change_id = imgs->change_id;
 
-        osd_scale_rgba(part->conv_cache, imgs);
-
         struct mp_rect bb;
         if (!mp_sub_bitmaps_bb(imgs, &bb))
             goto error;
@@ -365,6 +363,25 @@ static void draw_osd_cb(void *pctx, struct sub_bitmaps *imgs)
         for (int n = 0; n < imgs->num_parts; n++) {
             struct sub_bitmap *sub = &imgs->parts[n];
 
+            struct mp_image src = {0};
+            mp_image_setfmt(&src, IMGFMT_BGRA);
+            mp_image_set_size(&src, sub->w, sub->h);
+            src.planes[0] = sub->bitmap;
+            src.stride[0] = sub->stride;
+
+            struct mp_image *bmp = &src;
+
+            struct mp_image *tmp = NULL;
+            if (sub->dw != sub->w || sub->dh != sub->h) {
+                tmp = mp_image_alloc(IMGFMT_BGRA, sub->dw, sub->dh);
+                if (!tmp)
+                    goto error;
+
+                mp_image_swscale(tmp, &src, mp_sws_fast_flags);
+
+                bmp = tmp;
+            }
+
             // Note: nothing guarantees that the sub-bitmaps don't overlap.
             //       But in all currently existing cases, they don't.
             //       We simply hope that this won't change, and nobody will
@@ -373,8 +390,10 @@ static void draw_osd_cb(void *pctx, struct sub_bitmaps *imgs)
             size_t dst = (sub->y - bb.y0) * vaimg.stride[0] +
                          (sub->x - bb.x0) * 4;
 
-            memcpy_pic(vaimg.planes[0] + dst, sub->bitmap, sub->w * 4, sub->h,
-                       vaimg.stride[0], sub->stride);
+            memcpy_pic(vaimg.planes[0] + dst, bmp->planes[0], sub->dw * 4,
+                       sub->dh, vaimg.stride[0], bmp->stride[0]);
+
+            talloc_free(tmp);
         }
 
         if (!va_image_unmap(p->mpvaapi, &img->image))
@@ -630,7 +649,6 @@ static int preinit(struct vo *vo)
         struct vaapi_osd_part *part = &p->osd_parts[n];
         part->image.image.image_id = VA_INVALID_ID;
         part->image.subpic_id = VA_INVALID_ID;
-        part->conv_cache = talloc_steal(vo, osd_conv_cache_new());
     }
 
     int max_display_attrs = vaMaxNumDisplayAttributes(p->display);
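The vo_vaapi change above drops the `osd_conv_cache` pre-scaling pass and instead scales each sub-bitmap on demand, but only when its display size (`dw` x `dh`) differs from its storage size (`w` x `h`). A sketch of that "scale only when needed" decision, with nearest-neighbour standing in for `mp_image_swscale()` (the real code scales BGRA via libswscale):

```c
// Sketch of the on-demand scaling path added to draw_osd_cb(): reuse the
// source bitmap when no scaling is needed, allocate a temporary otherwise.
#include <stdint.h>
#include <stdlib.h>

// Scale a tightly packed 32-bit bitmap from (w,h) to (dw,dh),
// nearest-neighbour. Caller frees the result; NULL on alloc failure.
static uint32_t *scale_bgra(const uint32_t *src, int w, int h, int dw, int dh)
{
    uint32_t *dst = malloc((size_t)dw * dh * 4);
    if (!dst)
        return NULL;
    for (int y = 0; y < dh; y++) {
        for (int x = 0; x < dw; x++)
            dst[y * dw + x] = src[(y * h / dh) * w + (x * w / dw)];
    }
    return dst;
}

// Returns a dw*dh buffer; allocates only if scaling is actually required.
// *allocated tells the caller whether the result must be freed.
static uint32_t *prepare_sub_bitmap(const uint32_t *bitmap, int w, int h,
                                    int dw, int dh, int *allocated)
{
    if (dw == w && dh == h) {
        *allocated = 0;
        return (uint32_t *)bitmap; // fast path: no copy, no scale
    }
    *allocated = 1;
    return scale_bgra(bitmap, w, h, dw, dh);
}
```

The `talloc_free(tmp)` in the diff plays the role of freeing the temporary here: a no-op when the fast path was taken (`tmp == NULL`).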
diff --git a/video/out/vo_vdpau.c b/video/out/vo_vdpau.c
index 15472b2..6dd31e9 100644
--- a/video/out/vo_vdpau.c
+++ b/video/out/vo_vdpau.c
@@ -47,7 +47,6 @@
 #include "options/m_option.h"
 #include "video/mp_image.h"
 #include "osdep/timer.h"
-#include "bitmap_packer.h"
 
 // Returns x + a, but wrapped around to the range [0, m)
 // a must be within [-m, m], x within [0, m)
@@ -125,9 +124,7 @@ struct vdpctx {
     struct osd_bitmap_surface {
         VdpRGBAFormat format;
         VdpBitmapSurface surface;
-        uint32_t max_width;
-        uint32_t max_height;
-        struct bitmap_packer *packer;
+        uint32_t surface_w, surface_h;
         // List of surfaces to be rendered
         struct osd_target {
             VdpRect source;
@@ -144,18 +141,17 @@ struct vdpctx {
 };
 
 static bool status_ok(struct vo *vo);
-static void draw_osd(struct vo *vo);
 
-static int render_video_to_output_surface(struct vo *vo,
-                                          VdpOutputSurface output_surface,
-                                          VdpRect *output_rect,
-                                          VdpRect *video_rect)
+static int video_to_output_surface(struct vo *vo, struct mp_image *mpi)
 {
     struct vdpctx *vc = vo->priv;
     struct vdp_functions *vdp = vc->vdp;
     VdpTime dummy;
     VdpStatus vdp_st;
-    struct mp_image *mpi = vc->current_image;
+
+    VdpOutputSurface output_surface = vc->output_surfaces[vc->surface_num];
+    VdpRect *output_rect = &vc->out_rect_vid;
+    VdpRect *video_rect = &vc->src_rect_vid;
 
     vdp_st = vdp->presentation_queue_block_until_surface_idle(vc->flip_queue,
                                                               output_surface,
@@ -238,17 +234,6 @@ static int render_video_to_output_surface(struct vo *vo,
     return 0;
 }
 
-static int video_to_output_surface(struct vo *vo)
-{
-    struct vdpctx *vc = vo->priv;
-
-    int r = render_video_to_output_surface(vo,
-                                           vc->output_surfaces[vc->surface_num],
-                                           &vc->out_rect_vid, &vc->src_rect_vid);
-    draw_osd(vo);
-    return r;
-}
-
 static void forget_frames(struct vo *vo, bool seek_reset)
 {
     struct vdpctx *vc = vo->priv;
@@ -455,7 +440,6 @@ static void mark_vdpau_objects_uninitialized(struct vo *vo)
     vc->vdp_device = VDP_INVALID_HANDLE;
     for (int i = 0; i < MAX_OSD_PARTS; i++) {
         struct osd_bitmap_surface *sfc = &vc->osd_surfaces[i];
-        talloc_free(sfc->packer);
         sfc->change_id = 0;
         *sfc = (struct osd_bitmap_surface){
             .surface = VDP_INVALID_HANDLE,
@@ -531,22 +515,6 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     return 0;
 }
 
-static struct bitmap_packer *make_packer(struct vo *vo, VdpRGBAFormat format)
-{
-    struct vdpctx *vc = vo->priv;
-    struct vdp_functions *vdp = vc->vdp;
-
-    struct bitmap_packer *packer = talloc_zero(vo, struct bitmap_packer);
-    uint32_t w_max = 0, h_max = 0;
-    VdpStatus vdp_st = vdp->
-        bitmap_surface_query_capabilities(vc->vdp_device, format,
-                                          &(VdpBool){0}, &w_max, &h_max);
-    CHECK_VDP_WARNING(vo, "Query to get max OSD surface size failed");
-    packer->w_max = w_max;
-    packer->h_max = h_max;
-    return packer;
-}
-
 static void draw_osd_part(struct vo *vo, int index)
 {
     struct vdpctx *vc = vo->priv;
@@ -590,91 +558,99 @@ static void draw_osd_part(struct vo *vo, int index)
     }
 }
 
+static int next_pow2(int v)
+{
+    for (int x = 0; x < 30; x++) {
+        if ((1 << x) >= v)
+            return 1 << x;
+    }
+    return INT_MAX;
+}
+
 static void generate_osd_part(struct vo *vo, struct sub_bitmaps *imgs)
 {
     struct vdpctx *vc = vo->priv;
     struct vdp_functions *vdp = vc->vdp;
     VdpStatus vdp_st;
     struct osd_bitmap_surface *sfc = &vc->osd_surfaces[imgs->render_index];
-    bool need_upload = false;
 
     if (imgs->change_id == sfc->change_id)
         return; // Nothing changed and we still have the old data
 
+    sfc->change_id = imgs->change_id;
     sfc->render_count = 0;
 
     if (imgs->format == SUBBITMAP_EMPTY || imgs->num_parts == 0)
         return;
 
-    need_upload = true;
     VdpRGBAFormat format;
-    int format_size;
     switch (imgs->format) {
     case SUBBITMAP_LIBASS:
         format = VDP_RGBA_FORMAT_A8;
-        format_size = 1;
         break;
     case SUBBITMAP_RGBA:
         format = VDP_RGBA_FORMAT_B8G8R8A8;
-        format_size = 4;
         break;
     default:
         abort();
     };
-    if (sfc->format != format) {
-        talloc_free(sfc->packer);
-        sfc->packer = NULL;
-    };
-    sfc->format = format;
-    if (!sfc->packer)
-        sfc->packer = make_packer(vo, format);
-    sfc->packer->padding = imgs->scaled; // assume 2x2 filter on scaling
-    int r = packer_pack_from_subbitmaps(sfc->packer, imgs);
-    if (r < 0) {
-        MP_ERR(vo, "OSD bitmaps do not fit on a surface with the maximum "
-               "supported size\n");
-        return;
-    } else if (r == 1) {
+
+    assert(imgs->packed);
+
+    int r_w = next_pow2(imgs->packed_w);
+    int r_h = next_pow2(imgs->packed_h);
+
+    if (sfc->format != format || sfc->surface == VDP_INVALID_HANDLE ||
+        sfc->surface_w < r_w || sfc->surface_h < r_h)
+    {
+        MP_VERBOSE(vo, "Allocating a %dx%d surface for OSD bitmaps.\n", r_w, r_h);
+
+        uint32_t m_w = 0, m_h = 0;
+        vdp_st = vdp->bitmap_surface_query_capabilities(vc->vdp_device, format,
+                                                        &(VdpBool){0}, &m_w, &m_h);
+        CHECK_VDP_WARNING(vo, "Query to get max OSD surface size failed");
+
+        if (r_w > m_w || r_h > m_h) {
+            MP_ERR(vo, "OSD bitmaps do not fit on a surface with the maximum "
+                   "supported size\n");
+            return;
+        }
+
         if (sfc->surface != VDP_INVALID_HANDLE) {
             vdp_st = vdp->bitmap_surface_destroy(sfc->surface);
             CHECK_VDP_WARNING(vo, "Error when calling vdp_bitmap_surface_destroy");
         }
-        MP_VERBOSE(vo, "Allocating a %dx%d surface for OSD bitmaps.\n",
-                   sfc->packer->w, sfc->packer->h);
+
+        VdpBitmapSurface surface;
         vdp_st = vdp->bitmap_surface_create(vc->vdp_device, format,
-                                            sfc->packer->w, sfc->packer->h,
-                                            true, &sfc->surface);
-        if (vdp_st != VDP_STATUS_OK)
-            sfc->surface = VDP_INVALID_HANDLE;
+                                            r_w, r_h, true, &surface);
         CHECK_VDP_WARNING(vo, "OSD: error when creating surface");
-    }
-    if (imgs->scaled) {
-        char *zeros = calloc(sfc->packer->used_width, format_size);
-        if (!zeros)
+        if (vdp_st != VDP_STATUS_OK)
             return;
-        vdp_st = vdp->bitmap_surface_put_bits_native(sfc->surface,
-                &(const void *){zeros}, &(uint32_t){0},
-                &(VdpRect){0, 0, sfc->packer->used_width,
-                                 sfc->packer->used_height});
-        CHECK_VDP_WARNING(vo, "OSD: error uploading OSD bitmap");
-        free(zeros);
-    }
 
-    if (sfc->surface == VDP_INVALID_HANDLE)
-        return;
-    if (sfc->packer->count > sfc->targets_size) {
-        talloc_free(sfc->targets);
-        sfc->targets_size = sfc->packer->count;
-        sfc->targets = talloc_size(vc, sfc->targets_size
-                                       * sizeof(*sfc->targets));
+        sfc->surface = surface;
+        sfc->surface_w = r_w;
+        sfc->surface_h = r_h;
+        sfc->format = format;
     }
 
-    for (int i = 0 ;i < sfc->packer->count; i++) {
+    void *data = imgs->packed->planes[0];
+    int stride = imgs->packed->stride[0];
+    VdpRect rc = {0, 0, imgs->packed_w, imgs->packed_h};
+    vdp_st = vdp->bitmap_surface_put_bits_native(sfc->surface,
+                                                 &(const void *){data},
+                                                 &(uint32_t){stride},
+                                                 &rc);
+    CHECK_VDP_WARNING(vo, "OSD: putbits failed");
+
+    MP_TARRAY_GROW(vc, sfc->targets, imgs->num_parts);
+    sfc->render_count = imgs->num_parts;
+
+    for (int i = 0; i < imgs->num_parts; i++) {
         struct sub_bitmap *b = &imgs->parts[i];
-        struct osd_target *target = sfc->targets + sfc->render_count;
-        int x = sfc->packer->result[i].x;
-        int y = sfc->packer->result[i].y;
-        target->source = (VdpRect){x, y, x + b->w, y + b->h};
+        struct osd_target *target = &sfc->targets[i];
+        target->source = (VdpRect){b->src_x, b->src_y,
+                                   b->src_x + b->w, b->src_y + b->h};
         target->dest = (VdpRect){b->x, b->y, b->x + b->dw, b->y + b->dh};
         target->color = (VdpColor){1, 1, 1, 1};
         if (imgs->format == SUBBITMAP_LIBASS) {
@@ -684,18 +660,7 @@ static void generate_osd_part(struct vo *vo, struct sub_bitmaps *imgs)
             target->color.green = ((color >> 16) & 0xff) / 255.0;
             target->color.red   = ((color >> 24) & 0xff) / 255.0;
         }
-        if (need_upload) {
-            vdp_st = vdp->
-                bitmap_surface_put_bits_native(sfc->surface,
-                                               &(const void *){b->bitmap},
-                                               &(uint32_t){b->stride},
-                                               &target->source);
-                CHECK_VDP_WARNING(vo, "OSD: putbits failed");
-        }
-        sfc->render_count++;
     }
-
-    sfc->change_id = imgs->change_id;
 }
 
 static void draw_osd_cb(void *ctx, struct sub_bitmaps *imgs)
@@ -922,8 +887,10 @@ static void draw_frame(struct vo *vo, struct vo_frame *frame)
     vc->current_pts = frame->pts;
     vc->current_duration = frame->duration;
 
-    if (status_ok(vo))
-        video_to_output_surface(vo);
+    if (status_ok(vo)) {
+        video_to_output_surface(vo, vc->current_image);
+        draw_osd(vo);
+    }
 }
 
 // warning: the size and pixel format of surface must match that of the
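The new vo_vdpau surface logic rounds the packed OSD dimensions up with `next_pow2()` so that a slightly larger surface can be kept and reused across frames instead of being reallocated on every size change. The loop version in the diff is clear; for reference, the same function can be written branch-light with the classic bit-smearing trick (kept behaviour-identical to the loop, including the `INT_MAX` fallback for values above `1 << 29`):

```c
// Equivalent of the diff's next_pow2(): round v up to the next power of
// two. Smearing the highest set bit of v-1 downwards fills all lower
// bits, so adding 1 yields the next power of two.
#include <limits.h>

static int next_pow2_bits(int v)
{
    if (v <= 1)
        return 1;              // the loop version returns 1<<0 here
    if (v > (1 << 29))
        return INT_MAX;        // matches the loop's x < 30 bound
    unsigned u = (unsigned)v - 1;
    u |= u >> 1;               // smear the top set bit downwards...
    u |= u >> 2;
    u |= u >> 4;
    u |= u >> 8;
    u |= u >> 16;              // ...until all bits below it are set
    return (int)(u + 1);
}
```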
diff --git a/video/out/vo_x11.c b/video/out/vo_x11.c
index 7676155..01928b7 100644
--- a/video/out/vo_x11.c
+++ b/video/out/vo_x11.c
@@ -407,8 +407,8 @@ static int preinit(struct vo *vo)
         goto error;
 
     p->gc = XCreateGC(x11->display, x11->window, 0, NULL);
-    MP_WARN(vo, "Warning: this legacy VO has bad performance. Consider fixing"
-                "your graphic drivers, or not forcing the x11 VO.\n");
+    MP_WARN(vo, "Warning: this legacy VO has bad performance. Consider fixing "
+                "your graphics drivers, or not forcing the x11 VO.\n");
     return 0;
 
 error:
diff --git a/video/out/vo_xv.c b/video/out/vo_xv.c
index 1e7ae7c..121dff0 100644
--- a/video/out/vo_xv.c
+++ b/video/out/vo_xv.c
@@ -518,7 +518,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     ctx->current_buf = 0;
     ctx->current_ip_buf = 0;
 
-    int is_709 = params->colorspace == MP_CSP_BT_709;
+    int is_709 = params->color.space == MP_CSP_BT_709;
     xv_set_eq(vo, ctx->xv_port, "bt_709", is_709 * 200 - 100);
     read_xv_csp(vo);
 
@@ -533,6 +533,8 @@ static bool allocate_xvimage(struct vo *vo, int foo)
     struct vo_x11_state *x11 = vo->x11;
     // align it for faster OSD rendering (draw_bmp.c swscale usage)
     int aligned_w = FFALIGN(ctx->image_width, 32);
+    // round up the height to next chroma boundary too
+    int aligned_h = FFALIGN(ctx->image_height, 2);
 #if HAVE_SHM && HAVE_XEXT
     if (x11->display_is_local && XShmQueryExtension(x11->display)) {
         ctx->Shmem_Flag = 1;
@@ -546,7 +548,7 @@ static bool allocate_xvimage(struct vo *vo, int foo)
         ctx->xvimage[foo] =
             (XvImage *) XvShmCreateImage(x11->display, ctx->xv_port,
                                          ctx->xv_format, NULL,
-                                         aligned_w, ctx->image_height,
+                                         aligned_w, aligned_h,
                                          &ctx->Shminfo[foo]);
         if (!ctx->xvimage[foo])
             return false;
@@ -569,7 +571,7 @@ static bool allocate_xvimage(struct vo *vo, int foo)
         ctx->xvimage[foo] =
             (XvImage *) XvCreateImage(x11->display, ctx->xv_port,
                                       ctx->xv_format, NULL, aligned_w,
-                                      ctx->image_height);
+                                      aligned_h);
         if (!ctx->xvimage[foo])
             return false;
         ctx->xvimage[foo]->data = av_malloc(ctx->xvimage[foo]->data_size);
@@ -578,16 +580,16 @@ static bool allocate_xvimage(struct vo *vo, int foo)
         XSync(x11->display, False);
     }
 
-    if ((ctx->xvimage[foo]->width != aligned_w) ||
-        (ctx->xvimage[foo]->height != ctx->image_height)) {
-        MP_ERR(vo, "Got XvImage with incorrect size: %ux%u (expected %ux%u)\n",
+    if ((ctx->xvimage[foo]->width < aligned_w) ||
+        (ctx->xvimage[foo]->height < aligned_h)) {
+        MP_ERR(vo, "Got XvImage with too small size: %ux%u (expected %ux%u)\n",
                ctx->xvimage[foo]->width, ctx->xvimage[foo]->height,
                aligned_w, ctx->image_height);
         return false;
     }
 
     struct mp_image img = get_xv_buffer(vo, foo);
-    img.w = aligned_w;
+    mp_image_set_size(&img, aligned_w, aligned_h);
     mp_image_clear(&img, 0, 0, img.w, img.h);
     return true;
 }
@@ -659,7 +661,7 @@ static struct mp_image get_xv_buffer(struct vo *vo, int buf_index)
     if (vo->params) {
         struct mp_image_params params = *vo->params;
         if (ctx->cached_csp)
-            params.colorspace = ctx->cached_csp;
+            params.color.space = ctx->cached_csp;
         mp_image_set_attributes(&img, &params);
     }
 
@@ -854,7 +856,7 @@ static int preinit(struct vo *vo)
 
     MP_WARN(vo, "Warning: this legacy VO has bad quality and performance, "
                 "and will in particular result in blurry OSD and subtitles. "
-                "You should fix your graphic drivers, or not force the xv VO.\n");
+                "You should fix your graphics drivers, or not force the xv VO.\n");
     return 0;
 
   error:
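The vo_xv hunks align the image width to 32 (`FFALIGN(ctx->image_width, 32)`) and now also round the height up to an even value so subsampled chroma planes stay on a valid boundary. `FFALIGN` is FFmpeg's round-up macro; for a power-of-two alignment it is just an add-and-mask:

```c
// FFALIGN-style round-up, as used for aligned_w/aligned_h above.
// Adding a-1 then clearing the low bits rounds x up to the next
// multiple of a. Only valid when a is a power of two.
static int align_up(int x, int a)
{
    return (x + a - 1) & ~(a - 1);
}
```

With `a == 2` this turns an odd height like 719 into 720 while leaving even heights untouched, which is exactly what the new `aligned_h` computation needs.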
diff --git a/video/out/w32_common.c b/video/out/w32_common.c
index f3b59f1..e78e941 100644
--- a/video/out/w32_common.c
+++ b/video/out/w32_common.c
@@ -341,38 +341,36 @@ static LRESULT borderless_nchittest(struct vo_w32_state *w32, int x, int y)
     POINT mouse = { x, y };
     ScreenToClient(w32->window, &mouse);
 
+    // The horizontal frame should be the same size as the vertical frame,
+    // since the NONCLIENTMETRICS structure does not distinguish between them
+    int frame_size = GetSystemMetrics(SM_CXFRAME) +
+                     GetSystemMetrics(SM_CXPADDEDBORDER);
     // The diagonal size handles are slightly wider than the side borders
-    int handle_width = GetSystemMetrics(SM_CXSMSIZE) +
-                       GetSystemMetrics(SM_CXBORDER);
+    int diagonal_width = frame_size * 2 + GetSystemMetrics(SM_CXBORDER);
 
     // Hit-test top border
-    int frame_height = GetSystemMetrics(SM_CYFRAME) +
-                       GetSystemMetrics(SM_CXPADDEDBORDER);
-    if (mouse.y < frame_height) {
-        if (mouse.x < handle_width)
+    if (mouse.y < frame_size) {
+        if (mouse.x < diagonal_width)
             return HTTOPLEFT;
-        if (mouse.x > w32->dw - handle_width)
+        if (mouse.x >= w32->dw - diagonal_width)
             return HTTOPRIGHT;
         return HTTOP;
     }
 
     // Hit-test bottom border
-    if (mouse.y > w32->dh - frame_height) {
-        if (mouse.x < handle_width)
+    if (mouse.y >= w32->dh - frame_size) {
+        if (mouse.x < diagonal_width)
             return HTBOTTOMLEFT;
-        if (mouse.x > w32->dw - handle_width)
+        if (mouse.x >= w32->dw - diagonal_width)
             return HTBOTTOMRIGHT;
         return HTBOTTOM;
     }
 
     // Hit-test side borders
-    int frame_width = GetSystemMetrics(SM_CXFRAME) +
-                      GetSystemMetrics(SM_CXPADDEDBORDER);
-    if (mouse.x < frame_width)
+    if (mouse.x < frame_size)
         return HTLEFT;
-    if (mouse.x > w32->dw - frame_width)
+    if (mouse.x >= w32->dw - frame_size)
         return HTRIGHT;
-
     return HTCLIENT;
 }
 
@@ -1182,7 +1180,7 @@ static void gui_thread_reconfig(void *ptr)
     }
 
     // Recenter window around old position on new video size
-    // excluding the case when initial positon handled by win_state.
+    // excluding the case when initial position handled by win_state.
     if (!pos_init) {
         w32->window_x += w32->dw / 2 - vo->dwidth / 2;
         w32->window_y += w32->dh / 2 - vo->dheight / 2;
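The w32_common hit-test fix changes the far-edge comparisons from `>` to `>=`: client coordinates are 0-based, so valid x values lie in the half-open range [0, dw), and with `>` the right and bottom bands would be one pixel narrower than the left and top ones. A sketch of the symmetric one-axis test after the fix:

```c
// Symmetric border hit-test on one axis, matching the fixed logic above:
// with 0-based coordinates in [0, w), the near band is x < frame and the
// far band is x >= w - frame, so both bands are exactly `frame` wide.
enum hit { HIT_CLIENT, HIT_NEAR, HIT_FAR };

static enum hit hittest_axis(int x, int w, int frame)
{
    if (x < frame)
        return HIT_NEAR;
    if (x >= w - frame)   // '>' here would shrink the far band by 1px
        return HIT_FAR;
    return HIT_CLIENT;
}
```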
diff --git a/video/out/x11_common.c b/video/out/x11_common.c
index 647a910..ce94679 100644
--- a/video/out/x11_common.c
+++ b/video/out/x11_common.c
@@ -514,7 +514,7 @@ static void *screensaver_thread(void *arg)
             break;
 
         char *args[] = {"xdg-screensaver", "reset", NULL};
-        int status = mp_subprocess(args, NULL, NULL, NULL, NULL, &(char*){0});
+        int status = mp_subprocess(args, NULL, NULL, mp_devnull, mp_devnull, &(char*){0});
         if (status) {
             MP_VERBOSE(x11, "Disabling screensaver failed (%d). Make sure the "
                             "xdg-screensaver script is installed.\n", status);
@@ -1506,7 +1506,7 @@ static void vo_x11_map_window(struct vo *vo, struct mp_rect rc)
 
     // map window
     int events = StructureNotifyMask | ExposureMask | PropertyChangeMask |
-                 LeaveWindowMask | EnterWindowMask;
+                 LeaveWindowMask | EnterWindowMask | FocusChangeMask;
     if (mp_input_mouse_enabled(x11->input_ctx))
         events |= PointerMotionMask | ButtonPressMask | ButtonReleaseMask;
     if (mp_input_vo_keyboard_enabled(x11->input_ctx))
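The x11_common change passes `mp_devnull` for the child's stdout and stderr so the periodic `xdg-screensaver reset` call no longer spams the terminal. `mp_devnull` is mpv-internal; the effect can be sketched with plain POSIX fork/exec and a `/dev/null` redirect:

```c
// Sketch of what silencing the subprocess amounts to: run a child with
// stdout/stderr redirected to /dev/null and return its exit status.
// Uses plain POSIX, not mpv's mp_subprocess().
#include <fcntl.h>
#include <sys/wait.h>
#include <unistd.h>

// Returns the child's exit status, 127 if exec failed, or -1 on error.
static int run_silenced(char *const argv[])
{
    pid_t pid = fork();
    if (pid < 0)
        return -1;
    if (pid == 0) {
        int devnull = open("/dev/null", O_WRONLY);
        if (devnull >= 0) {
            dup2(devnull, STDOUT_FILENO); // silence stdout
            dup2(devnull, STDERR_FILENO); // silence stderr
            if (devnull > STDERR_FILENO)
                close(devnull);
        }
        execvp(argv[0], argv);
        _exit(127); // exec failed; report via exit status
    }
    int status = 0;
    if (waitpid(pid, &status, 0) < 0)
        return -1;
    return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
```

The nonzero return path corresponds to the `if (status)` branch in the diff, which logs that `xdg-screensaver` may not be installed.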
diff --git a/video/sws_utils.c b/video/sws_utils.c
index ce44c67..45918b1 100644
--- a/video/sws_utils.c
+++ b/video/sws_utils.c
@@ -192,11 +192,11 @@ int mp_sws_reinit(struct mp_sws_context *ctx)
         return -1;
     }
 
-    int s_csp = mp_csp_to_sws_colorspace(src->colorspace);
-    int s_range = src->colorlevels == MP_CSP_LEVELS_PC;
+    int s_csp = mp_csp_to_sws_colorspace(src->color.space);
+    int s_range = src->color.levels == MP_CSP_LEVELS_PC;
 
-    int d_csp = mp_csp_to_sws_colorspace(dst->colorspace);
-    int d_range = dst->colorlevels == MP_CSP_LEVELS_PC;
+    int d_csp = mp_csp_to_sws_colorspace(dst->color.space);
+    int d_range = dst->color.levels == MP_CSP_LEVELS_PC;
 
     // Work around libswscale bug #1852 (fixed in ffmpeg commit 8edf9b1fa):
     // setting range flags for RGB gives random bogus results.
diff --git a/wscript b/wscript
index 1915f7c..9269d95 100644
--- a/wscript
+++ b/wscript
@@ -498,10 +498,17 @@ FFmpeg/Libav libraries. You need at least {0}. Aborting.".format(libav_versions_
                                 '(void)offsetof(AVFrame, hw_frames_ctx)',
                                 use='libav'),
     }, {
-        'name': 'avutil-st2084',
-        'desc': 'libavutil AVCOL_TRC_SMPTEST2084',
+        'name': 'avutil-hdr',
+        'desc': 'libavutil HDR TRCs',
         'func': check_statement('libavutil/pixfmt.h',
-                                'AVCOL_TRC_SMPTEST2084',
+                                'AVCOL_TRC_SMPTEST2084,'
+                                'AVCOL_TRC_ARIB_STD_B67',
+                                use='libav'),
+    }, {
+        'name': 'avutil-mastering-metadata',
+        'desc': 'libavutil mastering display metadata struct',
+        'func': check_statement('libavutil/mastering_display_metadata.h',
+                                'AV_FRAME_DATA_MASTERING_DISPLAY_METADATA',
                                 use='libav'),
     }
 ]
diff --git a/wscript_build.py b/wscript_build.py
index 54518b3..8f2b85c 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -288,7 +288,7 @@ def build(ctx):
         ( "video/decode/dec_video.c"),
         ( "video/decode/dxva2.c",                "d3d-hwaccel" ),
         ( "video/decode/d3d11va.c",              "d3d-hwaccel" ),
-        ( "video/decode/d3d.c",                  "d3d-hwaccel" ),
+        ( "video/decode/d3d.c",                  "win32" ),
         ( "video/decode/vaapi.c",                "vaapi-hwaccel" ),
         ( "video/decode/vd_lavc.c" ),
         ( "video/decode/videotoolbox.c",         "videotoolbox-hwaccel" ),
@@ -327,7 +327,6 @@ def build(ctx):
         ( "video/out/dither.c" ),
         ( "video/out/filter_kernels.c" ),
         ( "video/out/opengl/angle_dynamic.c",    "egl-angle" ),
-        ( "video/out/opengl/angle_common.c",     "egl-angle" ),
         ( "video/out/opengl/common.c",           "gl" ),
         ( "video/out/opengl/context.c",          "gl" ),
         ( "video/out/opengl/context_angle.c",    "egl-angle" ),

-- 
mpv packaging
