[SCM] libav/experimental: vc-1: Optimise parser (with special attention to ARM)

Sun Aug 10 16:04:13 UTC 2014

The following commit has been merged in the experimental branch:
commit 701e8b42e12ad625c64ceae2252acb1de390278c
Author: Ben Avison <bavison at riscosopen.org>
Date:   Mon Jul 21 14:53:09 2014 +0100

    vc-1: Optimise parser (with special attention to ARM)
    
    The previous implementation of the parser made four passes over each input
    buffer (reduced to two if the container format already guaranteed the input
    buffer corresponded to frames, such as with MKV). But these buffers are
    often 200K in size, certainly enough to flush the data out of L1 cache, and
    for many CPUs, all the way out to main memory. The passes were:
    
    1) locate frame boundaries (not needed for MKV etc)
    2) copy the data into a contiguous block (not needed for MKV etc)
    3) locate the start codes within each frame
    4) unescape the data between start codes
    
    After this, the unescaped data was parsed to extract certain header fields,
    but because the unescape operation was so large, this was usually also
    effectively operating on uncached memory. Most of the unescaped data was
    simply thrown away and never processed further. Only step 2 - because it
    used memcpy - was using prefetch; the other passes were not, which made
    things even worse.
    
    This patch reorganises these steps so that, aside from the copying, the
    operations are performed in parallel, maximising cache utilisation. No more
    than the worst-case number of bytes needed for header parsing is unescaped.
    Most of the data is, in practice, only read in order to search for a start
    code, for which optimised implementations already existed in the H264 codec
    (notably the ARM version uses prefetch, so we end up doing both remaining
    passes at maximum speed). For MKV files, we know when we've found the last
    start code of interest in a given frame, so we are able to avoid doing even
    that one remaining pass for most of the buffer.
    
    In some use-cases (such as the Raspberry Pi) video decode is handled by the
    GPU, but the entire elementary stream is still fed through the parser to
    pick out certain elements of the header which are necessary to manage the
    decode process. As you might expect, in these cases, the performance of the
    parser is significant.
    
    To measure parser performance, I used the same VC-1 elementary stream in
    either an MPEG-2 transport stream or an MKV file, and fed it through avconv
    with -c:v copy -c:a copy -f null. These are the gperftools counts for
    those streams, both filtered to only include vc1_parse() and its callees,
    and unfiltered (to include the whole binary). Lower numbers are better:
    
                    Before          After
    File  Filtered  Mean   StdDev   Mean   StdDev  Confidence  Change
    M2TS  No        861.7  8.2      650.5  8.1     100.0%      +32.5%
    MKV   No        868.9  7.4      731.7  9.0     100.0%      +18.8%
    M2TS  Yes       250.0  11.2     27.2   3.4     100.0%      +817.9%
    MKV   Yes       149.0  12.8     1.7    0.8     100.0%      +8526.3%
    
    Yes, that last case shows vc1_parse() running 86 times faster! The M2TS
    case does show a larger absolute improvement though, since it was worse
    to begin with.
    
    This patch has been tested with the FATE suite (albeit on x86 for speed).
    
    Signed-off-by: Luca Barbato <lu_zero at gentoo.org>

diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
index 1bedd98..43ca0ed 100644
--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
@@ -30,117 +30,84 @@
 #include "vc1.h"
 #include "get_bits.h"
 
+/** The maximum number of bytes of a sequence, entry point or
+ *  frame header whose values we pay any attention to */
+#define UNESCAPED_THRESHOLD 37
+
+/** The maximum number of bytes of a sequence, entry point or
+ *  frame header which must be valid memory (because they are
+ *  used to update the bitstream cache in skip_bits() calls)
+ */
+#define UNESCAPED_LIMIT 144
+
+typedef enum {
+    NO_MATCH,
+    ONE_ZERO,
+    TWO_ZEROS,
+    ONE
+} VC1ParseSearchState;
+
 typedef struct {
     ParseContext pc;
     VC1Context v;
+    uint8_t prev_start_code;
+    size_t bytes_to_skip;
+    uint8_t unesc_buffer[UNESCAPED_LIMIT];
+    size_t unesc_index;
+    VC1ParseSearchState search_state;
 } VC1ParseContext;
 
-static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx,
-                                const uint8_t *buf, int buf_size)
+static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
+                               const uint8_t *buf, int buf_size)
 {
+    /* Parse the header we just finished unescaping */
     VC1ParseContext *vpc = s->priv_data;
     GetBitContext gb;
-    const uint8_t *start, *end, *next;
-    uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
-
     vpc->v.s.avctx = avctx;
     vpc->v.parse_only = 1;
-    next = buf;
-    s->repeat_pict = 0;
-
-    for(start = buf, end = buf + buf_size; next < end; start = next){
-        int buf2_size, size;
-
-        next = find_next_marker(start + 4, end);
-        size = next - start - 4;
-        buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
-        init_get_bits(&gb, buf2, buf2_size * 8);
-        if(size <= 0) continue;
-        switch(AV_RB32(start)){
-        case VC1_CODE_SEQHDR:
-            ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
-            break;
-        case VC1_CODE_ENTRYPOINT:
-            ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
-            break;
-        case VC1_CODE_FRAME:
-            if(vpc->v.profile < PROFILE_ADVANCED)
-                ff_vc1_parse_frame_header    (&vpc->v, &gb);
-            else
-                ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
-
-            /* keep AV_PICTURE_TYPE_BI internal to VC1 */
-            if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
-                s->pict_type = AV_PICTURE_TYPE_B;
-            else
-                s->pict_type = vpc->v.s.pict_type;
-
-            if (avctx->ticks_per_frame > 1){
-                // process pulldown flags
-                s->repeat_pict = 1;
-                // Pulldown flags are only valid when 'broadcast' has been set.
-                // So ticks_per_frame will be 2
-                if (vpc->v.rff){
-                    // repeat field
-                    s->repeat_pict = 2;
-                }else if (vpc->v.rptfrm){
-                    // repeat frames
-                    s->repeat_pict = vpc->v.rptfrm * 2 + 1;
-                }
-            }
-
-            if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
-                s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
-            else
-                s->field_order = AV_FIELD_PROGRESSIVE;
-
-            break;
-        }
-    }
+    init_get_bits(&gb, buf, buf_size * 8);
+    switch (vpc->prev_start_code) {
+    case VC1_CODE_SEQHDR & 0xFF:
+        ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
+        break;
+    case VC1_CODE_ENTRYPOINT & 0xFF:
+        ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
+        break;
+    case VC1_CODE_FRAME & 0xFF:
+        if(vpc->v.profile < PROFILE_ADVANCED)
+            ff_vc1_parse_frame_header    (&vpc->v, &gb);
+        else
+            ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
 
-    av_free(buf2);
-}
+        /* keep AV_PICTURE_TYPE_BI internal to VC1 */
+        if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
+            s->pict_type = AV_PICTURE_TYPE_B;
+        else
+            s->pict_type = vpc->v.s.pict_type;
 
-/**
- * Find the end of the current frame in the bitstream.
- * @return the position of the first byte of the next frame, or -1
- */
-static int vc1_find_frame_end(ParseContext *pc, const uint8_t *buf,
-                               int buf_size) {
-    int pic_found, i;
-    uint32_t state;
-
-    pic_found= pc->frame_start_found;
-    state= pc->state;
-
-    i=0;
-    if(!pic_found){
-        for(i=0; i<buf_size; i++){
-            state= (state<<8) | buf[i];
-            if(state == VC1_CODE_FRAME || state == VC1_CODE_FIELD){
-                i++;
-                pic_found=1;
-                break;
+        if (avctx->ticks_per_frame > 1){
+            // process pulldown flags
+            s->repeat_pict = 1;
+            // Pulldown flags are only valid when 'broadcast' has been set.
+            // So ticks_per_frame will be 2
+            if (vpc->v.rff){
+                // repeat field
+                s->repeat_pict = 2;
+            }else if (vpc->v.rptfrm){
+                // repeat frames
+                s->repeat_pict = vpc->v.rptfrm * 2 + 1;
             }
+        }else{
+            s->repeat_pict = 0;
         }
-    }
 
-    if(pic_found){
-        /* EOF considered as end of frame */
-        if (buf_size == 0)
-            return 0;
-        for(; i<buf_size; i++){
-            state= (state<<8) | buf[i];
-            if(IS_MARKER(state) && state != VC1_CODE_FIELD && state != VC1_CODE_SLICE){
-                pc->frame_start_found=0;
-                pc->state=-1;
-                return i-3;
-            }
-        }
+        if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
+            s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
+        else
+            s->field_order = AV_FIELD_PROGRESSIVE;
+
+        break;
     }
-    pc->frame_start_found= pic_found;
-    pc->state= state;
-    return END_NOT_FOUND;
 }
 
 static int vc1_parse(AVCodecParserContext *s,
@@ -148,22 +115,125 @@ static int vc1_parse(AVCodecParserContext *s,
                            const uint8_t **poutbuf, int *poutbuf_size,
                            const uint8_t *buf, int buf_size)
 {
+    /* Here we do the searching for frame boundaries and headers at
+     * the same time. Only a minimal amount at the start of each
+     * header is unescaped. */
     VC1ParseContext *vpc = s->priv_data;
-    int next;
+    int pic_found = vpc->pc.frame_start_found;
+    uint8_t *unesc_buffer = vpc->unesc_buffer;
+    size_t unesc_index = vpc->unesc_index;
+    VC1ParseSearchState search_state = vpc->search_state;
+    int next = END_NOT_FOUND;
+    int i = vpc->bytes_to_skip;
+
+    if (pic_found && buf_size == 0) {
+        /* EOF considered as end of frame */
+        memset(unesc_buffer + unesc_index, 0, UNESCAPED_THRESHOLD - unesc_index);
+        vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+        next = 0;
+    }
+    while (i < buf_size) {
+        int start_code_found = 0;
+        uint8_t b;
+        while (i < buf_size && unesc_index < UNESCAPED_THRESHOLD) {
+            b = buf[i++];
+            unesc_buffer[unesc_index++] = b;
+            if (search_state <= ONE_ZERO)
+                search_state = b ? NO_MATCH : search_state + 1;
+            else if (search_state == TWO_ZEROS) {
+                if (b == 1)
+                    search_state = ONE;
+                else if (b > 1) {
+                    if (b == 3)
+                        unesc_index--; // swallow emulation prevention byte
+                    search_state = NO_MATCH;
+                }
+            }
+            else { // search_state == ONE
+                // Header unescaping terminates early due to detection of next start code
+                search_state = NO_MATCH;
+                start_code_found = 1;
+                break;
+            }
+        }
+        if ((s->flags & PARSER_FLAG_COMPLETE_FRAMES) &&
+                unesc_index >= UNESCAPED_THRESHOLD &&
+                vpc->prev_start_code == (VC1_CODE_FRAME & 0xFF))
+        {
+            // No need to keep scanning the rest of the buffer for
+            // start codes if we know it contains a complete frame and
+            // we've already unescaped all we need of the frame header
+            vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+            break;
+        }
+        if (unesc_index >= UNESCAPED_THRESHOLD && !start_code_found) {
+            while (i < buf_size) {
+                if (search_state == NO_MATCH) {
+                    i += vpc->v.vc1dsp.startcode_find_candidate(buf + i, buf_size - i);
+                    if (i < buf_size) {
+                        search_state = ONE_ZERO;
+                    }
+                    i++;
+                } else {
+                    b = buf[i++];
+                    if (search_state == ONE_ZERO)
+                        search_state = b ? NO_MATCH : TWO_ZEROS;
+                    else if (search_state == TWO_ZEROS) {
+                        if (b >= 1)
+                            search_state = b == 1 ? ONE : NO_MATCH;
+                    }
+                    else { // search_state == ONE
+                        search_state = NO_MATCH;
+                        start_code_found = 1;
+                        break;
+                    }
+                }
+            }
+        }
+        if (start_code_found) {
+            vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+
+            vpc->prev_start_code = b;
+            unesc_index = 0;
+
+            if (!(s->flags & PARSER_FLAG_COMPLETE_FRAMES)) {
+                if (!pic_found && (b == (VC1_CODE_FRAME & 0xFF) || b == (VC1_CODE_FIELD & 0xFF))) {
+                    pic_found = 1;
+                }
+                else if (pic_found && b != (VC1_CODE_FIELD & 0xFF) && b != (VC1_CODE_SLICE & 0xFF)) {
+                    next = i - 4;
+                    pic_found = b == (VC1_CODE_FRAME & 0xFF);
+                    break;
+                }
+            }
+        }
+    }
 
-    if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
-        next= buf_size;
-    }else{
-        next= vc1_find_frame_end(&vpc->pc, buf, buf_size);
+    vpc->pc.frame_start_found = pic_found;
+    vpc->unesc_index = unesc_index;
+    vpc->search_state = search_state;
 
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
         if (ff_combine_frame(&vpc->pc, next, &buf, &buf_size) < 0) {
+            vpc->bytes_to_skip = 0;
             *poutbuf = NULL;
             *poutbuf_size = 0;
             return buf_size;
         }
     }
 
-    vc1_extract_headers(s, avctx, buf, buf_size);
+    /* If we return with a valid pointer to a combined frame buffer
+     * then on the next call we'll have been unhelpfully rewound
+     * by up to 4 bytes (depending upon whether the start code
+     * overlapped the input buffer, and if so by how much). We don't
+     * want this: it will either cause spurious second detections of
+     * the start code we've already seen, or cause extra bytes to be
+     * inserted at the start of the unescaped buffer. */
+    vpc->bytes_to_skip = 4;
+    if (next < 0)
+        vpc->bytes_to_skip += next;
 
     *poutbuf = buf;
     *poutbuf_size = buf_size;
@@ -194,6 +264,10 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
 {
     VC1ParseContext *vpc = s->priv_data;
     vpc->v.s.slice_context_count = 1;
+    vpc->prev_start_code = 0;
+    vpc->bytes_to_skip = 0;
+    vpc->unesc_index = 0;
+    vpc->search_state = NO_MATCH;
     return ff_vc1_init_common(&vpc->v);
 }
 

-- 
Libav/FFmpeg packaging