Source: lib/text/mp4_vtt_parser.js

/** @license
 * Copyright 2016 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

goog.provide('shaka.text.Mp4VttParser');

goog.require('goog.asserts');
goog.require('shaka.log');
goog.require('shaka.text.Cue');
goog.require('shaka.text.TextEngine');
goog.require('shaka.text.VttTextParser');
goog.require('shaka.util.DataViewReader');
goog.require('shaka.util.Error');
goog.require('shaka.util.Functional');
goog.require('shaka.util.Iterables');
goog.require('shaka.util.Mp4Parser');
goog.require('shaka.util.StringUtils');
goog.require('shaka.util.TextParser');


/**
 * Parses WebVTT cues that are carried inside MP4 ("wvtt") text streams.
 *
 * parseInit() must be called with the init segment first so that the media
 * timescale can be read from its MDHD box.  parseMedia() then turns the vttc
 * samples of each media segment into cues.
 *
 * @implements {shaka.extern.TextParser}
 * @export
 */
shaka.text.Mp4VttParser = class {
  constructor() {
    /**
     * The timescale read from the MDHD box of the init segment.  Sample times
     * are divided by this value before being added to the period start.
     * Null until parseInit() has successfully found an MDHD box.
     *
     * @type {?number}
     * @private
     */
    this.timescale_ = null;
  }

  /**
   * Reads the init segment: records the timescale from the MDHD box and
   * verifies that the sample description contains a WVTT entry.
   *
   * Throws a CRITICAL TEXT shaka.util.Error with code INVALID_MP4_VTT if no
   * (non-zero) timescale or no WVTT box was found.
   *
   * @override
   * @export
   */
  parseInit(data) {
    const Mp4Parser = shaka.util.Mp4Parser;

    let sawWVTT = false;

    new Mp4Parser()
        .box('moov', Mp4Parser.children)
        .box('trak', Mp4Parser.children)
        .box('mdia', Mp4Parser.children)
        .fullBox('mdhd', (box) => {
          goog.asserts.assert(
              box.version == 0 || box.version == 1,
              'MDHD version can only be 0 or 1');
          // Version 0 stores the times and the duration as 32-bit values;
          // version 1 stores them as 64-bit values.  The timescale is 32-bit
          // in both versions.
          if (box.version == 0) {
            box.reader.skip(4); // Skip "creation_time".
            box.reader.skip(4); // Skip "modification_time".
            this.timescale_ = box.reader.readUint32();
            box.reader.skip(4); // Skip "duration".
          } else {
            box.reader.skip(8); // Skip "creation_time".
            box.reader.skip(8); // Skip "modification_time".
            this.timescale_ = box.reader.readUint32();
            box.reader.skip(8); // Skip "duration".
          }
          box.reader.skip(4); // Skip "pad", "language", and "pre-defined".
        })
        .box('minf', Mp4Parser.children)
        .box('stbl', Mp4Parser.children)
        .fullBox('stsd', Mp4Parser.sampleDescription)
        .box('wvtt', (box) => {
          // A valid vtt init segment, though we have no actual subtitles yet.
          sawWVTT = true;
        }).parse(data);

    if (!this.timescale_) {
      // Missing timescale for VTT content. It should be located in the MDHD.
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_VTT);
    }

    if (!sawWVTT) {
      // A WVTT box should have been seen (a valid vtt init segment with no
      // actual subtitles).
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_VTT);
    }
  }

  /**
   * Parses one media segment into cues.
   *
   * The TFDT box supplies the base time, the TFHD box an optional default
   * sample duration, the TRUN box the per-sample timing and sizes, and the
   * MDAT box the vttc/vtte sample boxes themselves.  Cue times are
   * time.periodStart plus the sample time divided by the timescale.
   *
   * Throws a CRITICAL TEXT shaka.util.Error with code INVALID_MP4_VTT if no
   * init segment has been parsed, if any of the TFDT, TRUN or MDAT boxes is
   * missing, or if a sample box declares a size smaller than its own header.
   *
   * @override
   * @export
   */
  parseMedia(data, time) {
    if (!this.timescale_) {
      // Missing timescale for VTT content. We should have seen the init
      // segment.
      shaka.log.error('No init segment for MP4+VTT!');
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_VTT);
    }

    const Mp4VttParser = shaka.text.Mp4VttParser;
    const Mp4Parser = shaka.util.Mp4Parser;

    let baseTime = 0;
    /** @type {!Array.<shaka.text.Mp4VttParser.TimeSegment>} */
    let presentations = [];
    /** @type {!Uint8Array} */
    let rawPayload;
    /** @type {!Array.<shaka.text.Cue>} */
    const cues = [];

    let sawTFDT = false;
    let sawTRUN = false;
    let sawMDAT = false;
    let defaultDuration = null;

    const parser = new Mp4Parser()
        .box('moof', Mp4Parser.children)
        .box('traf', Mp4Parser.children)
        .fullBox('tfdt', (box) => {
          sawTFDT = true;
          goog.asserts.assert(
              box.version == 0 || box.version == 1,
              'TFDT version can only be 0 or 1');
          // The base time is 32-bit in version 0 and 64-bit in version 1.
          baseTime = (box.version == 0) ? box.reader.readUint32() :
                                          box.reader.readUint64();
        })
        .fullBox('tfhd', (box) => {
          goog.asserts.assert(
              box.flags != null,
              'A TFHD box should have a valid flags value');
          defaultDuration = Mp4VttParser.parseTFHD_(box.flags, box.reader);
        })
        .fullBox('trun', (box) => {
          sawTRUN = true;
          goog.asserts.assert(
              box.version != null,
              'A TRUN box should have a valid version value');
          goog.asserts.assert(
              box.flags != null,
              'A TRUN box should have a valid flags value');
          presentations =
              Mp4VttParser.parseTRUN_(box.version, box.flags, box.reader);
        })
        .box('mdat', Mp4Parser.allData((data) => {
          goog.asserts.assert(
              !sawMDAT,
              'VTT cues in mp4 with multiple MDAT are not currently supported');
          sawMDAT = true;
          rawPayload = data;
        }));
    parser.parse(data, /* partialOkay= */ false);

    // Every one of these boxes is needed below: without MDAT there is no
    // payload to read, without TRUN there are no samples to walk, and without
    // TFDT the base time is unknown.  Reject the segment if ANY is missing.
    if (!sawMDAT || !sawTFDT || !sawTRUN) {
      // A required box is missing.
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_VTT);
    }

    let currentTime = baseTime;

    /** @type {!shaka.util.DataViewReader} */
    const reader = new shaka.util.DataViewReader(
        rawPayload, shaka.util.DataViewReader.Endianness.BIG_ENDIAN);

    for (const presentation of presentations) {
      // If one presentation corresponds to multiple payloads, it is assumed
      // that all of those payloads have the same start time and duration.
      const duration = presentation.duration || defaultDuration;
      // Without an explicit time offset, the sample starts where the previous
      // one ended.
      const startTime = presentation.timeOffset ?
                      baseTime + presentation.timeOffset :
                      currentTime;
      // From here on, currentTime is the end time of this presentation.
      currentTime = startTime + (duration || 0);

      // Read samples until it adds up to the given size.
      let totalSize = 0;
      do {
        // Read the payload size.
        const payloadSize = reader.readUint32();
        if (payloadSize < 8) {
          // The size covers the 8-byte header (size + type) that is being
          // read right now, so anything smaller is malformed.  Reject it
          // instead of passing a negative length to reader.skip() below.
          shaka.log.error(
              'Invalid box size ' + payloadSize + ' in WVTT sample!');
          throw new shaka.util.Error(
              shaka.util.Error.Severity.CRITICAL,
              shaka.util.Error.Category.TEXT,
              shaka.util.Error.Code.INVALID_MP4_VTT);
        }
        totalSize += payloadSize;

        // Read the type to decide how to treat the rest of the box.
        const payloadType = reader.readUint32();
        const payloadName = shaka.util.Mp4Parser.typeToString(payloadType);

        // Read the data payload.
        /** @type {Uint8Array} */
        let payload = null;
        if (payloadName == 'vttc') {
          if (payloadSize > 8) {
            payload = reader.readBytes(payloadSize - 8);
          }
        } else if (payloadName == 'vtte') {
          // It's a vtte, which is a vtt cue that is empty. Ignore any data that
          // does exist.
          reader.skip(payloadSize - 8);
        } else {
          shaka.log.error('Unknown box ' + payloadName + '! Skipping!');
          reader.skip(payloadSize - 8);
        }

        if (duration) {
          if (payload) {
            goog.asserts.assert(
                this.timescale_ != null, 'Timescale should not be null!');
            // parseVTTC_ returns null when the vttc box has no text payload;
            // such entries are filtered out before returning.
            const cue = shaka.text.Mp4VttParser.parseVTTC_(
                payload,
                time.periodStart + startTime / this.timescale_,
                time.periodStart + currentTime / this.timescale_);
            cues.push(cue);
          }
        } else {
          shaka.log.error(
              'WVTT sample duration unknown, and no default found!');
        }

        goog.asserts.assert(
            !presentation.sampleSize || totalSize <= presentation.sampleSize,
            'The samples do not fit evenly into the sample sizes given in ' +
            'the TRUN box!');

        // If no sampleSize was specified, it's assumed that this presentation
        // corresponds to only a single cue.
      } while (presentation.sampleSize &&
               (totalSize < presentation.sampleSize));
    }

    goog.asserts.assert(
        !reader.hasMoreData(),
        'MDAT which contain VTT cues and non-VTT data are not currently ' +
        'supported!');

    return /** @type {!Array.<!shaka.extern.Cue>} */ (
      cues.filter(shaka.util.Functional.isNotNull));
  }

  /**
   * Reads the optional "default_sample_duration" field of a TFHD box.
   *
   * The reader must be positioned just after the box's version and flags.
   * The flags say which optional fields precede the duration and therefore
   * have to be skipped.
   *
   * @param {number} flags
   * @param {!shaka.util.DataViewReader} reader
   * @return {?number} The default_sample_duration field, if present.
   * @private
   */
  static parseTFHD_(flags, reader) {
    // Skip "track_ID".
    reader.skip(4);

    // Skip "base_data_offset" if present.
    if (flags & 0x000001) {
      reader.skip(8);
    }

    // Skip "sample_description_index" if present.
    if (flags & 0x000002) {
      reader.skip(4);
    }

    // Read and return "default_sample_duration" if present.
    if (flags & 0x000008) {
      return reader.readUint32();
    }

    // There is no "default_sample_duration".
    return null;
  }

  /**
   * Reads the per-sample table of a TRUN box.
   *
   * The reader must be positioned just after the box's version and flags.
   * Each returned entry only has the fields that the flags mark as present;
   * the others stay null.
   *
   * @param {number} version
   * @param {number} flags
   * @param {!shaka.util.DataViewReader} reader
   * @return {!Array.<shaka.text.Mp4VttParser.TimeSegment>}
   * @private
   */
  static parseTRUN_(version, flags, reader) {
    const sampleCount = reader.readUint32();

    // Skip "data_offset" if present.
    if (flags & 0x000001) {
      reader.skip(4);
    }

    // Skip "first_sample_flags" if present.
    if (flags & 0x000004) {
      reader.skip(4);
    }

    const samples = [];

    for (const _ of shaka.util.Iterables.range(sampleCount)) {
      // The loop variable is only used to count iterations.
      shaka.util.Functional.ignored(_);
      /** @type {shaka.text.Mp4VttParser.TimeSegment} */
      const sample = {
        duration: null,
        sampleSize: null,
        timeOffset: null,
      };

      // Read "sample duration" if present.
      if (flags & 0x000100) {
        sample.duration = reader.readUint32();
      }

      // Read "sample_size" if present.
      if (flags & 0x000200) {
        sample.sampleSize = reader.readUint32();
      }

      // Skip "sample_flags" if present.
      if (flags & 0x000400) {
        reader.skip(4);
      }

      // Read "sample_time_offset" if present.  It is read as unsigned in
      // version 0 and as signed in later versions.
      if (flags & 0x000800) {
        sample.timeOffset = version == 0 ?
            reader.readUint32() :
            reader.readInt32();
      }

      samples.push(sample);
    }

    return samples;
  }

  /**
   * Parses a vttc box into a cue.
   *
   * The box's children are read as UTF-8 strings: "payl" is the cue text,
   * "iden" the cue id and "sttg" the cue settings.
   *
   * @param {!Uint8Array} data The contents of the vttc box, without its
   *   header.
   * @param {number} startTime
   * @param {number} endTime
   * @return {shaka.text.Cue} The cue, or null if the box has no (non-empty)
   *   "payl" text.
   * @private
   */
  static parseVTTC_(data, startTime, endTime) {
    let payload;
    let id;
    let settings;

    new shaka.util.Mp4Parser()
        .box('payl', shaka.util.Mp4Parser.allData((data) => {
          payload = shaka.util.StringUtils.fromUTF8(data);
        }))
        .box('iden', shaka.util.Mp4Parser.allData((data) => {
          id = shaka.util.StringUtils.fromUTF8(data);
        }))
        .box('sttg', shaka.util.Mp4Parser.allData((data) => {
          settings = shaka.util.StringUtils.fromUTF8(data);
        }))
        .parse(data);

    if (payload) {
      return shaka.text.Mp4VttParser.assembleCue_(
          payload, id, settings, startTime, endTime);
    } else {
      return null;
    }
  }

  /**
   * Take the individual components that make a cue and create a vttc cue.
   *
   * The settings string is split into words, and each word is applied to the
   * cue through VttTextParser.parseCueSetting.  Words that are rejected are
   * logged and otherwise ignored.
   *
   * @param {string} payload
   * @param {?string} id
   * @param {?string} settings
   * @param {number} startTime
   * @param {number} endTime
   * @return {!shaka.text.Cue}
   * @private
   */
  static assembleCue_(payload, id, settings, startTime, endTime) {
    const cue = new shaka.text.Cue(startTime, endTime, payload);

    if (id) {
      cue.id = id;
    }

    if (settings) {
      const parser = new shaka.util.TextParser(settings);

      let word = parser.readWord();

      while (word) {
        // TODO: Check WebVTTConfigurationBox for region info.
        if (!shaka.text.VttTextParser.parseCueSetting(
            cue, word, /* VTTRegions= */[])) {
          shaka.log.warning(
              'VTT parser encountered an invalid VTT setting: ', word,
              ' The setting will be ignored.');
        }

        parser.skipWhitespace();
        word = parser.readWord();
      }
    }

    return cue;
  }
};

/**
 * Timing and size information for one sample listed in a TRUN box.  Fields
 * that the TRUN box does not carry are null.
 *
 * @typedef {{
 *    duration: ?number,
 *    sampleSize: ?number,
 *    timeOffset: ?number
 *  }}
 *
 * @property {?number} duration
 *    The length of the segment in timescale units.
 * @property {?number} sampleSize
 *    The size of the segment in bytes.
 * @property {?number} timeOffset
 *    The time since the start of the segment in timescale units. Time
 *    offset is based on the start of the segment. If this value is
 *    missing, the accumulated durations preceding this time segment will
 *    be used to create the start time.
 */
shaka.text.Mp4VttParser.TimeSegment;

// Register a factory with the TextEngine for the MP4-embedded WebVTT MIME
// type.  Each call of the factory creates a new parser instance.
shaka.text.TextEngine.registerParser(
    'application/mp4; codecs="wvtt"', () => new shaka.text.Mp4VttParser());