Package rst.audition

Audio signal processing, sometimes referred to as audio processing, is the intentional alteration of auditory signals, or sound.

This package contains data type definitions related to audio processing.

See also

Wikipedia article containing the definition above
http://en.wikipedia.org/wiki/Audio_signal_processing

Messages

digraph message_graph { fontname="Arial"; fontsize=11; stylesheet="../_static/graphs.css"; node [fontsize=11,fontname="Arial"] edge [fontsize=11,fontname="Arial"] "5" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.Utterance" TITLE="Structure rst.audition.Utterance" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/message.svg"></IMG></TD><TD ALIGN="left">Utterance</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.PhonemeCollection" TITLE="Structure rst.audition.PhonemeCollection" TARGET="_parent">PhonemeCollection</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.Utterance.phonemes" TITLE="Field rst.audition.Utterance.phonemes" TARGET="_parent" PORT="phonemes">phonemes</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk" TITLE="Structure rst.audition.SoundChunk" TARGET="_parent">SoundChunk</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.Utterance.audio" TITLE="Field rst.audition.Utterance.audio" TARGET="_parent" PORT="audio">audio</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-ASCII-STRING.html#ASCII-STRING" TITLE="Fundamental ASCII-STRING" TARGET="_parent">ASCII-STRING</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.Utterance.textual_representation" TITLE="Field rst.audition.Utterance.textual_representation" TARGET="_parent" PORT="textual_representation">textual_representation</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "6" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.PhonemeCollection" TITLE="Structure rst.audition.PhonemeCollection" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/message.svg"></IMG></TD><TD 
ALIGN="left">PhonemeCollection</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.Phoneme" TITLE="Structure rst.audition.Phoneme" TARGET="_parent">Phoneme</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.PhonemeCollection.element" TITLE="Field rst.audition.PhonemeCollection.element" TARGET="_parent" PORT="element">element</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "7" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.Phoneme" TITLE="Structure rst.audition.Phoneme" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/message.svg"></IMG></TD><TD ALIGN="left">Phoneme</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-ASCII-STRING.html#ASCII-STRING" TITLE="Fundamental ASCII-STRING" TARGET="_parent">ASCII-STRING</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.Phoneme.symbol" TITLE="Field rst.audition.Phoneme.symbol" TARGET="_parent" PORT="symbol">symbol</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-UINT32.html#UINT32" TITLE="Fundamental UINT32" TARGET="_parent">UINT32</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.Phoneme.duration" TITLE="Field rst.audition.Phoneme.duration" TARGET="_parent" PORT="duration">duration</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "1" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunkCollection" TITLE="Structure rst.audition.SoundChunkCollection" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/message.svg"></IMG></TD><TD ALIGN="left">SoundChunkCollection</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk" 
TITLE="Structure rst.audition.SoundChunk" TARGET="_parent">SoundChunk</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunkCollection.element" TITLE="Field rst.audition.SoundChunkCollection.element" TARGET="_parent" PORT="element">element</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "2" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk" TITLE="Structure rst.audition.SoundChunk" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/message.svg"></IMG></TD><TD ALIGN="left">SoundChunk</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-OCTET-VECTOR.html#OCTET-VECTOR" TITLE="Fundamental OCTET-VECTOR" TARGET="_parent">OCTET-VECTOR</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.data" TITLE="Field rst.audition.SoundChunk.data" TARGET="_parent" PORT="data">data</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-UINT32.html#UINT32" TITLE="Fundamental UINT32" TARGET="_parent">UINT32</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.sample_count" TITLE="Field rst.audition.SoundChunk.sample_count" TARGET="_parent" PORT="sample_count">sample_count</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-UINT32.html#UINT32" TITLE="Fundamental UINT32" TARGET="_parent">UINT32</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.channels" TITLE="Field rst.audition.SoundChunk.channels" TARGET="_parent" PORT="channels">channels</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-__rosetta-UINT32.html#UINT32" TITLE="Fundamental UINT32" TARGET="_parent">UINT32</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.rate" TITLE="Field rst.audition.SoundChunk.rate" 
TARGET="_parent" PORT="rate">rate</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType" TITLE="Enum rst.audition.SoundChunk.SampleType" TARGET="_parent">SampleType</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.sample_type" TITLE="Field rst.audition.SoundChunk.sample_type" TARGET="_parent" PORT="sample_type">sample_type</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.EndianNess" TITLE="Enum rst.audition.SoundChunk.EndianNess" TARGET="_parent">EndianNess</TD><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.endianness" TITLE="Field rst.audition.SoundChunk.endianness" TARGET="_parent" PORT="endianness">endianness</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "4" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.EndianNess" TITLE="Enum rst.audition.SoundChunk.EndianNess" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/enum.svg"></IMG></TD><TD ALIGN="left">EndianNess</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.EndianNess.ENDIAN_LITTLE" TITLE="Value rst.audition.SoundChunk.EndianNess.ENDIAN_LITTLE" TARGET="_parent">ENDIAN_LITTLE</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.EndianNess.ENDIAN_LITTLE" TITLE="Value rst.audition.SoundChunk.EndianNess.ENDIAN_LITTLE" TARGET="_parent">0</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.EndianNess.ENDIAN_BIG" TITLE="Value rst.audition.SoundChunk.EndianNess.ENDIAN_BIG" TARGET="_parent">ENDIAN_BIG</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.EndianNess.ENDIAN_BIG" 
TITLE="Value rst.audition.SoundChunk.EndianNess.ENDIAN_BIG" TARGET="_parent">1</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "3" [label=<<TABLE BORDER="0"><TR><TD COLSPAN="2" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType" TITLE="Enum rst.audition.SoundChunk.SampleType" TARGET="_parent"><TABLE BORDER="0"><TR><TD ALIGN="right"><IMG SRC="../_static/enum.svg"></IMG></TD><TD ALIGN="left">SampleType</TD></TR></TABLE></TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_S8" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_S8" TARGET="_parent">SAMPLE_S8</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_S8" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_S8" TARGET="_parent">0</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_U8" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_U8" TARGET="_parent">SAMPLE_U8</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_U8" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_U8" TARGET="_parent">1</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_S16" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_S16" TARGET="_parent">SAMPLE_S16</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_S16" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_S16" TARGET="_parent">2</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_U16" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_U16" TARGET="_parent">SAMPLE_U16</TD><TD ALIGN="right" 
HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_U16" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_U16" TARGET="_parent">4</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_S24" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_S24" TARGET="_parent">SAMPLE_S24</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_S24" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_S24" TARGET="_parent">8</TD></TR><TR><TD ALIGN="left" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_U24" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_U24" TARGET="_parent">SAMPLE_U24</TD><TD ALIGN="right" HREF="../generated/stable/package-rst-audition.html#rst.audition.SoundChunk.SampleType.SAMPLE_U24" TITLE="Value rst.audition.SoundChunk.SampleType.SAMPLE_U24" TARGET="_parent">16</TD></TR></TABLE>>,shape=box,style=filled,fillcolor="white"]; "5":audio -> "2" []; "5":phonemes -> "6" []; "6":element -> "7" []; "1":element -> "2" []; "2" -> "4" []; "2" -> "3" []; "2":endianness -> "4" []; "2":sample_type -> "3" []; }

clearer: should be made invisible via css

Message SoundChunkCollection

class rst.audition.SoundChunkCollection

Collection of SoundChunk instances.

Auto-generated.

element
Type:array of rst.audition.SoundChunk

The individual elements of the collection.

Constraints regarding the empty collection, sorting, duplicated entries etc. are use case specific.

Download this file

/**
 * Collection of SoundChunk instances.
 *
 * Auto-generated.
 */
message SoundChunkCollection {

    /**
     * The individual elements of the collection.
     *
     * Constraints regarding the empty collection, sorting, duplicated
     * entries etc. are use case specific; this message itself imposes
     * none.
     */
    repeated SoundChunk element = 1;

}

Message Utterance

class rst.audition.Utterance

Objects of this type represent a single utterance of speech.

The data describes a single utterance in three different forms:

  • phonemes describes the utterance as a list of phone symbols and durations (useful e.g. for lip animation).
  • audio is a SoundChunk that can be played back on audio devices, containing the realization (e.g. by a TTS system) of the included phoneme list.
  • textual_representation is a textual description of the utterance for debugging purposes.

Code author: Simon Schulz <sschulz@techfak.uni-bielefeld.de>

phonemes
Type:rst.audition.PhonemeCollection

A collection of phonemes. Will be played back in the same ordering as given by Phoneme

audio
Type:rst.audition.SoundChunk

A chunk of audio data that can be played back containing the realization (e.g. by a TTS system) of the included phoneme list

textual_representation
Type:ASCII-STRING

Textual representation of the utterance.

Download this file

/**
 * A single utterance of speech.
 *
 * The utterance is carried in three forms at once: a phoneme list
 * (phonemes), an audio chunk (audio) and a text (textual_representation).
 * All three fields are required.
 */
message Utterance {

    /**
     * A collection of phonemes. Will be played back in the same
     * ordering as given by @ref .Phoneme.
     *
     * Describes the utterance as phone symbols with durations
     * (useful e.g. for lip animation).
     */
    required PhonemeCollection phonemes = 1;

    /**
     * A chunk of audio data that can be played back containing the
     * realization (e.g. by a TTS system) of the included phoneme list.
     */
    required SoundChunk audio = 2;

    /**
     * Textual representation of the utterance.
     *
     * NOTE(review): the package documentation describes this as a
     * textual description for debugging purposes -- confirm that
     * consumers do not rely on its exact content.
     */
    required string textual_representation = 3;

}

Message SoundChunk

class rst.audition.SoundChunk

Constraint: len(.data) == 8 * .channels * .sample_count * TODO(.sample_type)

Objects of this represent a chunk of an audio stream.

The audio information for one or more channels is stored in data as a sequence of sample_count encoded samples, the encoding of which is described by endianness and sample_type.

Depending on the sample rate (rate), such a chunk of audio corresponds to a certain amount of time during which its samples have been recorded.

Interpretation of RSB timestamps:

create:
Capture time of the audio buffer. More precisely, the timestamp should correspond to the first sample contained in the buffer.

Code author: David Klotz <dklotz@techfak.uni-bielefeld.de>

@create_collection

data
Type:OCTET-VECTOR

The sequences of bytes representing the samples of this sound chunk.

The value of this field must be interpreted according to the values of the sample_count, channels, sample_type and endianness fields.

sample_count
Type:UINT32

Unit: number

The number of samples contained in data.

channels
Type:UINT32

Unit: number

The number of channels for which samples are stored in data.

rate
Type:UINT32

Unit: hz

The rate with which the samples stored in data have been recorded or should be played.

sample_type
Type:rst.audition.SoundChunk.SampleType

The data type used for the representation of samples in data.

endianness
Type:rst.audition.SoundChunk.EndianNess

The Endianness used for the representation of samples in data.

Download this file

/**
 * A chunk of an audio stream.
 *
 * The audio information for one or more channels is stored in data as
 * a sequence of sample_count encoded samples, the encoding of which is
 * described by endianness and sample_type.
 *
 * Depending on the sample rate (rate), such a chunk of audio
 * corresponds to a certain amount of time during which its samples
 * have been recorded.
 *
 * NOTE(review): the documented constraint relating the length of data
 * to channels, sample_count and sample_type still contains a TODO for
 * the sample_type factor -- confirm the intended formula before
 * relying on it.
 */
message SoundChunk {

    /**
     * The possible data types for representing individual samples.
     *
     * The numeric values are not consecutive (0, 1, 2, 4, 8, 16).
     * They identify the values on the wire, so they must not be
     * renumbered.
     */
    enum SampleType {

        /**
         * Signed 8-bit samples.
         */
        SAMPLE_S8 = 0;

        /**
         * Unsigned 8-bit samples.
         */
        SAMPLE_U8 = 1;

        /**
         * Signed 16-bit samples.
         */
        SAMPLE_S16 = 2;

        /**
         * Unsigned 16-bit samples.
         */
        SAMPLE_U16 = 4;

        /**
         * Signed 24-bit samples.
         */
        SAMPLE_S24 = 8;

        /**
         * Unsigned 24-bit samples.
         */
        SAMPLE_U24 = 16;

    }

    /**
     * The possible byte-orders for representing samples.
     */
    enum EndianNess {

        /**
         * Samples are represented with little Endian byte-order.
         */
        ENDIAN_LITTLE = 0;

        /**
         * Samples are represented with big Endian byte-order.
         */
        ENDIAN_BIG = 1;
    }

    /**
     * The sequences of bytes representing the samples of this sound
     * chunk.
     *
     * The value of this field must be interpreted according to the
     * values of the @ref .sample_count, @ref .channels, @ref
     * .sample_type and @ref .endianness fields.
     */
    required bytes data = 1;

    /**
     * The number of samples contained in @ref .data.
     */
    // @unit(number)
    required uint32 sample_count = 2;

    /**
     * The number of channels for which samples are stored in @ref
     * .data.
     *
     * Defaults to 1 when not set.
     */
    // @unit(number)
    optional uint32 channels = 3 [default = 1];

    /**
     * The rate with which the samples stored in @ref .data have been
     * recorded or should be played.
     *
     * Defaults to 44100 when not set.
     */
    // @unit(hz)
    optional uint32 rate = 4 [default = 44100];

    /**
     * The data type used for the representation of samples in @ref
     * .data.
     *
     * Defaults to SAMPLE_S16 when not set.
     */
    optional SampleType sample_type = 5 [default = SAMPLE_S16];

    /**
     * The Endianness used for the representation of samples in @ref
     * .data.
     *
     * Defaults to ENDIAN_LITTLE when not set.
     */
    optional EndianNess endianness = 6 [default = ENDIAN_LITTLE];

    // TODO: interleaving type? (how samples of multiple channels are
    // laid out within data is not specified by this message)

}

Enum SampleType

class rst.audition.SoundChunk.SampleType

The possible data types for representing individual samples.

SAMPLE_S8
= 0

Signed 8-bit samples.

SAMPLE_U8
= 1

Unsigned 8-bit samples.

SAMPLE_S16
= 2

Signed 16-bit samples.

SAMPLE_U16
= 4

Unsigned 16-bit samples.

SAMPLE_S24
= 8

Signed 24-bit samples.

SAMPLE_U24
= 16

Unsigned 24-bit samples.

Download this file

    /**
     * The possible data types for representing individual samples.
     *
     * The numeric values are not consecutive (0, 1, 2, 4, 8, 16).
     */
    enum SampleType {

        /**
         * Signed 8-bit samples.
         */
        SAMPLE_S8 = 0;

        /**
         * Unsigned 8-bit samples.
         */
        SAMPLE_U8 = 1;

        /**
         * Signed 16-bit samples.
         */
        SAMPLE_S16 = 2;

        /**
         * Unsigned 16-bit samples.
         */
        SAMPLE_U16 = 4;

        /**
         * Signed 24-bit samples.
         */
        SAMPLE_S24 = 8;

        /**
         * Unsigned 24-bit samples.
         */
        SAMPLE_U24 = 16;

    }

Enum EndianNess

class rst.audition.SoundChunk.EndianNess

The possible byte-orders for representing samples.

ENDIAN_LITTLE
= 0

Samples are represented with little Endian byte-order.

ENDIAN_BIG
= 1

Samples are represented with big Endian byte-order.

Download this file

    /**
     * The possible byte-orders for representing samples.
     */
    enum EndianNess {

        /**
         * Samples are represented with little-endian byte order.
         */
        ENDIAN_LITTLE = 0;

        /**
         * Samples are represented with big-endian byte order.
         */
        ENDIAN_BIG = 1;
    }

Message PhonemeCollection

class rst.audition.PhonemeCollection

Collection of Phoneme instances.

Auto-generated.

element
Type:array of rst.audition.Phoneme

The individual elements of the collection.

Constraints regarding the empty collection, sorting, duplicated entries etc. are use case specific.

Download this file

/**
 * Collection of Phoneme instances.
 *
 * Auto-generated.
 */
message PhonemeCollection {

    /**
     * The individual elements of the collection.
     *
     * Constraints regarding the empty collection, sorting, duplicated
     * entries etc. are use case specific; this message itself imposes
     * none.
     */
    repeated Phoneme element = 1;

}

Message Phoneme

class rst.audition.Phoneme

Objects of this represent a single phoneme-duration pair.

A list of elements of this type can be used to describe words or whole sentences of speech.

Code author: Simon Schulz <sschulz@techfak.uni-bielefeld.de>

@create_collection

symbol
Type:ASCII-STRING

A single phone symbol (such as aI, E, C, R, _, ...).

For examples, see https://en.wikipedia.org/wiki/Phoneme
or http://www.phon.ucl.ac.uk/home/sampa/german.htm (German).
duration
Type:UINT32

Unit: millisecond

The duration of this symbol.

Download this file

/**
 * A single phoneme-duration pair.
 *
 * A list of elements of this type can be used to describe words or
 * whole sentences of speech.
 */
message Phoneme {

    /**
     * A single phone symbol (such as aI, E, C, R, _, ...).
     *
     * For examples, see https://en.wikipedia.org/wiki/Phoneme
     * or http://www.phon.ucl.ac.uk/home/sampa/german.htm (German).
     */
    required string symbol = 1;

    /**
     * The duration of this symbol, in milliseconds.
     */
    // @unit(millisecond)
    required uint32 duration = 2;

}