Module audioopy.ipus

Class SearchForIPUs

Description

An automatic system for segmenting a signal into silences and sounding segments.

This segmentation aims at finding IPUs - Inter-Pausal Units, also called sounding segments, in speech. IPUs are blocks of speech bounded by silent pauses of more than X ms, and time-aligned on the speech signal.

See the following reference publication:

Brigitte Bigi, Béatrice Priego-Valverde (2022). The automatic search for sounding segments of SPPAS: application to Cheese! corpus. Human Language Technology. Challenges for Computer Science and Linguistics, LNAI, LNCS 13212, pp. 16-27. https://hal.archives-ouvertes.fr/hal-03697808

Constructor

Create a new SearchForIPUs instance.

The class is particularly useful for identifying segments of speech bounded by silent pauses.

Fields:

_win_len (inherited): Window length for RMS estimation.
_vagueness (inherited): Window length to refine the silence boundaries estimation.
_channel (inherited): Channel instance to use.
_min_sil_dur: Minimum duration for a silence.
_min_ipu_dur: Minimum duration for an IPU.
_vol_threshold: Volume threshold for silence detection.
_auto_threshold: Automatically estimated volume threshold for silence detection.
_shift_start: Start shift value.
_shift_end: End shift value.

Parameters

channel: (Channel) Channel instance to use.
win_len: (float) Window length for RMS estimation. Defaults to 0.02.

View Source

def __init__(self, channel: Channel, win_len: float=0.02):
    """Create a new SearchForIPUs instance.

    The parent class is initialized with the channel, the window length
    and a third value equal to a quarter of the window length (presumably
    the vagueness of the parent class -- to be confirmed against it).
    Every search option then starts at its class-level default.

    Fields:

    - _win_len (inherited): Window length for RMS estimation.
    - _vagueness (inherited): Window length to refine the silence boundaries estimation.
    - _channel (inherited): Channel instance to use.
    - _min_sil_dur: Minimum duration for a silence.
    - _min_ipu_dur: Minimum duration for an IPU.
    - _vol_threshold: Volume threshold for silence detection.
    - _auto_threshold: Automatically estimated volume threshold for silence detection.
    - _shift_start: Start shift value.
    - _shift_end: End shift value.

    :param channel: (Channel) Channel instance to use.
    :param win_len: (float) Window length for RMS estimation.

    """
    super(SearchForIPUs, self).__init__(channel, win_len, win_len / 4.0)

    # All the options below are read from the class-level defaults.
    defaults = SearchForIPUs

    # Minimum durations of a silence and of an IPU.
    self._min_sil_dur = defaults.DEFAULT_MIN_SIL_DUR
    self._min_ipu_dur = defaults.DEFAULT_MIN_IPU_DUR

    # Until a search is performed, the automatically estimated threshold
    # is the same default as the requested one.
    self._vol_threshold = defaults.DEFAULT_VOL_THRESHOLD
    self._auto_threshold = defaults.DEFAULT_VOL_THRESHOLD

    # Shifts applied to the boundaries of the tracks.
    self._shift_start = defaults.DEFAULT_SHIFT_START
    self._shift_end = defaults.DEFAULT_SHIFT_END

Public functions

get_track_data

Return the audio data of tracks.

Parameters

tracks: List of tracks. A track is a tuple(start, end).

Returns

List of audio data

View Source

def get_track_data(self, tracks: list) -> list:
    """Return the audio data of tracks.

    The frames of every track are read immediately and collected, so the
    result really is a list: it can be indexed, measured with len() and
    iterated several times. It is empty when no channel is set.

    :param tracks: List of tracks. A track is a tuple (start, end).
    :raises: TypeError: Invalid given tracks
    :raises: ValueError: Invalid frame position
    :return: List of audio data, one item per track

    """
    # The private helper is a generator: materialize it so that the
    # returned value matches the documented and annotated list type.
    return list(self.__track_data(tracks))

get_vol_threshold

Return the initial volume threshold used to search for silences.

View Source

def get_vol_threshold(self) -> int:
    """Return the volume threshold initially requested to search for silences.

    :return: (int) The value stored in the _vol_threshold member.

    """
    requested = self._vol_threshold
    return requested

get_effective_threshold

Return the threshold volume estimated automatically to search for silences.

View Source

def get_effective_threshold(self) -> int:
    """Return the automatically estimated volume threshold used to search for silences.

    :return: (int) The value stored in the _auto_threshold member.

    """
    estimated = self._auto_threshold
    return estimated

get_min_sil_dur

Return the minimum duration of a silence.

View Source

def get_min_sil_dur(self) -> float:
    """Return the minimum duration required for a silence.

    :return: (float) The value stored in the _min_sil_dur member.

    """
    duration = self._min_sil_dur
    return duration

get_min_ipu_dur

Return the minimum duration of a track.

View Source

def get_min_ipu_dur(self) -> float:
    """Return the minimum duration required for a track (an IPU).

    :return: (float) The value stored in the _min_ipu_dur member.

    """
    duration = self._min_ipu_dur
    return duration

get_shift_start

View Source

def get_shift_start(self) -> float:
    """Return the shift value of the start boundary.

    :return: (float) The value stored in the _shift_start member.

    """
    shift = self._shift_start
    return shift

get_shift_end

View Source

def get_shift_end(self) -> float:
    """Return the shift value of the end boundary.

    :return: (float) The value stored in the _shift_end member.

    """
    shift = self._shift_end
    return shift

set_vol_threshold

Fix the default minimum volume value to find silences.

It won't affect the current list of silence values. Use search_sil().

Parameters

vol_threshold: (int) RMS value

View Source

def set_vol_threshold(self, vol_threshold: int) -> None:
    """Set the volume threshold requested to find silences.

    Only the stored threshold is changed: nothing is searched again here,
    so an already established list of silences is left untouched.

    :param vol_threshold: (int) RMS value. It is converted with int();
        a negative value selects the class default threshold instead.

    """
    requested = int(vol_threshold)
    self._vol_threshold = (
        requested if requested >= 0 else SearchForIPUs.DEFAULT_VOL_THRESHOLD
    )

set_min_sil

Fix the default minimum duration of a silence.

Parameters

min_sil_dur: (float) Duration in seconds.

Raises

ValueError: Invalid given min_sil_dur value.

View Source

def set_min_sil(self, min_sil_dur: float) -> None:
    """Set the minimum duration of a silence.

    The given value is converted to float, and it is never stored below
    the class-level lower bound MIN_SIL_DUR.

    :param min_sil_dur: (float) Duration in seconds.
    :raises: ValueError: The given min_sil_dur can't be converted to float.

    """
    duration = float(min_sil_dur)
    lower_bound = SearchForIPUs.MIN_SIL_DUR
    # Keep the requested duration unless the lower bound is strictly greater.
    self._min_sil_dur = lower_bound if lower_bound > duration else duration

set_min_ipu

Fix the default minimum duration of an IPU.

Parameters

min_ipu_dur: (float) Duration in seconds.

Raises

ValueError: Invalid given min_ipu_dur value.

View Source

def set_min_ipu(self, min_ipu_dur: float) -> None:
    """Set the minimum duration of an IPU.

    The given value is converted to float, and it is never stored below
    the class-level lower bound MIN_IPU_DUR.

    :param min_ipu_dur: (float) Duration in seconds.
    :raises: ValueError: The given min_ipu_dur can't be converted to float.

    """
    self._min_ipu_dur = max(float(min_ipu_dur), SearchForIPUs.MIN_IPU_DUR)

set_shift_start

Fix the default minimum boundary shift value.

Parameters

s: (float) Duration in seconds.

Raises

ValueError: Invalid given s value.

View Source

def set_shift_start(self, s: float) -> None:
    """Set the shift value of the start boundary.

    The value is stored only if it is strictly between the opposite of the
    minimum IPU duration and the minimum silence duration. Any other
    value is silently ignored and the current shift is kept.

    :param s: (float) Duration in seconds.
    :raises: ValueError: The given s can't be converted to float.

    """
    shift = float(s)
    within_bounds = shift > -self._min_ipu_dur and shift < self._min_sil_dur
    if not within_bounds:
        return
    self._shift_start = shift

set_shift_end

Fix the default minimum boundary shift value.

Parameters

s: (float) Duration in seconds.

View Source

def set_shift_end(self, s: float) -> None:
    """Set the shift value of the end boundary.

    The value is stored only if it is strictly between the opposite of the
    minimum IPU duration and the minimum silence duration. Any other
    value is silently ignored and the current shift is kept.

    :param s: (float) Duration in seconds.

    """
    shift = float(s)
    within_bounds = shift > -self._min_ipu_dur and shift < self._min_sil_dur
    if not within_bounds:
        return
    self._shift_end = shift

min_channel_duration

Return the minimum duration we expect for a channel.

View Source

def min_channel_duration(self) -> float:
    """Return the minimum duration we expect for a channel.

    It is the longest of the minimum silence duration and the minimum IPU
    duration, to which both boundary shifts are added.

    :return: (float) Duration, in the same unit as the stored durations.

    """
    silence, ipu = self._min_sil_dur, self._min_ipu_dur
    longest = ipu if ipu > silence else silence
    return longest + self._shift_start + self._shift_end

get_rms_stats

Return min, max, mean, median, coefficient of variation of the RMS.

View Source

def get_rms_stats(self) -> list:
    """Return five statistics about the RMS values.

    The values are taken from the object returned by get_volstats().
    The last item is the result of its coefvariation() method --
    presumably the coefficient of variation, not the standard deviation.

    :return: (list) [min, max, mean, median, coefvariation]

    """
    stats = self.get_volstats()
    lowest = stats.min()
    highest = stats.max()
    average = stats.mean()
    middle = stats.median()
    variation = stats.coefvariation()
    return [lowest, highest, average, middle, variation]

get_tracks

Return a list of tuples (from,to) of tracks.

(from,to) values are converted, or not, into the time-domain.

The tracks are found from the current list of silences, which is first filtered with the min_sil_dur.

Using this method requires the following members to be fixed:

- the volume threshold,
- the minimum duration for a silence,
- the minimum duration for a track,
- the duration to remove to the start boundary,
- the duration to add to the end boundary.

Parameters

time_domain: (bool) Convert from/to values in seconds

Returns

(list of tuples) with (from,to) of the tracks

View Source

def get_tracks(self, time_domain: bool=False) -> list:
    """Return a list of tuples (from,to) of tracks.

    Silences are searched with the requested volume threshold, and the
    threshold returned by that search is stored as the effective one.
    The silences are then filtered with half of that threshold and with
    the minimum silence duration extended by both shift values. Finally,
    the tracks are extracted with the minimum IPU duration and the shifts.

    Using this method requires the following members to be fixed:
        - the volume threshold,
        - the minimum duration for a silence,
        - the minimum duration for a track,
        - the duration to remove to the start boundary,
        - the duration to add to the end boundary.

    :param time_domain: (bool) Convert from/to values in seconds. The
        conversion is done only when the value is exactly True.
    :return: (list of tuples) with (from,to) of the tracks

    """
    # Search for the silences and remember the threshold actually used.
    self._auto_threshold = self.search_silences(self._vol_threshold)

    # Filter the silences that were found.
    half_threshold = self._auto_threshold // 2
    min_silence = self._min_sil_dur + self._shift_start + self._shift_end
    self.filter_silences(half_threshold, min_silence)

    # What remains between the silences are the tracks.
    tracks = self.extract_tracks(self._min_ipu_dur, self._shift_start, self._shift_end)
    if time_domain is not True:
        return tracks

    # Frame positions divided by the framerate give time values.
    return [
        (
            float(start) / float(self._channel.get_framerate()),
            float(end) / float(self._channel.get_framerate()),
        )
        for start, end in tracks
    ]

Protected functions

__track_data

Yield the tracks data: a set of frames for each track.

Parameters

tracks: (list of tuples) List of (from_pos,to_pos)

Raises

TypeError: Invalid given tracks
ValueError: Invalid frame position

View Source

def __track_data(self, tracks: list):
    """Yield the tracks data: a set of frames for each track.

    This is a generator: the tracks are neither validated nor read until
    the first item is requested. Nothing is yielded if no channel is set.

    :param tracks: (list of tuples) List of (from_pos,to_pos)
    :raises: TypeError: Invalid given tracks
    :raises: ValueError: Invalid frame position
    :return: A generator yielding the frames of each track, in order

    """
    if self._channel is None:
        return

    # Validate all the tracks before reading anything from the channel.
    for v in tracks:
        try:
            valid = len(v) == 2
            if valid:
                # Both positions must be convertible to int.
                int(v[0])
                int(v[1])
        except Exception:
            # Any failure of len(), indexing or int() means the track is
            # malformed. KeyboardInterrupt and SystemExit are deliberately
            # not intercepted.
            valid = False
        if not valid:
            raise TypeError('Expected a list of 2 int values, got {} instead'.format(v))

    nframes = self._channel.get_nframes()
    for from_pos, to_pos in tracks:
        if nframes < from_pos:
            # A start position at most 10 frames after the end is tolerated
            # and moved back to the end; farther positions are rejected.
            if nframes < from_pos - 10:
                raise ValueError('Position %d not in range(%d)' % (from_pos, nframes))
            from_pos = nframes
        self._channel.seek(from_pos)
        yield self._channel.get_frames(to_pos - from_pos)