Source code for bt4vt.groups

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Created on 01-05-2022
# @author: wiebket, AnnaLesch

import itertools
import numpy as np


def split_scores_by_speaker_groups(scores, speaker_metadata, speaker_groups, id_delimiter):
    """ Construction of a dictionary that holds the (label, score) pairs for the speaker groups as defined in the config file and their corresponding subgroups.

    The result is keyed first by the group names joined with "_" (e.g. "gender_nationality") and then by the
    subgroup values joined with "_" (e.g. "m_India"). Subgroup values are converted with ``str`` before joining,
    so non-string metadata values (e.g. integer ages) are supported.

    Note that a ``ref_id`` column is added to ``scores`` in place.

    :param scores: DataFrame that contains reference and test utterances and corresponding labels and scores. Must provide the columns ``ref``, ``label`` and ``score``
    :type scores: DataFrame
    :param speaker_metadata: DataFrame that contains speaker metadata with speaker ids (column ``id``) and speaker groups attributes as specified in config file
    :type speaker_metadata: DataFrame
    :param speaker_groups: List of speaker groups as specified in config file; each group is itself a list of metadata column names
    :type speaker_groups: list
    :param id_delimiter: If not specified in config file, default is "/"
    :type id_delimiter: string

    :returns: scores_by_speaker_groups, mapping each group key to a dict that maps each subgroup key to a numpy record array with the fields ``label`` and ``score``, or to ``[(np.nan, np.nan)]`` if the subgroup combination has no speakers in the metadata or no matching scores
    :rtype: dict

    """

    scores_by_speaker_groups = dict()

    # create id column for scores, first split by dot to get rid of .wav, then by id_delimiter
    # (this intentionally keeps writing the column onto the caller's DataFrame, as before)
    scores['ref_id'] = scores['ref'].apply(lambda x: x.split(".")[0].split(id_delimiter)[0])

    no_scores_placeholder = (np.nan, np.nan)

    for group in speaker_groups:
        # dict keeps the order of the group names and collapses duplicated names
        subgroup_per_group = {
            group_name: list(speaker_metadata[group_name].unique())
            for group_name in group
        }
        group_names = list(subgroup_per_group)

        scores_per_subgroup = dict()
        scores_by_speaker_groups["_".join(group_names)] = scores_per_subgroup

        # for a list of subgroups for groups create category combination
        # e.g. Gender: [m, f], Nationality: [India] becomes [(m, India), (f, India)]
        for combination in itertools.product(*subgroup_per_group.values()):
            # metadata values may be numbers (or NaN); str.join only accepts strings
            subgroup_key = "_".join(map(str, combination))

            # narrow the metadata down to the speakers that match every attribute of the combination
            subgroup_dataframe = speaker_metadata
            for group_name, subcategory in zip(group_names, combination):
                subgroup_dataframe = subgroup_dataframe.loc[subgroup_dataframe[group_name] == subcategory]

            # subgroup combination not available in speaker_metadata
            if subgroup_dataframe.empty:
                scores_per_subgroup[subgroup_key] = [no_scores_placeholder]
                # TODO logging here
                continue

            scores_filtered = scores[scores['ref_id'].isin(subgroup_dataframe["id"])]

            # speaker id in metadata but no scores provided
            if scores_filtered.empty:
                scores_per_subgroup[subgroup_key] = [no_scores_placeholder]
                # TODO Logging here
                continue

            scores_per_subgroup[subgroup_key] = scores_filtered[["label", "score"]].to_records(index=False)

    return scores_by_speaker_groups