#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Created on 01-05-2022
# @author: wiebket, AnnaLesch
import itertools
import numpy as np
def split_scores_by_speaker_groups(scores, speaker_metadata, speaker_groups, id_delimiter):
    """ Construction of a dictionary that holds the (label, score) pairs for the speaker groups as defined in the config file and their corresponding subgroups.

    For every group (a list of metadata column names) all combinations of the unique values
    of these columns are built. The speakers matching a combination are looked up in
    ``speaker_metadata`` and the rows of ``scores`` whose reference speaker id belongs to
    these speakers are collected.

    Note: a ``ref_id`` column is added to ``scores`` in place.

    :param scores: DataFrame that contains reference and test utterances and corresponding labels and scores (columns ``ref``, ``label`` and ``score`` are read)
    :type scores: DataFrame
    :param speaker_metadata: DataFrame that contains speaker metadata with speaker ids (column ``id``) and speaker groups attributes as specified in config file
    :type speaker_metadata: DataFrame
    :param speaker_groups: List of speaker groups as specified in config file; each group is a list of column names of ``speaker_metadata``
    :type speaker_groups: list
    :param id_delimiter: Delimiter that separates the speaker id from the rest of the reference utterance name. If not specified in config file, default is "/"
    :type id_delimiter: string

    :returns: scores_by_speaker_groups, keyed by the "_"-joined group names; each value is a dict keyed by the "_"-joined subgroup values. A subgroup maps to a record array of (label, score), or to ``[(nan, nan)]`` if no speaker or no score is available for it.
    :rtype: dict
    """
    scores_by_speaker_groups = dict()

    # create id column for scores, first split by dot to get rid of .wav, then by id_delimiter
    scores['ref_id'] = scores['ref'].apply(lambda ref: ref.split(".")[0].split(id_delimiter)[0])

    for group in speaker_groups:
        # unique subgroup values for every metadata column of this group
        subgroup_per_group = {
            group_name: list(speaker_metadata[group_name].unique()) for group_name in group
        }
        group_names = list(subgroup_per_group)
        group_key = "_".join(str(group_name) for group_name in group_names)
        group_scores = dict()
        scores_by_speaker_groups[group_key] = group_scores

        # for a list of subgroups for groups create category combination
        # e.g. Gender: [m, f], Nationality: [India] becomes [(m, India), (f, India)]
        for combination in itertools.product(*subgroup_per_group.values()):
            # subgroup values may be non-string (e.g. integers or NaN), so convert
            # explicitly; str.join would raise a TypeError otherwise
            combination_key = "_".join(str(subcategory) for subcategory in combination)

            # narrow the metadata down column by column to the speakers of this combination
            subgroup_dataframe = speaker_metadata
            for group_name, subcategory in zip(group_names, combination):
                subgroup_dataframe = subgroup_dataframe.loc[subgroup_dataframe[group_name] == subcategory]

            # subgroup combination not available in speaker_metadata
            if subgroup_dataframe.empty:
                group_scores[combination_key] = [(np.nan, np.nan)]
                # TODO logging here
                continue

            scores_filtered = scores[scores['ref_id'].isin(subgroup_dataframe["id"])]

            # speaker id in metadata but no scores provided
            if scores_filtered.empty:
                group_scores[combination_key] = [(np.nan, np.nan)]
                # TODO Logging here
                continue

            group_scores[combination_key] = scores_filtered[["label", "score"]].to_records(index=False)

    return scores_by_speaker_groups