@inproceedings{jahan-etal-2024-finding-spoken,
title = "Finding Spoken Identifications: Using {GPT}-4 Annotation for an Efficient and Fast Dataset Creation Pipeline",
author = "Jahan, Maliha and
Wang, Helin and
Thebaud, Thomas and
Sun, Yinglun and
Le, Giang Ha and
Fagyal, Zsuzsanna and
Scharenborg, Odette and
Hasegawa-Johnson, Mark and
Moro Velazquez, Laureano and
Dehak, Najim",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.641",
pages = "7296--7306",
abstract = "The growing emphasis on fairness in speech-processing tasks requires datasets with speakers from diverse subgroups that allow training and evaluating fair speech technology systems. However, creating such datasets through manual annotation can be costly. To address this challenge, we present a semi-automated dataset creation pipeline that leverages large language models. We use this pipeline to generate a dataset of speakers identifying themself or another speaker as belonging to a particular race, ethnicity, or national origin group. We use OpenaAI{'}s GPT-4 to perform two complex annotation tasks- separating files relevant to our intended dataset from the irrelevant ones (filtering) and finding and extracting information on identifications within a transcript (tagging). By evaluating GPT-4{'}s performance using human annotations as ground truths, we show that it can reduce resources required by dataset annotation while barely losing any important information. For the filtering task, GPT-4 had a very low miss rate of 6.93{\%}. GPT-4{'}s tagging performance showed a trade-off between precision and recall, where the latter got as high as 97{\%}, but precision never exceeded 45{\%}. Our approach reduces the time required for the filtering and tagging tasks by 95{\%} and 80{\%}, respectively. We also present an in-depth error analysis of GPT-4{'}s performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jahan-etal-2024-finding-spoken">
<titleInfo>
<title>Finding Spoken Identifications: Using GPT-4 Annotation for an Efficient and Fast Dataset Creation Pipeline</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maliha</namePart>
<namePart type="family">Jahan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Thebaud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinglun</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giang</namePart>
<namePart type="given">Ha</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zsuzsanna</namePart>
<namePart type="family">Fagyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Odette</namePart>
<namePart type="family">Scharenborg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Hasegawa-Johnson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laureano</namePart>
<namePart type="family">Moro Velazquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najim</namePart>
<namePart type="family">Dehak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The growing emphasis on fairness in speech-processing tasks requires datasets with speakers from diverse subgroups that allow training and evaluating fair speech technology systems. However, creating such datasets through manual annotation can be costly. To address this challenge, we present a semi-automated dataset creation pipeline that leverages large language models. We use this pipeline to generate a dataset of speakers identifying themself or another speaker as belonging to a particular race, ethnicity, or national origin group. We use OpenaAI’s GPT-4 to perform two complex annotation tasks- separating files relevant to our intended dataset from the irrelevant ones (filtering) and finding and extracting information on identifications within a transcript (tagging). By evaluating GPT-4’s performance using human annotations as ground truths, we show that it can reduce resources required by dataset annotation while barely losing any important information. For the filtering task, GPT-4 had a very low miss rate of 6.93%. GPT-4’s tagging performance showed a trade-off between precision and recall, where the latter got as high as 97%, but precision never exceeded 45%. Our approach reduces the time required for the filtering and tagging tasks by 95% and 80%, respectively. We also present an in-depth error analysis of GPT-4’s performance.</abstract>
<identifier type="citekey">jahan-etal-2024-finding-spoken</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.641</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>7296</start>
<end>7306</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Finding Spoken Identifications: Using GPT-4 Annotation for an Efficient and Fast Dataset Creation Pipeline
%A Jahan, Maliha
%A Wang, Helin
%A Thebaud, Thomas
%A Sun, Yinglun
%A Le, Giang Ha
%A Fagyal, Zsuzsanna
%A Scharenborg, Odette
%A Hasegawa-Johnson, Mark
%A Moro Velazquez, Laureano
%A Dehak, Najim
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F jahan-etal-2024-finding-spoken
%X The growing emphasis on fairness in speech-processing tasks requires datasets with speakers from diverse subgroups that allow training and evaluating fair speech technology systems. However, creating such datasets through manual annotation can be costly. To address this challenge, we present a semi-automated dataset creation pipeline that leverages large language models. We use this pipeline to generate a dataset of speakers identifying themself or another speaker as belonging to a particular race, ethnicity, or national origin group. We use OpenaAI’s GPT-4 to perform two complex annotation tasks- separating files relevant to our intended dataset from the irrelevant ones (filtering) and finding and extracting information on identifications within a transcript (tagging). By evaluating GPT-4’s performance using human annotations as ground truths, we show that it can reduce resources required by dataset annotation while barely losing any important information. For the filtering task, GPT-4 had a very low miss rate of 6.93%. GPT-4’s tagging performance showed a trade-off between precision and recall, where the latter got as high as 97%, but precision never exceeded 45%. Our approach reduces the time required for the filtering and tagging tasks by 95% and 80%, respectively. We also present an in-depth error analysis of GPT-4’s performance.
%U https://aclanthology.org/2024.lrec-main.641
%P 7296-7306
Markdown (Informal)
[Finding Spoken Identifications: Using GPT-4 Annotation for an Efficient and Fast Dataset Creation Pipeline](https://aclanthology.org/2024.lrec-main.641) (Jahan et al., LREC-COLING 2024)
ACL
- Maliha Jahan, Helin Wang, Thomas Thebaud, Yinglun Sun, Giang Ha Le, Zsuzsanna Fagyal, Odette Scharenborg, Mark Hasegawa-Johnson, Laureano Moro Velazquez, and Najim Dehak. 2024. Finding Spoken Identifications: Using GPT-4 Annotation for an Efficient and Fast Dataset Creation Pipeline. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 7296–7306, Torino, Italia. ELRA and ICCL.