@inproceedings{steingrimsson-etal-2020-effectively,
title = "Effectively Aligning and Filtering Parallel Corpora under Sparse Data Conditions",
author = "Steingr{\'\i}msson, Stein{\th}{\'o}r and
Loftsson, Hrafn and
Way, Andy",
editor = "Rijhwani, Shruti and
Liu, Jiangming and
Wang, Yizhong and
Dror, Rotem",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-srw.25",
doi = "10.18653/v1/2020.acl-srw.25",
pages = "182--190",
abstract = "Parallel corpora are key to developing good machine translation systems. However, abundant parallel data are hard to come by, especially for languages with a low number of speakers. When rich morphology exacerbates the data sparsity problem, it is imperative to have accurate alignment and filtering methods that can help make the most of what is available by maximising the number of correctly translated segments in a corpus and minimising noise by removing incorrect translations and segments containing extraneous data. This paper sets out a research plan for improving alignment and filtering methods for parallel texts in low-resource settings. We propose an effective unsupervised alignment method to tackle the alignment problem. Moreover, we propose a strategy to supplement state-of-the-art models with automatically extracted information using basic NLP tools to effectively handle rich morphology.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="steingrimsson-etal-2020-effectively">
<titleInfo>
<title>Effectively Aligning and Filtering Parallel Corpora under Sparse Data Conditions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stein\thór</namePart>
<namePart type="family">Steingrímsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andy</namePart>
<namePart type="family">Way</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiangming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yizhong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Parallel corpora are key to developing good machine translation systems. However, abundant parallel data are hard to come by, especially for languages with a low number of speakers. When rich morphology exacerbates the data sparsity problem, it is imperative to have accurate alignment and filtering methods that can help make the most of what is available by maximising the number of correctly translated segments in a corpus and minimising noise by removing incorrect translations and segments containing extraneous data. This paper sets out a research plan for improving alignment and filtering methods for parallel texts in low-resource settings. We propose an effective unsupervised alignment method to tackle the alignment problem. Moreover, we propose a strategy to supplement state-of-the-art models with automatically extracted information using basic NLP tools to effectively handle rich morphology.</abstract>
<identifier type="citekey">steingrimsson-etal-2020-effectively</identifier>
<identifier type="doi">10.18653/v1/2020.acl-srw.25</identifier>
<location>
<url>https://aclanthology.org/2020.acl-srw.25</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>182</start>
<end>190</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Effectively Aligning and Filtering Parallel Corpora under Sparse Data Conditions
%A Steingrímsson, Stein\thór
%A Loftsson, Hrafn
%A Way, Andy
%Y Rijhwani, Shruti
%Y Liu, Jiangming
%Y Wang, Yizhong
%Y Dror, Rotem
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F steingrimsson-etal-2020-effectively
%X Parallel corpora are key to developing good machine translation systems. However, abundant parallel data are hard to come by, especially for languages with a low number of speakers. When rich morphology exacerbates the data sparsity problem, it is imperative to have accurate alignment and filtering methods that can help make the most of what is available by maximising the number of correctly translated segments in a corpus and minimising noise by removing incorrect translations and segments containing extraneous data. This paper sets out a research plan for improving alignment and filtering methods for parallel texts in low-resource settings. We propose an effective unsupervised alignment method to tackle the alignment problem. Moreover, we propose a strategy to supplement state-of-the-art models with automatically extracted information using basic NLP tools to effectively handle rich morphology.
%R 10.18653/v1/2020.acl-srw.25
%U https://aclanthology.org/2020.acl-srw.25
%U https://doi.org/10.18653/v1/2020.acl-srw.25
%P 182-190
Markdown (Informal)
[Effectively Aligning and Filtering Parallel Corpora under Sparse Data Conditions](https://aclanthology.org/2020.acl-srw.25) (Steingrímsson et al., ACL 2020)
ACL