% Conference paper (NoDaLiDa 2021) presenting JobStack, a corpus for
% de-identification of personal data in job postings.
% NOTE(review): day/month give 21 May, but the note field says the conference
%   date is 31-05-2021 -- confirm which date is intended.
% NOTE(review): address holds a country, not a publisher city, and publisher
%   and series name different organizations -- verify against the proceedings.
@inproceedings{4e400a886fcb4ba1ae10675b004fd96f,
  author    = {Jensen, Kristian N{\o}rgaard and Zhang, Mike and Plank, Barbara},
  title     = {De-identification of Privacy-related Entities in Job Postings},
  booktitle = {Proceedings of the 23rd {Nordic} Conference on Computational Linguistics},
  series    = {Link{\"o}ping Electronic Conference Proceedings},
  number    = {21},
  pages     = {210--221},
  publisher = {Association for Computational Linguistics},
  address   = {United States},
  year      = {2021},
  month     = may,
  day       = {21},
  language  = {English},
  note      = {NoDaLiDa 2021 ; Conference date: 31-05-2021},
  keywords  = {De-identification, Privacy-related entities, Medical domain, Privacy-preserving data, JobStack corpus, Job postings, Personal data, Baselines, Long-Short Term Memory (LSTM), Transformer models},
  abstract  = {De-identification is the task of detecting privacy-related entities in text, such as person names, emails and contact data. It has been well-studied within the medical domain. The need for de-identification technology is increasing, as privacy-preserving data handling is in high demand in many domains. In this paper, we focus on job postings. We present JobStack, a new corpus for de-identification of personal data in job vacancies on Stackoverflow. We introduce baselines, comparing Long-Short Term Memory (LSTM) and Transformer models. To improve upon these baselines, we experiment with contextualized embeddings and distantly related auxiliary data via multi-task learning. Our results show that auxiliary data improves de-identification performance.},
}