% Stratos, Kim, Collins, Hsu -- UAI 2014 conference paper (spectral Brown clustering).
% The exported `series` field repeated `booktitle` verbatim and has been dropped so
% styles that print both do not show the proceedings title twice.
% Names are stored in "Last, First" form; the citation key is unchanged.
@inproceedings{b17e5bd9cbd74aaaa438aee192f065ea,
  author    = {Stratos, Karl and Kim, {Do Kyum} and Collins, Michael and Hsu, Daniel},
  title     = {A spectral algorithm for learning class-based n-gram models of natural language},
  editor    = {Zhang, {Nevin L.} and Tian, Jin},
  booktitle = {Uncertainty in Artificial Intelligence - Proceedings of the 30th Conference, {UAI} 2014},
  publisher = {AUAI Press},
  pages     = {762--771},
  year      = {2014},
  language  = {American English},
  note      = {30th Conference on Uncertainty in Artificial Intelligence, UAI 2014 ; Conference date: 23-07-2014 Through 27-07-2014},
  abstract  = {The Brown clustering algorithm (Brown et al., 1992) is widely used in natural language processing (NLP) to derive lexical representations that are then used to improve performance on various NLP problems. The algorithm assumes an underlying model that is essentially an HMM, with the restriction that each word in the vocabulary is emitted from a single state. A greedy, bottom-up method is then used to find the clustering; this method does not have a guarantee of finding the correct underlying clustering. In this paper we describe a new algorithm for clustering under the Brown et al. model. The method relies on two steps: first, the use of canonical correlation analysis to derive a low-dimensional representation of words; second, a bottom-up hierarchical clustering over these representations. We show that given a sufficient number of training examples sampled from the Brown et al. model, the method is guaranteed to recover the correct clustering. Experiments show that the method recovers clusters of comparable quality to the algorithm of Brown et al. (1992), but is an order of magnitude more efficient.},
}