Latent Dirichlet Allocation
TODO: intro, link to colab w/ these examples
TODO: math
TODO: diagram
TODO: explain in terms of the \(\boldsymbol{\varphi}\) and \(\boldsymbol{\theta}\) parameter matrices
import probflow as pf
class LDA(pf.Model):
    """Latent Dirichlet Allocation as a ProbFlow model.

    Parameters
    ----------
    Nt : int
        Number of topics.
    Nd : int
        Number of documents.
    Nw : int
        Number of words in the vocabulary.
    """

    def __init__(self, Nt, Nd, Nw):
        # phi: Nt Dirichlet distributions over the Nw vocabulary words
        # (the per-topic word distributions).
        self.phi = pf.DirichletParameter(Nw, Nt)
        # theta: Nd Dirichlet distributions over the Nt topics
        # (the per-document topic distributions).
        self.theta = pf.DirichletParameter(Nt, Nd)

    def __call__(self, x):
        # x[:, 0] holds the document IDs; mix each document's topic
        # weights with the per-topic word distributions to get its
        # word probabilities (presumably (batch, Nt) @ (Nt, Nw)).
        topic_weights = self.theta[x[:, 0]]
        word_probs = topic_weights @ self.phi()
        return pf.OneHotCategorical(probs=word_probs)
import probflow as pf
import torch
class LDA(pf.Model):
    """Latent Dirichlet Allocation (PyTorch-backend version).

    Same model as the generic implementation, except the raw input
    array is first converted into a ``torch.Tensor``.

    Parameters
    ----------
    Nt : int
        Number of topics.
    Nd : int
        Number of documents.
    Nw : int
        Number of words in the vocabulary.
    """

    def __init__(self, Nt, Nd, Nw):
        # phi: per-topic word distributions (Nt Dirichlets over Nw words).
        self.phi = pf.DirichletParameter(Nw, Nt)
        # theta: per-document topic distributions (Nd Dirichlets over Nt topics).
        self.theta = pf.DirichletParameter(Nt, Nd)

    def __call__(self, x):
        # Convert to a tensor, then take the document IDs from column 0.
        ids = torch.tensor(x)[:, 0]
        # Per-document word probabilities: topic weights times
        # per-topic word distributions.
        word_probs = self.theta[ids] @ self.phi()
        return pf.OneHotCategorical(probs=word_probs)
To fit the model in this way, ``x`` will be the document IDs and ``y`` will be a matrix of size (Ndocuments, Nwords).
# Nt = number of topics to use
# Nd = number of documents
# Nw = number of words in the vocabulary
# W = (Nd, Nw)-size matrix of per-document word probabilities
# One integer ID per document; the model indexes theta by these IDs.
doc_id = np.arange(W.shape[0])
model = LDA(Nt, Nd, Nw)
# Fit with document IDs as features and word probabilities as targets.
model.fit(doc_id, W)
TODO: Alternatively, when you have a lot of documents, it is inefficient to infer that huge Nd-by-Nt matrix of parameters, so you can instead use a neural network to estimate each document's topic distribution from its word distribution (amortized inference). It's somewhat like an autoencoder: documents are encoded into weighted mixtures of topics, and the word distributions are then decoded from those topic distributions.
class LdaNet(pf.Model):
    """Amortized LDA: a neural network maps each document's word
    frequencies to its topic distribution, instead of learning one
    Dirichlet parameter per document.

    Parameters
    ----------
    dims : list of int
        Layer sizes of the encoder network; ``dims[0]`` is the
        vocabulary size and ``dims[-1]`` the number of topics.
    """

    def __init__(self, dims):
        # phi: per-topic word distributions (dims[-1] Dirichlets over
        # the dims[0] vocabulary words).
        self.phi = pf.DirichletParameter(dims[0], dims[-1])
        # Encoder network: word frequencies -> topic weights.
        self.net = pf.DenseNetwork(dims)

    def __call__(self, x):
        # Encode each document into topic weights, then decode to
        # word probabilities via the per-topic word distributions.
        topic_weights = self.net(x)
        return pf.OneHotCategorical(probs=topic_weights @ self.phi())
class LdaNet(pf.Model):
    """Amortized LDA (PyTorch-backend version): a neural network
    estimates each document's topic distribution from its word
    frequencies, avoiding a per-document Dirichlet parameter.

    Parameters
    ----------
    dims : list of int
        Layer sizes of the encoder network; ``dims[0]`` is the
        vocabulary size and ``dims[-1]`` the number of topics.
    """

    def __init__(self, dims):
        # phi: per-topic word distributions (dims[-1] Dirichlets over
        # the dims[0] vocabulary words).
        self.phi = pf.DirichletParameter(dims[0], dims[-1])
        # Encoder network: word frequencies -> topic weights.
        self.net = pf.DenseNetwork(dims)

    def __call__(self, x):
        # Raw input is converted to a tensor before encoding.
        features = torch.tensor(x)
        topic_weights = self.net(features)
        return pf.OneHotCategorical(probs=topic_weights @ self.phi())
TODO: And then when fitting the model, we'll use the per-document word-frequency matrix as both ``x`` and ``y``:
# Encoder layer sizes: Nw inputs -> three hidden layers -> Nt topics.
model = LdaNet([Nw, 128, 128, 128, Nt])
# Amortized fit: the word matrix W is both the network input (x)
# and the observed data (y), autoencoder-style.
model.fit(W, W)