import numpy as np

# Generate a synthetic bag-of-words corpus with an LDA-style generative process:
# every topic is a distribution over words, every document is a mixture of
# topics, and every word in a document is drawn from one of that document's
# topics.

# Sizes of the synthetic corpus.
NUM_WORDS = 2000       # distinct words in the corpus (vocabulary size)
NUM_TOPICS = 100       # topics
NUM_DOCS = 50          # documents to generate
WORDS_PER_DOC = 1000   # words drawn for each document

# Dirichlet parameter for the word distribution of each topic
# (one entry per word in the corpus).
alpha = np.full(NUM_WORDS, .1)

# Dirichlet parameter for the topic distribution of each document
# (one entry per topic).
beta = np.full(NUM_TOPICS, .1)

# wordsInTopic[t] is the probability of each word occurring in topic t;
# shape is (NUM_TOPICS, NUM_WORDS) and every row sums to 1.
wordsInTopic = np.random.dirichlet(alpha, NUM_TOPICS)

# wordsInCorpus[i] is the vector of word counts for document i.
wordsInCorpus = np.zeros((NUM_DOCS, NUM_WORDS))

# generate each doc
for doc in range(NUM_DOCS):
    # get the topic probabilities for this doc
    topicsInDoc = np.random.dirichlet(beta)

    # assign each of the WORDS_PER_DOC words in this doc to a topic;
    # wordsToTopic[t] is how many of the doc's words come from topic t
    wordsToTopic = np.random.multinomial(WORDS_PER_DOC, topicsInDoc)

    # and generate the words, one topic at a time
    for topic in range(NUM_TOPICS):
        wordsFromCurrentTopic = np.random.multinomial(
            wordsToTopic[topic], wordsInTopic[topic])
        # accumulate this topic's word counts into the document's row
        wordsInCorpus[doc] += wordsFromCurrentTopic