import numpy as np

# this returns an index i with probability p[i], where p is a vector of probabilities
def sampleValue (p):
        """Draw one index from the discrete distribution described by p.

        p is a sequence of probabilities, one per outcome; index i is
        returned with probability p[i].
        """
        # a single multinomial trial gives a 1 x len(p) array holding one 1
        oneHot = np.random.multinomial (1, p, 1)
        #
        # the position of that lone 1 is the outcome that was sampled
        return np.argmax (oneHot)

# sizes of the synthetic corpus; each one is defined exactly once so that the
# arrays built from them cannot drift out of sync with each other
numWords = 2000       # distinct words in the corpus
numTopics = 100       # topics
numDocs = 50          # documents to generate
wordsPerDoc = 1000    # words generated for each document

# Dirichlet parameter used to draw each topic's distribution over the words
alpha = np.full (numWords, .1)

# Dirichlet parameter used to draw each document's distribution over the topics
beta = np.full (numTopics, .1)

# wordsInTopic[t][w] is the probability of word w happening in topic t
wordsInTopic = np.random.dirichlet (alpha, numTopics)
# wordsInCorpus[d] maps each word that appears in document d to the number
# of times it appears in that document
wordsInCorpus = {}

# generate each doc
for doc in range (0, numDocs):
        #
        # no words in this doc yet
        wordsInDoc = {}
        #
        # get the topic probabilities for this doc
        topicsInDoc = np.random.dirichlet (beta)
        #
        # generate each of the words in this document
        for word in range (0, wordsPerDoc):
                #
                # select the topic, then select the word from that topic
                whichTopic = sampleValue (topicsInDoc)
                whichWord = sampleValue (wordsInTopic[whichTopic])
                #
                # and record the word (words not seen yet start from a count of 0)
                wordsInDoc [whichWord] = wordsInDoc.get (whichWord, 0) + 1
                #
        # now, remember this document
        wordsInCorpus [doc] = wordsInDoc