import numpy as np

# Generates a synthetic bag-of-words corpus: every topic is a probability
# distribution over the vocabulary, every document is a mixture of topics,
# and each word of a document is drawn from the topic it was assigned to
# (the generative scheme used by topic models such as LDA).

# Sizes used throughout the script; named once so they cannot drift apart.
NUM_WORDS = 2000      # vocabulary size (distinct words in the corpus)
NUM_TOPICS = 100      # number of topics
NUM_DOCS = 50         # number of documents to generate
WORDS_PER_DOC = 1000  # number of word occurrences in every document

# Symmetric Dirichlet concentration parameters: one entry per vocabulary
# word (alpha) and one entry per topic (beta).
alpha = np.full(NUM_WORDS, .1)
beta = np.full(NUM_TOPICS, .1)

# wordsInTopic[t] is the probability of each word occurring in topic t;
# shape (NUM_TOPICS, NUM_WORDS), every row sums to 1.
wordsInTopic = np.random.dirichlet(alpha, NUM_TOPICS)

# wordsInCorpus[i] is the vector of word counts for document i.
# np.zeros defaults to float64, so the counts are stored as floats.
wordsInCorpus = np.zeros((NUM_DOCS, NUM_WORDS))

# Generate each document.
for doc in range(NUM_DOCS):
    # Topic probabilities for this document.
    topicsInDoc = np.random.dirichlet(beta)

    # Assign each of the document's words to a topic: wordsToTopic[t] is
    # the number of words in this document that come from topic t.
    wordsToTopic = np.random.multinomial(WORDS_PER_DOC, topicsInDoc)

    # Draw the words of each topic from that topic's word distribution and
    # accumulate them into the document's count vector.
    for topic in range(NUM_TOPICS):
        wordsFromCurrentTopic = np.random.multinomial(
            wordsToTopic[topic], wordsInTopic[topic]
        )
        wordsInCorpus[doc] = np.add(wordsInCorpus[doc], wordsFromCurrentTopic)