#####################################################################################################################
#
# kNN classifier for the 20 newsgroups data set, using cosine distance over bag-of-words count vectors
#
# run this code, and then type, for example: getPrediction("god jesus allah", 30)
#
# This will come back with a prediction as to the membership of this text string in one of the 20 different
# newsgroups. This particular query will return:
#
# [('/soc.religion.christian/', 15)]
#
# meaning that 15 of the 30 closest articles to the string "god jesus allah" were in the '/soc.religion.christian/'
# newsgroup, and that this was the most common newsgroup in the top 30. Pretty good!
#
# But it is not always so good. getPrediction("how many goals Vancouver score last year?", 30) returns:
#
# [('/comp.graphics/', 6)]
#
#####################################################################################################################

import re
import numpy as np

# note: this script assumes it runs in a pyspark shell, where sc is the predefined SparkContext

# load up all of the 19,997 documents in the corpus
corpus = sc.textFile("s3://chrisjermainebucket/comp330_A6/20_news_same_line.txt")

# each entry in validLines will be a line from the text file
validLines = corpus.filter(lambda x: 'id' in x)

# now we transform each line into a (docID, text) pair
keyAndText = validLines.map(lambda x: (x[x.index('id="') + 4 : x.index('" url=')], x[x.index('">') + 2:]))

# now we split the text in each (docID, text) pair into a list of words
# after this, we have a data set with (docID, ["word1", "word2", "word3", ...])
# the regular expression replaces everything but letters with spaces, so that we do not
# die on some of the documents
regex = re.compile('[^a-zA-Z]')
keyAndListOfWords = keyAndText.map(lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split()))

# now get the top 20,000 words... first change (docID, ["word1", "word2", "word3", ...])
# to ("word1", 1), ("word2", 1), ...
allWords = keyAndListOfWords.flatMap(lambda x: ((j, 1) for j in x[1]))

# now, count all of the words, giving us ("word1", 1433), ("word2", 3423423), etc.
allCounts = allWords.reduceByKey(lambda a, b: a + b)

# and get the top 20,000 words in a local array
# each entry is a ("word1", count) pair
topWords = allCounts.top(20000, lambda x: x[1])

# and we'll create an RDD that has a bunch of (word, dictNum) pairs
# start by creating an RDD that has the numbers 0 through 19999
# 20000 is the number of words that will be in our dictionary
twentyK = sc.parallelize(range(20000))

# now, we transform (0), (1), (2), ... to ("mostcommonword", 0), ("nextmostcommon", 1), ...
# the number is the word's position in the dictionary: topWords[num] is the ("word", count)
# pair at rank num, so topWords[num][0] is the word itself
dictionary = twentyK.map(lambda num: (topWords[num][0], num))

# finally, print out some of the dictionary, just for debugging
dictionary.top(10)
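
# -------------------------------------------------------------------------------------------------------------------
# The header above describes getPrediction, but this section stops after building the dictionary. What follows is a
# minimal sketch of how the dictionary might be used to turn each document into a bag-of-words count vector; it is
# one possible continuation, not necessarily the assignment's intended solution. buildArray and the RDD names
# introduced below (allWordsWithDocID, justDocAndPos, allDocsAsNumpyArrays) are hypothetical helpers, not part of
# the original code.
# -------------------------------------------------------------------------------------------------------------------

# turn a list of dictionary positions into a dense count vector of length 20,000
def buildArray(listOfIndices):
    returnVal = np.zeros(20000)
    for index in listOfIndices:
        returnVal[index] = returnVal[index] + 1
    return returnVal

# re-key the corpus as (word, docID) pairs so we can join against the (word, dictNum) dictionary
allWordsWithDocID = keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1]))

# the join produces (word, (dictNum, docID)) pairs; keep just (docID, dictNum)
justDocAndPos = dictionary.join(allWordsWithDocID).map(lambda x: (x[1][1], x[1][0]))

# group each document's dictionary positions together and build one count vector per document
allDocsAsNumpyArrays = justDocAndPos.groupByKey().map(lambda x: (x[0], buildArray(x[1]))).cache()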
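
# -------------------------------------------------------------------------------------------------------------------
# And here is one possible getPrediction, matching the behavior described in the header: score every document by
# cosine distance to the query's count vector, take the k nearest, and report the most common newsgroup among them.
# Again, this is a sketch under stated assumptions, not the definitive implementation: cosineDistance is a helper
# defined here, and the code assumes each docID embeds its newsgroup as the second-to-last path component (which is
# what makes results like ('/soc.religion.christian/', 15) possible); adjust the parsing if the IDs look different.
# -------------------------------------------------------------------------------------------------------------------

# cosine distance between two count vectors; the small epsilon guards against all-zero vectors
def cosineDistance(a, b):
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)

def getPrediction(textInput, k):
    # tokenize the query exactly the way the corpus was tokenized
    words = regex.sub(' ', textInput).lower().split()
    queryVec = buildArray(dictionary.filter(lambda x: x[0] in words).map(lambda x: x[1]).collect())
    # take the k documents with the smallest cosine distance (top with a negated key gives the smallest)
    kNearest = allDocsAsNumpyArrays.map(lambda x: (x[0], cosineDistance(queryVec, x[1]))).top(k, lambda x: -x[1])
    # count how often each newsgroup appears among the k nearest documents
    counts = {}
    for (docID, dist) in kNearest:
        group = '/' + docID.split('/')[-2] + '/'
        counts[group] = counts.get(group, 0) + 1
    # return the single most common newsgroup, as in the examples in the header
    return [max(counts.items(), key=lambda x: x[1])]

# with these definitions in place, getPrediction("god jesus allah", 30) should behave as the header describes,
# returning something like [('/soc.religion.christian/', 15)]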