# Simplified TagCloud solution by jay@summet.com
#
# Uses the map and filter functional programming
# constructs liberally to demonstrate how they can
# replace traditional iterative programming methods.
#
# Released to the public domain, October 2008

inFile = "art_of_war.txt"
outFile = "test.html"
cwFile = "common.txt"

#Stores the "stop words" that we ignore.
commonWords = []

#Stores the words of the text in a dictionary.
tags = {}
#Information about each word is stored in a two
#element list organized as follows:
#
# [ WordString, NumberOfOccurrences ]
#
# WordString - just the text of the word
# NumberOfOccurrences - count of how many times it appeared

#After calling sortTags, this list holds the top
#N words sorted alphabetically.
topWords = []

#Load the "common" or stop words.
def loadCommonWords(filename):
    global commonWords
    #Load the common words.
    f = open(filename,"r")
    commonWords = f.readlines()
    f.close()
    #Strip trailing whitespace (\n \r, etc...)
    commonWords = map( lambda x: x.rstrip(), commonWords)
    ### Above MAP replaces:
    #stripped = []
    #for word in commonWords:
    #    stripped = stripped + [ word.rstrip() ]
    #
    #commonWords = stripped

#Given a string, returns a new string where
#anything that isn't a letter is removed and uppercase letters
#are converted to lowercase. "Words" that are all numbers or
#punctuation result in an "empty" word (string of length zero).
def cleanup(word):
    from string import letters
    newWord = filter(lambda x: x in letters, word)
    ##Above FILTER replaces the following:
    #newWord = ""
    #for letter in word:
    #    if letter in letters:
    #        newWord = newWord + letter

    #Convert to lower case.
    newWord = newWord.lower()
    return(newWord)

#Loads the main text, storing words in a dictionary and
#keeping track of how many times each one occurs.
def loadText(filename):
    global tags
    f = open(filename,"r")
    line = f.readline()
    while( len(line) != 0):
        line = line.rstrip()  #remove \n \r etc...
        words = line.split()  #get individual words.

        #Clean up the words.
        cleanWords = map(cleanup, words)
        cleanWords = filter(lambda x: len(x) > 0, cleanWords)
        ## Above map and filter replace:
        #cleanWords = []
        #for w in words:
        #    newWord = cleanup(w)
        #    if len(newWord) > 0:
        #        cleanWords = cleanWords + [ newWord ]

        #Filter out the common words!
        cleanWords = filter(lambda x: not x in commonWords, cleanWords)

        #Add the words to the dictionary!
        for w in cleanWords:
            #If the word is not already in the dictionary
            #we start it off with zero occurrences.
            wordInfo = tags.get(w, [w,0])
            #Add one to the word occurrence count.
            wordInfo[1] = wordInfo[1] + 1
            #Put the data back into the dictionary.
            tags[w] = wordInfo
        line = f.readline()
    f.close()

def sortTags(N):
    global topWords
    words = tags.values()
    #Sort the list based upon the number at position 1, using
    #a lambda function to extract the number of occurrences
    #of the tag/word to use as the key.
    words.sort( key = lambda x: x[1], reverse=True )
    #Take only the top N words.
    topWords = words[0:N]

    pos = 0
    from math import log
    #We have a log (base e!) size category system.
    #This loop classifies each word into a size category
    #based upon the log (base e!) of its position: for example,
    #position 0 maps to class 1 and position 99 maps to class 5.
    while(pos < len(topWords)):
        item = topWords[pos]
        sizeClass = int( 1 + log(pos+1) )
        item[1] = sizeClass
        pos = pos+1

    #Now, sort the top N words alphabetically.
    topWords.sort()

#Writes the HTML for a specific word to a file descriptor.
#The markup here is an assumed form: a span whose CSS class
#names the size category computed in sortTags.
def writeWord(fd,word):
    string = "<span class=\"size%d\">%s</span>\n" % (word[1],word[0])
    fd.write(string)
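#For reference, a sketch of the output (assuming the span markup
#in writeWord above): the entry ["war", 4] is written as
#
#   <span class="size4">war</span>
#
#and the matching .size4 rule in saveHTML's style sheet controls
#the displayed font size.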
#Writes the HTML head (including CSS style), each word, and the
#closing tags. The style sheet here is an assumed stand-in: any
#CSS that maps the classes size1 through size5 (the range produced
#by sortTags for N=100) to increasing font sizes will work.
def saveHTML(filename):
    global topWords
    numWords = len(topWords)
    f = open(filename,"w")
    f.write("""<html>
<head>
<style type="text/css">
.size1 { font-size: 0.8em; }
.size2 { font-size: 1.1em; }
.size3 { font-size: 1.5em; }
.size4 { font-size: 2.0em; }
.size5 { font-size: 2.6em; }
</style>
</head>
<body>
""")
    for w in topWords:
        writeWord(f,w)
    f.write("</body></html>\n")
    f.close()

#This is the actual program!
loadCommonWords(cwFile)
loadText(inFile)
print "number of unique words:", len(tags.keys())
sortTags( 100 )
saveHTML(outFile)
print "all done writing webpage:", outFile
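#Footnote: the three map/filter calls in loadText could also be
#written as a single list comprehension. A sketch (equivalent under
#Python 2, where map returns a list):
#
#cleanWords = [cw for cw in map(cleanup, words)
#              if len(cw) > 0 and cw not in commonWords]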