# Simplified TagCloud solution by jay@summet.com
#
# Uses the map and filter functional programming
# constructs liberally to demonstrate how they can
# replace traditional iterative programming methods.
#
# Released to the public domain, October 2008

inFile = "art_of_war.txt"
outFile = "test.html"
cwFile = "common.txt"

#Stores the "stop words" that we ignore.
commonWords = []

#Stores the words of the text in a dictionary.
tags = {}
#Information about each word is stored in a two
#element list organized as follows:
#
# [ WordString, NumberOfOccurrences ]
#
# WordString - just the text of the word
# NumberOfOccurrences - count of how many times it appeared

#After calling sortTags, this list holds the top
#N words sorted alphabetically.
topWords = []

#Load the "common" or stop words.
def loadCommonWords(filename):
    global commonWords
    #Load the common words.
    f = open(filename,"r")
    commonWords = f.readlines()
    f.close()
    #Strip trailing whitespace (\n \r, etc...)
    commonWords = map( lambda x: x.rstrip(), commonWords)
    ### Above MAP replaces:
    #stripped = []
    #for word in commonWords:
    #    stripped = stripped + [ word.rstrip() ]
    #
    #commonWords = stripped

#Given a string, returns a new string where
#anything that isn't a letter is removed and uppercase letters
#are converted to lowercase. "Words" that are all numbers or
#punctuation result in an "empty" word (string of length zero).
def cleanup(word):
    from string import letters
    newWord = filter(lambda x: x in letters, word)
    ##Above FILTER replaces the following:
    #newWord = ""
    #for letter in word:
    #    if letter in letters:
    #        newWord = newWord + letter

    #Convert to lower case.
    newWord = newWord.lower()
    return(newWord)

#Loads the main text, storing words in a dictionary and
#keeping track of how many times each one occurs.
def loadText(filename):
    global tags
    f = open(filename,"r")
    line = f.readline()
    while( len(line) != 0):
        line = line.rstrip()  #remove \n \r etc...
        words = line.split()  #get individual words.

        #Clean up the words.
        cleanWords = map(cleanup, words)
        cleanWords = filter(lambda x: len(x) > 0, cleanWords)
        ## Above map and filter replace:
        #cleanWords = []
        #for w in words:
        #    newWord = cleanup(w)
        #    if len(newWord) > 0:
        #        cleanWords = cleanWords + [ newWord ]

        #Filter out the common words!
        cleanWords = filter(lambda x: not x in commonWords, cleanWords)

        #Add the words to the dictionary!
        for w in cleanWords:
            #If the word is not already in the dictionary
            #we start it off with zero occurrences.
            wordInfo = tags.get(w, [w,0])
            #Add one to the word occurrence count.
            wordInfo[1] = wordInfo[1] + 1
            #Put the data back into the dictionary.
            tags[w] = wordInfo
        line = f.readline()
    f.close()

def sortTags(N):
    global topWords
    words = tags.values()
    #Sort the list based upon the number at position 1, using
    #a lambda function to extract the number of occurrences
    #of the tag/word to use as the key.
    words.sort( key = lambda x: x[1], reverse=True )
    #Take only the top N words.
    topWords = words[0:N]

    pos = 0
    from math import log
    #We have a log (base e!) size category system.
    #This loop classifies each word into a size category
    #based upon the log (base e!) of its position: for example,
    #position 0 maps to class 1 and position 99 maps to class 5.
    while(pos < len(topWords)):
        item = topWords[pos]
        sizeClass = int( 1 + log(pos+1) )
        item[1] = sizeClass
        pos = pos+1

    #Now, sort the top N words alphabetically.
    topWords.sort()

#Writes the HTML for a specific word to a file descriptor.
#The markup here is an assumed form: a span whose CSS class
#names the size category computed in sortTags.
def writeWord(fd,word):
    string = "<span class=\"size%d\">%s</span>\n" % (word[1],word[0])
    fd.write(string)
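#For reference, a sketch of the output (assuming the span markup
#in writeWord above): the entry ["war", 4] is written as
#
#   <span class="size4">war</span>
#
#and the matching .size4 rule in saveHTML's style sheet controls
#the displayed font size.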
#Writes the HTML head (including CSS style), each word, and the
#closing tags. The style sheet here is an assumed stand-in: any
#CSS that maps the classes size1 through size5 (the range produced
#by sortTags for N=100) to increasing font sizes will work.
def saveHTML(filename):
    global topWords
    numWords = len(topWords)
    f = open(filename,"w")
    f.write("""<html>
<head>
<style type="text/css">
.size1 { font-size: 0.8em; }
.size2 { font-size: 1.1em; }
.size3 { font-size: 1.5em; }
.size4 { font-size: 2.0em; }
.size5 { font-size: 2.6em; }
</style>
</head>
<body>
""")
    for w in topWords:
        writeWord(f,w)
    f.write("</body></html>\n")
    f.close()

#This is the actual program!
loadCommonWords(cwFile)
loadText(inFile)
print "number of unique words:", len(tags.keys())
sortTags( 100 )
saveHTML(outFile)
print "all done writing webpage:", outFile
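#Footnote: the three map/filter calls in loadText could also be
#written as a single list comprehension. A sketch (equivalent under
#Python 2, where map returns a list):
#
#cleanWords = [cw for cw in map(cleanup, words)
#              if len(cw) > 0 and cw not in commonWords]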