from math import log

import nltk
from nltk import ConditionalFreqDist, FreqDist
from nltk.corpus import reuters, udhr

# Never put imports for testing data here, only imports required by
# the assignment itself

# Problem 1.1
def count_words(words):
    """Return a dict of (word:number of occurences) pairs for words"""
    counts = {}
    for word in words:
        # For each word, increment its count by using its dict entry
        # creating it if it doesn't yet exist
        try:
            counts[word] += 1
        except KeyError:
            counts[word] = 1
    return counts


# Problem 1.2
def count_words_easy(words):
    """Return a FreqDist of words"""
    return FreqDist(words)


# Problem 2
def top_suffixes(words):
    """Return the top two-letter suffixes in words"""
    # Restrict to words of len 5 or more, and take the top 10  keys
    return FreqDist([word[-2:]
                     for word in words 
                     if len(word) > 4]).keys()[:10] 
             
        
# Problem 3 Helper
def udhr_char_cfd(languages):
    """Build a cfd of char frequencies in UDHR conditioned on each language"""
    # We'll lowercase each letter before counting,
    # and force the encoding to Unicode
    return nltk.ConditionalFreqDist(
        (lang, letter.lower())
        for lang in languages
        for word in udhr.words(lang + '-Latin1')
        for letter in word)


# Problem 3.1
def top_chars(languages): 
    """Return the top chars of the UDHR in each language"""
    cfd = udhr_char_cfd(languages)
    return [cfd[lang].keys()[:5] for lang in languages]


# Problem 3.2 Helper
def cfd_kl_divergence(cfd, lang_1, lang_2):
    """Return the K-L divergence of conditions lang_1 and lang_2 in the cfd"""
    divergence = 0
    # Treat cfd[lang_1] as P, and iterate over its outcomes
    for outcome in cfd[lang_1].keys():
        # cfd[lang_2] is Q. We only count an outcome toward the sum if
        # its probability is greater than zero in both P and Q 
        if cfd[lang_1].freq(outcome) > 0.0 and cfd[lang_2].freq(outcome) > 0.0:
            divergence += cfd[lang_1].freq(outcome) * \
                   log(cfd[lang_1].freq(outcome) / cfd[lang_2].freq(outcome), 2)
    
    return divergence


# Problem 3.2
def compare_langs(languages):
    """Compute the K-L div. of the char. distribution of two languages"""
    # Compute the CFD of the languages using the UDHR
    cfd = udhr_char_cfd(languages) 
    # Then return the K-L divergence of the two langs' CFD
    return cfd_kl_divergence(cfd, languages[0], languages[1])


# Problem 3.3
def align_chars(plain_words, encrypted_words):
    """Return the mapping between plain and encrypted words by aligning most common chars"""
    # Build CFDs for plain and encrypted 
    plain_cfd = nltk.FreqDist(letter.lower()
                              for word in plain_words
                              for letter in word)
    encrypted_cfd = nltk.FreqDist(letter.lower()
                                  for word in encrypted_words
                                  for letter in word)
    
    # Return the alignment, which is just tuples of chars of matched rank
    return [(plain_char, encrypted_char) for plain_char, encrypted_char in \
                zip(plain_cfd.keys(), encrypted_cfd.keys())]

# Problem 4 Helper
# Note, there are lots of ways to sort a dictionary by values. This is 
# a good example of using lambda, but the most common way is
# sorted(adict.items(), key=itemgetter(1), reverse=True) 
def top_keys_by_value(adict):
    """Return the keys sorted by descending order of their value"""
    return [pair[0] for pair in
            sorted(adict.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)]

# Problem 4 Helper
def cfd_reuters_category(categories):
    """Return a CFD of words in Reuters conditioned on each category"""
    return ConditionalFreqDist(
        (category, word.lower())
        for category in categories
        for word in reuters.words(categories=category))


# Note, in an ideal world, the basic_ and better_ functions would basically
# use one function to do the work instead of practically being copies of each 
# other. But in this case refactoring can result in more complicated and
# longer code, so these functions are left somewhat redundant.


# Problem 4.1
def basic_conditional_words(categories):
    """Return the top words per category ranked by P(w|category)"""
    # Build a CFD for the categories
    word_category_cfd = cfd_reuters_category(categories)

    # Now build a dict per category where the words are keys 
    # and val is P(w|category)
    dicts = [dict((word, word_category_cfd[category].freq(word))
                  for word in word_category_cfd[category].keys())
             for category in categories]

    return [top_keys_by_value(adict)[:20] for adict in dicts]

# Problem 4.2
def better_conditional_words(categories):
    """Return the top words per category ranked by P(w|category)/P(w)"""
    # Get frequences for the whole corpus
    word_fd = FreqDist([word.lower() for word in reuters.words()])

    # Build a CFD for the categories
    word_category_cfd = cfd_reuters_category(categories)
        
    # Now build a dict per category where the words are keys and
    # val is P(w|category)/P(w)
    dicts = [dict((word, (word_category_cfd[category].freq(word)/
                          word_fd.freq(word)))  
                  for word in word_category_cfd[category].keys())
             for category in categories]

    return [top_keys_by_value(adict)[:20] for adict in dicts]

# NEVER PUT TEST CODE HERE
# Test code goes inside a block like the following
if __name__ == "__main__":
    # Put any imports your test code requires here, and run your test code here
    pass # A cute thing to put where you don't have any code to offer
