Python Program for the Conflation Algorithm

This is my Python implementation of Porter's conflation (stemming) algorithm. The original algorithm can be found on Porter's website.

Code Here:
#from types import DictionaryType
from re import sub
def _consonant_flags(word):
    """Return one boolean per character of *word*: True for a consonant.

    Follows Porter's definition: a, e, i, o, u are vowels, and 'y' is a
    vowel only when the character before it is a consonant. Every other
    character is treated as a consonant.
    """
    flags = []
    for ch in word:
        if ch in "aeiou":
            flags.append(False)
        elif ch == "y":
            # A leading 'y' or a 'y' after a vowel is a consonant
            # ("yes", "toy"); a 'y' after a consonant is a vowel ("by").
            flags.append(not flags or not flags[-1])
        else:
            flags.append(True)
    return flags


def m(s):
    """Return Porter's measure of *s*: the number of vowel->consonant steps.

    The input is lower-cased first. Writing a word as [C](VC)^m[V], this is
    the value of m, e.g. m("tree") == 0, m("trouble") == 1, m("oaten") == 2.
    """
    flags = _consonant_flags(s.lower())
    return sum(1 for prev, cur in zip(flags, flags[1:]) if cur and not prev)


def check_o(word):
    """Return True when *word* satisfies Porter's ``*o`` condition.

    That is: the word ends consonant-vowel-consonant and the final
    consonant is not 'w', 'x' or 'y'. Words shorter than three characters
    never match.
    """
    if len(word) < 3:
        return False
    flags = _consonant_flags(word)
    return (flags[-3] and not flags[-2] and flags[-1]
            and word[-1] not in "wxy")


def has_vowels(word):
    """Return True when *word* contains at least one vowel (Porter's ``*v*``)."""
    return not all(_consonant_flags(word))


# (suffix, replacement) tables. Within each table a suffix that is itself the
# tail of another suffix is listed after the longer one, so the first entry
# that matches the end of the word is the longest match.
_STEP_1A_RULES = (("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", ""))
_STEP_2_RULES = (
    ("ational", "ate"), ("tional", "tion"), ("enci", "ence"),
    ("anci", "ance"), ("izer", "ize"), ("abli", "able"), ("alli", "al"),
    ("entli", "ent"), ("eli", "e"), ("ousli", "ous"), ("ization", "ize"),
    ("ation", "ate"), ("ator", "ate"), ("alism", "al"), ("iveness", "ive"),
    ("fulness", "ful"), ("ousness", "ous"), ("aliti", "al"),
    ("iviti", "ive"), ("biliti", "ble"),
)
_STEP_3_RULES = (
    ("icate", "ic"), ("ative", ""), ("alize", "al"), ("iciti", "ic"),
    ("ical", "ic"), ("ful", ""), ("ness", ""),
)
_STEP_4_SUFFIXES = (
    "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
    "ment", "ent", "ion", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
)


def _replace_suffix(word, rules, min_measure):
    """Apply the first rule whose suffix ends *word*.

    The replacement happens only when the measure of the remaining stem is
    greater than *min_measure*. Once a suffix has matched, no later rule in
    the table is tried, whether or not the replacement happened.
    """
    for suffix, replacement in rules:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            if m(base) > min_measure:
                return base + replacement
            return word
    return word


def stem(word):
    """Return the Porter stem of *word*.

    *word* is expected to be a lower-case alphabetic token; suffix matching
    is case-sensitive. Words of at most two characters (including the empty
    string) are returned unchanged.
    """
    if len(word) <= 2:
        return word

    ########
    #Step 1a
    ########
    for suffix, replacement in _STEP_1A_RULES:
        if word.endswith(suffix):
            word = word[:-len(suffix)] + replacement
            break

    ########
    #Step 1b
    ########
    # Only the longest matching suffix is considered: a word ending in "eed"
    # must not fall through to the "ed" rule when its condition fails.
    stripped = False
    if word.endswith("eed"):
        if m(word[:-3]) > 0:
            word = word[:-1]
    elif word.endswith("ed"):
        if has_vowels(word[:-2]):
            word = word[:-2]
            stripped = True
    elif word.endswith("ing"):
        if has_vowels(word[:-3]):
            word = word[:-3]
            stripped = True

    # Clean-up rules run only after "ed"/"ing" was removed, and at most one
    # of them applies.
    if stripped:
        if word.endswith(("at", "bl", "iz")):
            word += "e"
        elif (len(word) >= 2 and word[-1] == word[-2]
              and _consonant_flags(word)[-1] and word[-1] not in "lsz"):
            word = word[:-1] #Strip the double consonant.
        elif m(word) == 1 and check_o(word):
            word += "e"

    ########
    #Step 1c
    ########
    if word.endswith("y") and has_vowels(word[:-1]):
        word = word[:-1] + "i"

    #######
    #Step 2
    #######
    word = _replace_suffix(word, _STEP_2_RULES, 0)

    #######
    #Step 3
    #######
    word = _replace_suffix(word, _STEP_3_RULES, 0)

    #######
    #Step 4
    #######
    # At most one suffix is removed here, and only when the stem has m > 1.
    for suffix in _STEP_4_SUFFIXES:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            # "ion" is removed only when the stem ends in 's' or 't'.
            allowed = suffix != "ion" or base.endswith(("s", "t"))
            if allowed and m(base) > 1:
                word = base
            break

    ########
    #Step 5a
    ########
    if word.endswith("e"):
        base = word[:-1]
        measure = m(base)
        # Drop the trailing e when m > 1, or when m == 1 and the stem does
        # NOT end consonant-vowel-consonant.
        if measure > 1 or (measure == 1 and not check_o(base)):
            word = base

    ########
    #Step 5b
    ########
    # The measure is taken over the whole word ("controll" -> "control",
    # "roll" stays).
    if word.endswith("ll") and m(word) > 1:
        word = word[:-1]

    return word
def remove_symbols(input_string):
    """Return *input_string* lower-cased with every non a-z character dropped."""
    lowered = input_string.lower()
    # Keep only the 26 ASCII lower-case letters; everything else is removed.
    return "".join(ch for ch in lowered if "a" <= ch <= "z")
def conflete(filename, out_path):
    """Tokenize, stop-word filter, stem and count the terms of one text file.

    Four files are written into *out_path*, named after the input file's
    base name with ".txt" removed:
      <base>-tokenized.txt          one line per input line, a-z tokens only
      <base>-stop-word-removed.txt  tokens not listed in stop_words.txt
      <base>-stemmed.txt            the remaining tokens passed through stem()
      <base>-tf.txt                 one "term frequency" line per distinct stem

    The stop-word list is read from "stop_words.txt" in the current working
    directory as a comma-separated list.

    Returns the name (not the full path) of the "-tf.txt" file.
    Prints a message and raises SystemExit(1) when the input file or the
    stop-word file cannot be opened.
    """
    from ntpath import basename
    from os import path

    stop_words_fname = "stop_words.txt"

    ###################################################################
    #Step 1: Tokenization i.e. Remove anything that is not alphabetical
    ###################################################################

    try:
        input_file = open(filename)
    except IOError:
        print("File %s Not Found" % (filename))
        raise SystemExit(1)

    file_base_name = basename(filename)
    name_root = file_base_name.replace(".txt", "")
    token_fname = name_root + "-tokenized.txt"

    with input_file:
        with open(path.join(out_path, token_fname), "w") as token_file:
            for line in input_file:
                tokenized = " ".join(map(remove_symbols, line.split()))
                token_file.write(tokenized + "\n")

    ##########################
    #Step 2: Stop Word Removal
    ##########################
    try:
        with open(stop_words_fname) as stop_words_handle:
            stop_words_text = stop_words_handle.read()
    except IOError:
        # Report the stop-word file itself, not the input file.
        print("Stop Word File %s Not Found" % (stop_words_fname))
        raise SystemExit(1)

    # Strip each entry so "a, an, the\n" still matches "an" and "the".
    stop_words = set(entry.strip() for entry in stop_words_text.split(","))

    stop_word_fname = name_root + "-stop-word-removed.txt"
    with open(path.join(out_path, token_fname)) as token_file:
        with open(path.join(out_path, stop_word_fname), "w") as stop_word_file:
            for line in token_file:
                kept = [word for word in line.split()
                        if word not in stop_words]
                # All lines are joined into one space-separated stream.
                stop_word_file.write(" ".join(kept) + " ")

    ######################################
    #Step 3: Remove suffixes i.e. Stemming
    ######################################

    #to convert "basefile.txt" -> "basefile-stemmed.txt"
    stemmed_fname = name_root + "-stemmed.txt"
    with open(path.join(out_path, stop_word_fname)) as stop_word_file:
        with open(path.join(out_path, stemmed_fname), "w") as stemmed_file:
            for line in stop_word_file:
                suffixes_removed = [stem(word) for word in line.split()]
                stemmed_file.write(" ".join(suffixes_removed) + " ")

    ########################
    #Step 4: Count Frequency
    ########################

    #Uses dictionary as DS where key = term word , value = term word's frequency
    dict_words = {}
    with open(path.join(out_path, stemmed_fname)) as stemmed_file:
        for line in stemmed_file:
            for word in line.split():
                dict_words[word] = dict_words.get(word, 0) + 1

    ##########################################################
    #Step 5: Save the term/frequency table, one term per line
    ##########################################################

    result_name = name_root + "-tf.txt"
    with open(path.join(out_path, result_name), "w") as result:
        for word, freq in dict_words.items():
            result.write("%-25s %d" % (word, freq) + "\n")

    return result_name
def idf(out_path, files):
    """Print a term table with per-document frequencies and IDF values.

    *files* is a list of "-tf.txt" file names located in *out_path*; each
    line of such a file holds a term followed by its integer frequency.
    Lines with fewer than two fields are skipped.

    For every distinct term one row is printed containing the term, a list
    of "<document,frequency>" pairs for each document that contains it
    (the document label is the file name without its last 7 characters,
    i.e. without "-tf.txt"), and

        idf = log10(1 + N / DF)

    where N is the number of files and DF the number of files containing
    the term. Nothing is returned.
    """
    from math import log10
    from os import path

    #Populate one {term: frequency} dictionary per file
    doc_freqs = []
    for fname in files:
        counts = {}
        with open(path.join(out_path, fname)) as handle:
            for line in handle:
                fields = line.split()
                if len(fields) < 2:
                    continue #Blank or malformed line
                counts[fields[0]] = int(fields[1])
        doc_freqs.append(counts)

    total_documents = len(files)

    print("-" * 96)
    print("Word\t\tFrequency\t\t\t\tIDF")
    print("-" * 96)

    reported = set()
    for counts in doc_freqs:
        for key in counts:
            if key in reported:
                continue #Already printed while handling an earlier file
            reported.add(key)

            # Every document containing the term, with that document's own
            # frequency.
            postings = [(fname, other[key])
                        for fname, other in zip(files, doc_freqs)
                        if key in other]
            #-7 means remove last 7 chars from string, since we want to remove "trailing" *-tf.txt (7 chars in total)
            in_docs = "[" + "".join(
                "<{0},{1}>".format(fname[:-7], freq)
                for fname, freq in postings) + "]"

            df = float(len(postings))
            idf_value = log10(1.0 + (total_documents / df))
            print("%-12s\t%-40s%f" % (key, in_docs, idf_value))

    #After loop, conclude the program
    print("-" * 96)
    print("\nThanks for using Python 2.7 implementation of Confletion Algorithm!")


#Path to where all the input files are
input_path = "input" #argv[1]
output_path = "output" #argv[2]

from os import listdir, makedirs, path

# conflete() opens its result files for writing inside output_path, so the
# folder has to exist before the first file is processed.
if not path.isdir(output_path):
    makedirs(output_path)

#####################################################################
#Call Confletion (Includes Tokenization, Stop-word Removal, Stemming)
#####################################################################
freq_files = []
# Sorted so the documents are processed in the same order on every run.
for each_file in sorted(listdir(input_path)):
    if each_file.startswith("."): continue #Do not process hidden files
    source_file = path.join(input_path, each_file)
    if not path.isfile(source_file): continue #Do not process sub-folders
    #conflete(inputfile, output folder)
    freq_files.append(conflete(source_file, output_path))

##############################
#Perform IDF and print results
##############################

idf(output_path,freq_files)
CodeBots Blog

Search This Blog

Python Program for the Conflation Algorithm

Comments

Post a Comment

Popular posts from this blog

Selenium + Python + UnexpectedAlertPresentException: Dealing with annoying alerts

4. Lex and Yacc Program to detect errors in a 'C' Language Program

Python Program for Soundex Algorithm