Python Program for the Conflation Algorithm.

This is my Python implementation of Porter's conflation (stemming) algorithm. The original algorithm can be found on Porter's website.

Code Here:
#from types import DictionaryType
from re import sub
def m(s):
    """Return Porter's measure of ``s``.

    The measure is the number of vowel-to-consonant transitions, i.e. the
    ``m`` in the form ``[C](VC){m}[V]``.  The input is lower-cased first.

    ``a e i o u`` are vowels.  ``y`` is a vowel only when it directly follows
    a consonant; at the start of the word or after a vowel it is a consonant
    (so ``m("toy") == 1``).  Every other character counts as a consonant.
    """
    count = 0
    prev_is_vowel = None  # None until the first character has been classified
    for ch in s.lower():
        if ch == 'y':
            # 'y' is a vowel only when the previous letter was a consonant.
            is_vowel = prev_is_vowel is False
        else:
            is_vowel = ch in 'aeiou'
        if prev_is_vowel and not is_vowel:
            count += 1
        prev_is_vowel = is_vowel
    return count
def check_o(word):
    """Return True when ``word`` satisfies Porter's ``*o`` condition.

    ``*o`` holds when the word ends consonant-vowel-consonant and the final
    consonant is not ``w``, ``x`` or ``y``.  Only the last three letters are
    inspected, so a three-letter word such as ``"hop"`` qualifies.  Here
    ``a e i o u y`` are all treated as vowels.
    """
    vowels = ('a', 'e', 'i', 'o', 'u', 'y')
    if len(word) < 3:
        return False
    first, middle, final = word[-3], word[-2], word[-1]
    return (final not in vowels and
            final not in ('w', 'x', 'y') and
            middle in vowels and
            first not in vowels)
def has_vowels(word):
    """Return True if ``word`` contains at least one of ``a e i o u y``.

    The comparison is case-sensitive: only lower-case letters match.
    """
    vowel_letters = ('a', 'e', 'i', 'o', 'u', 'y')
    for letter in word:
        if letter in vowel_letters:
            return True
    return False
# Step 2 rules as (suffix, replacement); applied when the remaining stem has m > 0.
_STEP2_RULES = (
    ('ational', 'ate'), ('tional', 'tion'), ('enci', 'ence'), ('anci', 'ance'),
    ('izer', 'ize'), ('abli', 'able'), ('alli', 'al'), ('entli', 'ent'),
    ('eli', 'e'), ('ousli', 'ous'), ('ization', 'ize'), ('ation', 'ate'),
    ('ator', 'ate'), ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'),
    ('ousness', 'ous'), ('aliti', 'al'), ('iviti', 'ive'), ('biliti', 'ble'),
)

# Step 3 rules as (suffix, replacement); applied when the remaining stem has m > 0.
_STEP3_RULES = (
    ('icate', 'ic'), ('ative', ''), ('alize', 'al'), ('iciti', 'ic'),
    ('ical', 'ic'), ('ful', ''), ('ness', ''),
)

# Step 4 suffixes, removed when the remaining stem has m > 1.
# Longer suffixes precede the shorter ones they end with ('ement' > 'ment' > 'ent').
_STEP4_SUFFIXES = (
    'al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 'ment',
    'ent', 'ion', 'ou', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize',
)


def stem(word):
    """Return the stem of ``word`` using the steps of Porter's algorithm.

    ``word`` is expected to be a lower-case alphabetic token.  Words of one
    or two letters (and the empty string) are returned unchanged.  The
    measure, vowel test and ``*o`` test are delegated to ``m``,
    ``has_vowels`` and ``check_o``.

    In every step only the first matching suffix is considered: if its
    condition fails, no other suffix of that step is tried.
    """
    if len(word) <= 2:
        return word

    # Step 1a: plurals.  Stop at the first match so "caresses" -> "caress"
    # is not stripped again by the shorter "ss"/"s" rules.
    for suffix, replace in (("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")):
        if word.endswith(suffix):
            word = word[:-len(suffix)] + replace
            break

    # Step 1b: "eed" -> "ee" when the stem has m > 0, otherwise drop "ed"/"ing"
    # when the stem contains a vowel.
    stripped = False
    if word.endswith("eed"):
        if m(word[:-3]) > 0:
            word = word[:-1]
    elif word.endswith("ed") and has_vowels(word[:-2]):
        word = word[:-2]
        stripped = True
    elif word.endswith("ing") and has_vowels(word[:-3]):
        word = word[:-3]
        stripped = True

    # Clean-up that only applies after "ed"/"ing" was removed.
    if stripped:
        if word.endswith(("at", "bl", "iz")):
            word += "e"
        elif (len(word) > 1 and word[-1] == word[-2]
              and word[-1] not in "aeioulsz"):
            word = word[:-1]  # Strip the doubled consonant (except l, s, z).
        elif m(word) == 1 and check_o(word):
            word += "e"

    # Step 1c: terminal y -> i when the stem contains a vowel.
    if word.endswith("y") and has_vowels(word[:-1]):
        word = word[:-1] + "i"

    # Steps 2 and 3: suffix replacement when the stem has m > 0.
    for rules in (_STEP2_RULES, _STEP3_RULES):
        for suffix, replace in rules:
            if word.endswith(suffix):
                base = word[:-len(suffix)]
                if m(base) > 0:
                    word = base + replace
                break

    # Step 4: suffix removal when the stem has m > 1; "ion" additionally
    # requires the stem to end in "s" or "t".
    for suffix in _STEP4_SUFFIXES:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            if m(base) > 1 and (suffix != "ion" or base.endswith(("s", "t"))):
                word = base
            break

    # Step 5a: drop a trailing "e" when m > 1, or when m == 1 and the stem
    # does not end consonant-vowel-consonant.
    if word.endswith("e"):
        base = word[:-1]
        measure = m(base)
        if measure > 1 or (measure == 1 and not check_o(base)):
            word = base

    # Step 5b: "ll" -> "l" when m > 1.
    if word.endswith("ll") and m(word) > 1:
        word = word[:-1]
    return word
def remove_symbols(input_string):
    """Return ``input_string`` lower-cased with every non a-z character removed."""
    lowered = input_string.lower()
    # Each run of characters outside a-z is replaced by the empty string.
    cleaned = sub('[^a-z]+', '', lowered)
    return cleaned
def conflete(filename, out_path):
    """Tokenize, stop-word filter, stem and count the terms of one text file.

    Four files are written to ``out_path``, each named after the input
    file's base name with ``.txt`` removed:

    * ``<base>-tokenized.txt``          one line per input line, a-z tokens only
    * ``<base>-stop-word-removed.txt``  tokens not listed in ``stop_words.txt``
    * ``<base>-stemmed.txt``            the remaining tokens after ``stem``
    * ``<base>-tf.txt``                 one ``word frequency`` pair per line

    ``stop_words.txt`` is read from the current working directory and is
    expected to hold comma-separated words.

    Returns the name (not the full path) of the ``-tf.txt`` file.

    Raises IOError, after printing a message, when the input file or the
    stop-word file cannot be opened.
    """
    from ntpath import basename
    from os import path

    stop_words_path = "stop_words.txt"
    base = basename(filename).replace(".txt", "")

    #Step 1: Tokenization i.e. Remove anything that is not alphabetical
    try:
        with open(filename) as input_file:
            lines = input_file.readlines()
    except IOError:
        print("File %s Not Found" % (filename))
        raise

    tokenized_lines = [" ".join(map(remove_symbols, line.split()))
                       for line in lines]
    with open(path.join(out_path, base + "-tokenized.txt"), "w") as token_file:
        for tokenized in tokenized_lines:
            token_file.write(tokenized + "\n")

    #Step 2: Stop Word Removal
    try:
        with open(stop_words_path) as stop_words_file:
            stop_words_text = stop_words_file.read()
    except IOError:
        print("Stop Word File %s Not Found" % (stop_words_path))
        raise

    # Strip whitespace so a trailing newline or "a, b" spacing still matches.
    stop_words = set(entry.strip() for entry in stop_words_text.split(","))
    kept_lines = [[word for word in tokenized.split() if word not in stop_words]
                  for tokenized in tokenized_lines]
    stop_word_fname = base + "-stop-word-removed.txt"
    with open(path.join(out_path, stop_word_fname), "w") as stop_word_file:
        for kept in kept_lines:
            stop_word_file.write(" ".join(kept) + " ")

    #Step 3: Remove suffixes i.e. Stemming
    stemmed_lines = [[stem(word) for word in kept] for kept in kept_lines]
    with open(path.join(out_path, base + "-stemmed.txt"), "w") as stemmed_file:
        for stemmed in stemmed_lines:
            stemmed_file.write(" ".join(stemmed) + " ")

    #Step 4: Count Frequency
    #Uses dictionary as DS where key = term word , value = term word's frequency
    dict_words = {}
    for stemmed in stemmed_lines:
        for word in stemmed:
            if not word:
                continue  #Do not process empty words
            dict_words[word] = dict_words.get(word, 0) + 1

    #Step 5: Save the term frequencies, one "word frequency" pair per line
    result_name = base + "-tf.txt"
    with open(path.join(out_path, result_name), "w") as result:
        for word, freq in dict_words.items():
            result.write("%-25s %d\n" % (word, freq))
    return result_name
def idf(out_path, files):
    """Print a table of per-document frequencies and IDF for every term.

    ``files`` are names of term-frequency files inside ``out_path``; each
    line of such a file is ``word frequency``.  Lines with fewer than two
    fields are skipped.

    For each distinct term one row is printed with:

    * the term,
    * ``[<doc,freq>...]`` listing every document that contains it together
      with the term's frequency in that document (``doc`` is the file name
      with its last 7 characters, the ``-tf.txt`` tail, removed),
    * ``idf = log10(1 + N / DF)`` where N is ``len(files)`` and DF is the
      number of documents containing the term.

    Returns None.
    """
    from math import log10
    from os import path

    #Populate Dictionaries: one {term: frequency} mapping per file
    doc_freqs = []
    for f in files:
        freqs = {}
        with open(path.join(out_path, f)) as handle:
            for line in handle:
                spl = line.split()
                if len(spl) < 2:
                    continue  # blank or malformed line
                freqs[spl[0]] = int(spl[1])
        doc_freqs.append(freqs)

    total_documents = len(files)
    print("-" * 96)
    print("Word\t\tFrequency\t\t\t\tIDF")
    print("-" * 96)

    reported = set()  # terms that already have a row
    for freqs in doc_freqs:
        for key in freqs:
            if key in reported:
                continue
            reported.add(key)
            #-7 means remove last 7 chars from string, since we want to remove "trailing" *-tf.txt (7 chars in total)
            postings = ["<{0},{1}>".format(name[:-7], doc[key])
                        for name, doc in zip(files, doc_freqs)
                        if key in doc]
            in_docs = "[" + "".join(postings) + "]"
            #Formula Used for calculating IDF
            # idf = log (1 + (N / DF) )
            # N = Total Documents.
            # DF = No. of documents term t belongs to
            df = float(len(postings))
            idf_value = log10(1.0 + (total_documents / df))
            print("%-12s\t%-40s%f" % (key, in_docs, idf_value))

    #After loop, conclude the program
    print("-" * 96)
    print("\nThanks for using Python 2.7 implementation of Confletion Algorithm!")

#Path to where all the input files are
input_path = "input" #argv[1]
output_path = "output" #argv[2]

from os import listdir, path
#Call Confletion (Includes Tokenization, Stop-word Removal, Stemming)
#and collect the name of the term-frequency file produced for each input.
freq_files = []
for each_file in listdir(input_path):
    if each_file.startswith("."): continue #Do not process hidden files
    #conflete(inputfile, output folder)
    freq_files.append(conflete(path.join(input_path, each_file), output_path))
#Perform IDF and print results
idf(output_path, freq_files)



