Skip to main content

Python Program for the Conflation (Porter Stemming) Algorithm.

This is my Python implementation of Porter's conflation (stemming) algorithm. The original algorithm can be found on Porter's website.

Code Here:
#from types import DictionaryType
from re import sub
def m(s):
    """Return the Porter "measure" of word *s*.

    Counts vowel-to-consonant transitions in the lowercased word, which
    equals the number of VC sequences (m in Porter's notation).
    NOTE(review): 'y' is always treated as a vowel here, whereas Porter
    treats 'y' as a consonant at a syllable start -- confirm this
    simplification is intended throughout the module.
    """
    vowels = set('aeiouy')
    s = s.lower()
    # A VC boundary exists wherever a vowel is immediately followed by
    # a consonant; zip pairs each character with its successor.
    return sum(1 for a, b in zip(s, s[1:])
               if a in vowels and b not in vowels)
def check_o(word):
    """Porter's *o condition.

    True when *word* ends consonant-vowel-consonant and the final
    consonant is not 'w', 'x' or 'y' (e.g. -wil, -hop).
    """
    vowels = ['a','e','i','o','u','y']
    last = len(word)-1
    # A c-v-c tail needs at least 3 characters, i.e. last >= 2.  The
    # original required last > 2 (4+ letters), which wrongly rejected
    # valid 3-letter stems such as "hop" -- Porter's own example.
    if last >= 2:
        return (word[last] not in vowels and
            word[last] not in ['w','x','y'] and
            word[last-1] in vowels and
            word[last-2] not in vowels)
    return False
def has_vowels(word):
    """Return True when *word* contains at least one Porter vowel.

    'y' counts as a vowel throughout this module.
    """
    for ch in word:
        if ch in 'aeiouy':
            return True
    return False
def stem(word):
    """Reduce *word* to its stem via a Porter-style suffix-stripping
    pass (steps 1a through 5b).

    Expects a lowercase alphabetic word; returns the stemmed string.
    NOTE(review): several places below deviate from the published
    Porter algorithm -- each is flagged inline; confirm whether the
    deviations are intended.
    """
    
    # Single-letter words cannot be stemmed further.
    if len(word) == 1: return word
    
    ########
    #Step 1a
    ########
    # Plural handling: sses->ss, ies->i, ss->ss, s->"".  The no-op
    # "ss"->"ss" entry exists to stop the bare "s" rule from firing on
    # words already ending in "ss"; only the first matching suffix runs.
    pre = ["sses","ies","ss","s"]
    post = ["ss","i","ss",""]
    for suffix, replace in zip(pre,post):
        if word.endswith(suffix):
            word = word[:len(suffix)*-1] + replace
            break
    ########
    #Step 1b
    ########
    # (m>0) eed -> ee: drop only the trailing 'd'.
    if(m(word) > 0 and word.endswith("eed")): word = word[:-1]
    flag = False
    # (*v*) ed -> "": only when the part before "ed" contains a vowel.
    if has_vowels(word[:-2]) and word.endswith("ed"):
        word = word[:-2]
        flag = True
    
    # (*v*) ing -> "": same vowel requirement for the part before "ing".
    if has_vowels(word[:-3]) and word.endswith("ing"):
        word = word[:-3]
        flag = True
    
    # Clean-up rules apply only when "ed"/"ing" was actually removed.
    if(flag):
        flag = False
        # at -> ate, bl -> ble, iz -> ize (restore the elided 'e').
        if(word.endswith("at") or word.endswith("bl") or word.endswith("iz")):
            word += "e"
        last = len(word) - 1
        # (*d and not (*l or *s or *z)): collapse a doubled trailing
        # consonant, e.g. "hopp" -> "hop".
        if(word[last] == word[last-1] and word[last] not in ['l','s','z']):
            word = word[:-1] #Strip the double char.
    
    last = len(word) - 1
    if(last>2):
        # (m=1 and *o) -> add 'e'.  NOTE(review): in Porter's step 1b
        # this fires only when "ed"/"ing" was removed; here it runs
        # regardless of `flag`, and only for words of 4+ letters --
        # confirm intended.
        if (m(word) == 1 and check_o(word)):
            word += 'e'
    
    ########
    #Rule 1c
    ########
    
    # (*v*) y -> i.  NOTE(review): `last` was computed before the
    # possible "+= 'e'" above, so when an 'e' was appended this indexes
    # the second-to-last character, not the final one -- confirm.
    if(word[last] == 'y' and has_vowels(word[:-1])): word = word[:-1] + 'i'

    #######
    #Rule 2
    #######
    
    # Map "double" suffixes to single ones when the remaining stem has
    # measure m > 0; only the first matching suffix is applied.
    pre = ['ational','tional','enci','anci','izer','abli','alli','entli','eli','ousli','ization','ation','ator','alism','iveness','fulness','ousness','aliti','iviti','biliti']
    post = ['ate','tion','ence','ance','ize','able','al','ent','e','ous','ize','ate','ate','al','ive','ful','ous','al','ive','ble']
    
    # NOTE(review): split()/rsplit() cut at an *occurrence* of the
    # suffix, not necessarily at the word's end, so a word containing
    # the suffix twice is truncated at the first occurrence;
    # word[:-len(suffix)] would be the safe form -- confirm intended.
    for suffix, replace in zip(pre,post):
        if word.endswith(suffix) and m(word.rsplit(suffix)[0]) > 0:
            word = word.split(suffix)[0]+replace
            break
    
    #######
    #Rule 3
    #######
    
    # Strip or simplify derivational suffixes (same m > 0 guard and the
    # same split()-based truncation caveat as Rule 2).
    pre = ['icate','ative','alize','iciti','ical','ful','ness']
    post = ['ic','','al','ic','ic','','']
    for suffix, replace in zip(pre,post):
        if word.endswith(suffix) and m(word.rsplit(suffix)[0]) > 0:
            word = word.split(suffix)[0]+replace
            break
    
    #######
    #Rule 4
    #######
    
    # Remove residual suffixes entirely, but only from longer stems
    # (m > 1).
    pre = ['al','ance','ence','er','ic','able','ible','ant','ement','ment','ent']
    replace = ""
    for suffix in pre:
        if word.endswith(suffix) and m(word.rsplit(suffix)[0]) > 1:
            word = word.split(suffix)[0]+replace
            break
    
    # (m>1 and (*s or *t)) ion -> "": "ion" drops only after 's'/'t'.
    if(word.endswith('ion')):
        splt = word.split('ion')[0]
        if (splt.endswith('s') or splt.endswith('t')) and m(splt) > 1:
            word = splt
    
    # Second batch of m > 1 deletions.
    pre = ['ou','ism','ate','iti','ous','ive','ize']
    replace = ""
    for suffix in pre:
        if word.endswith(suffix) and m(word.rsplit(suffix)[0]) > 1:
            word = word.split(suffix)[0]+replace
            break
    
    ########
    #Step 5a
    ########
    # (m>1) e -> "".  NOTE(review): the elif tests check_o on the
    # word *with* its trailing 'e' (a vowel), so check_o is always
    # False there and the branch is dead; Porter's rule is
    # "(m=1 and not *o) E -> ''" on the stripped stem -- confirm.
    if word.endswith('e'):
        if m(word[:-1])>1:
            word = word[:-1] #Strip the trailing e
        elif(m(word[:-1]) == 1 and check_o(word)):
            word = word[:-1] #Strip the trailing e
            
    ########
    #Step 5b
    ########
    # Collapse trailing "ll" to "l".  NOTE(review): Porter measures the
    # whole word (m(word) > 1); this measures the stem minus both 'l's
    # -- confirm intended.
    if word.endswith('ll') and m(word[:-2]) > 1:
        word = word[:-1]
    
    return word
def remove_symbols(input_string):
    """Lowercase *input_string* and delete every character outside a-z."""
    lowered = input_string.lower()
    return sub('[^a-z]+', '', lowered)
def conflete(filename, out_path):
    """Run the full conflation (stemming) pipeline on one input file.

    Steps: tokenize *filename* (drop non-alphabetic characters), remove
    stop words read comma-separated from "stop_words.txt" in the
    current working directory, stem every remaining word, then count
    term frequencies.  Each stage is written to *out_path* as
    "<base>-tokenized.txt", "<base>-stop-word-removed.txt" and
    "<base>-stemmed.txt"; the final counts go to "<base>-tf.txt".

    Returns the term-frequency file name (without directory).
    Raises SystemExit after printing a message when the input file or
    the stop-word file cannot be opened (matching the original exit()).
    """
    from os import path
    from ntpath import basename

    file_base_name = basename(filename)
    base_name = file_base_name.replace(".txt", "")

    ###################################################################
    #Step 1: Tokenization i.e. Remove anything that is not alphabetical
    ###################################################################
    try:
        input_file = open(filename)
    except IOError:
        print("File %s Not Found" % (filename))
        raise SystemExit

    token_fname = base_name + "-tokenized.txt"
    # "with" guarantees the handles are closed even if a write fails
    # (the original relied on explicit close() calls).
    with input_file:
        with open(path.join(out_path, token_fname), "w") as token_file:
            for line in input_file:
                if line:
                    tokenized = " ".join(map(remove_symbols, line.split()))
                    token_file.write(tokenized + "\n")

    ##########################
    #Step 2: Stop Word Removal
    ##########################
    try:
        with open("stop_words.txt") as stop_handle:
            stop_words = set(stop_handle.read().split(","))
    except IOError:
        # Fixed: the original message reported the *input* file name
        # here instead of the stop-word file.
        print("Stop Word File stop_words.txt Not Found")
        raise SystemExit

    stop_word_fname = base_name + "-stop-word-removed.txt"
    with open(path.join(out_path, token_fname)) as token_file:
        with open(path.join(out_path, stop_word_fname), "w") as stop_word_file:
            for line in token_file:
                kept = [word for word in line.split() if word not in stop_words]
                stop_word_file.write(" ".join(kept) + " ")

    ######################################
    #Step 3: Remove suffixes i.e. Stemming
    ######################################
    stemmed_fname = base_name + "-stemmed.txt"
    with open(path.join(out_path, stop_word_fname)) as stop_word_file:
        with open(path.join(out_path, stemmed_fname), "w") as stemmed_file:
            for line in stop_word_file:
                stemmed = [stem(word) for word in line.split()]
                stemmed_file.write(" ".join(stemmed) + " ")

    ########################
    #Step 4: Count Frequency
    ########################
    #Dictionary where key = term word, value = term word's frequency
    dict_words = {}
    with open(path.join(out_path, stemmed_fname)) as stemmed_file:
        for line in stemmed_file:
            for word in line.split():
                if not word: continue #Do not process empty words
                dict_words[word] = dict_words.get(word, 0) + 1

    ##########################################################
    #Step 5: Save the frequencies in a tabular format
    ##########################################################
    result_name = base_name + "-tf.txt"
    with open(path.join(out_path, result_name), "w") as result:
        # .items() (not Py2-only .iteritems()) keeps this body runnable
        # under both Python 2 and 3.
        for word, freq in dict_words.items():
            result.write("%-25s %d\n" % (word, freq))

    return result_name
def idf(out_path, files):
    """Print an IDF table for the term-frequency files *files* in *out_path*.

    Each file must contain "word frequency" lines as written by
    conflete().  For every distinct word the function prints the list
    of documents it appears in, its combined frequency, and

        idf = log10(1 + N / DF)

    where N = len(files) and DF = number of files containing the word.
    The loaded dictionaries are consumed (popitem) during printing;
    nothing is returned.
    """
    from os import path
    from math import log10

    # Load each "-tf.txt" file into its own {word: frequency} dict.
    conf_dict = []
    for f in files:
        counts = {}
        with open(path.join(out_path, f)) as handle:
            for line in handle:
                spl = line.split()
                counts[spl[0]] = int(spl[1])
        conf_dict.append(counts)

    total_documents = len(files)
    num_docs = len(conf_dict)

    print("-" * 96)
    print("Word\t\tFrequency\t\t\t\tIDF")
    print("-" * 96)

    for x in range(num_docs):
        # Drain dictionary x; each word found is also popped from the
        # other dictionaries so it is reported exactly once.
        while True:
            try:
                item = conf_dict[x].popitem()
            except KeyError:
                break  # dictionary x exhausted

            key, val = item
            df = 1.0  # the word occurs at least in document x
            ###########################################
            #Formula Used for calculating IDF
            # idf = log (1 + (N / DF) )
            # N = Total Documents.
            # DF = No. of documents term t belongs to
            ###########################################
            old_val = val
            in_docs = "["
            for y in range(num_docs):
                if x == y: continue
                # `in` replaces Py2-only dict.has_key(); same semantics.
                if key in conf_dict[y]:
                    freq = conf_dict[y].pop(key)
                    # files[y][:-7] strips the trailing "-tf.txt" (7 chars)
                    # to recover the document's base name.
                    # NOTE(review): this formats the running total `val`,
                    # not document y's own `freq` -- confirm intended.
                    in_docs += "<{0},{1}>".format(files[y][:-7], val)
                    val += freq
                    df += 1

            if old_val == val: #Means the word is present in only 1 file
                in_docs = "[<{0},{1}>]".format(files[x][:-7], val)
            else:
                in_docs += "]"
            # Renamed from `idf` so the local no longer shadows this
            # function.  df is a float, so the division is exact.
            idf_value = log10(1.0 + (total_documents/df))
            print("%-12s\t%-40s%f" % (key, in_docs, idf_value))

    #After loop, conclude the program
    print("-" * 96)
    print("\nThanks for using Python 2.7 implementation of Confletion Algorithm!")


#Path to where all the input files are (hard-coded; the commented
#argv hints suggest these were meant to come from the command line).
input_path = "input" #argv[1]
output_path = "output" #argv[2]

from os import listdir, path

#####################################################################
#Call Confletion (Includes Tokenization, Stop-word Removal, Stemming)
#####################################################################
# Run the pipeline on every visible file in the input directory and
# collect the generated "-tf.txt" file names.  (The original also kept
# an `i` counter here that was never read; removed.)
freq_files = []
for each_file in listdir(input_path):
    if each_file.startswith("."): continue #Do not process hidden files
    freq_files.append(conflete(path.join(input_path, each_file), output_path))

##############################
#Perform IDF and print results
##############################

idf(output_path, freq_files)

Comments

Popular posts from this blog

Selenium + Python + UnexpectedAlertPresentException: Dealing with annoying alerts

Handling  UnexpectedAlertPresentException   Alerts who hates them? I Do!  Who doesn't hate an annoying alert causing your tests / scraping job to fail? I must say they are pretty much on point on the Unexpected part!  Fortunately, there are easy ways to mitigate the issue. 1. Disable alerts completely: driver . execute_script( 'window.alert = function(){};' ); execute this script just before where you anticipate the alert and you're golden. 2. You want to see the alert text but not disturb the execution flow. driver . execute_script( 'window.alert = console.info;' ); Now the alerts have been redirected to the console and you don't have to worry about them. (Unless you have to - then you'd have to monitor the console) 3. You know exactly when it comes and want to accept the alert and move on. 1 2 3 4 5 6 7 8 9 10 11 12 13 14 from selenium import webdriver from selenium.webdriver.s...

4. Lex and Yacc Program to detect errors in a 'C' Language Program

Lex and Yacc Program to detect errors in a 'C' Language Program   Lex Code : %{ #include"y.tab.h" #include<stdio.h> int LineNo = 1 ; %} identifier [ a - zA - Z ][ _a - zA - Z0 - 9 ]* number [ 0 - 9 ]+|([ 0 - 9 ]*\.[ 0 - 9 ]+) %% main \(\) return MAIN ; if return IF ; else return ELSE ; while return WHILE ; int | char | flaot return TYPE ; { identifier } return VAR ; { number } return NUM ; \> | \< | \<= | \>= | == return RELOP ; [\ t ] ; [\ n ] LineNo ++; . return yytext [ 0 ]; %% Yacc Code : %{ #include<string.h> #include<stdio.h> extern int LineNo ; int errno = 0 ; %} % token NUM VAR RELOP % token MAIN IF ELSE WHILE TYPE % left '-' '+' % left '*' '/' %% PROGRAM : MAIN BLOCK ; BLOCK : '{' CODE '}' ; CODE : BLOCK | STATEMENT CODE | STATEMENT ; STATEMENT : DECST ';' | DECST { printf ( "\nLine number %d...

Python Program for Soundex Algorithm

This is a python implementation for Soundex Algorithm. This Program builds a JSON document as a dictionary and is kept on building at every execution. Its constantly appended and referenced while the program  is executed. Program: from re import sub def remove_symbols (input_string): #Convert the characters to lower case and then use #Regular expressions to remove non a-z chars return sub( '[^A-Z]+' , '' , input_string) def clean (input_string): #Convert the characters to lower case and then use #Regular expressions to remove non a-z chars return sub( '[^a-z]+' , '' , input_string . lower()) word = "Input" def soundex (word): #Step 1: Capitalize all letters in the word and drop all punctuation marks. word = remove_symbols(word . upper()) #Step 2: Retain the first letter of the word. first_letter = word[ 0 ] word = word[ 1 :] #Step 3 & 4: Change ( 'A...