Skip to main content

Python Program for the Conflation Algorithm

This is my Python implementation of Porter's conflation (stemming) algorithm. The original algorithm is described on Porter's website.

Code Here:
#from types import DictionaryType
from re import sub
def m(s):
    """Return Porter's measure of ``s``.

    Writing a word as ``[C](VC){m}[V]`` (C = run of consonants, V = run of
    vowels), the measure is ``m``: the number of times a vowel is directly
    followed by a consonant.

    The letters a, e, i, o, u are vowels.  'y' is a vowel only when the
    letter before it is a consonant; at the start of the word or after a
    vowel it counts as a consonant (so "toy" has measure 1, "by" has 0).
    Any other character is treated as a consonant.  Matching is
    case-insensitive.
    """
    measure = 0
    prev_vowel = False      # previous character was a vowel
    prev_consonant = False  # previous character was a consonant (False at start)
    for ch in s.lower():
        is_vowel = ch in "aeiou" or (ch == "y" and prev_consonant)
        # Each vowel -> consonant step closes one "VC" group.
        if prev_vowel and not is_vowel:
            measure += 1
        prev_vowel = is_vowel
        prev_consonant = not is_vowel
    return measure
def check_o(word):
    """Return True when ``word`` satisfies Porter's ``*o`` condition.

    ``*o`` holds when the word ends consonant-vowel-consonant and the final
    consonant is not 'w', 'x' or 'y' (e.g. "hop", "fil").  Words shorter
    than three letters can never match.  'y' is treated as a vowel here,
    and the check is case-sensitive (lower-case input is expected).
    """
    vowels = ('a', 'e', 'i', 'o', 'u', 'y')
    # Three letters are enough for a c-v-c ending.
    if len(word) < 3:
        return False
    before, middle, final = word[-3], word[-2], word[-1]
    return (final not in vowels and
            final not in ('w', 'x', 'y') and
            middle in vowels and
            before not in vowels)
def has_vowels(word):
    """Return True if any character of ``word`` is one of a, e, i, o, u, y.

    The comparison is case-sensitive; an empty ``word`` yields False.
    """
    vowel_letters = ('a', 'e', 'i', 'o', 'u', 'y')
    for letter in word:
        if letter in vowel_letters:
            return True
    return False
def _apply_suffix_rules(word, rules, threshold):
    """Apply the first ``(suffix, replacement)`` rule whose suffix ends ``word``.

    ``rules`` must list longer suffixes before shorter ones that they end
    with.  The replacement happens only when the measure of the part before
    the suffix exceeds ``threshold``; once a suffix has matched, no further
    rule is tried even if the measure test fails.
    """
    for suffix, replacement in rules:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            if m(base) > threshold:
                return base + replacement
            return word
    return word


def stem(word):
    """Return the Porter stem of ``word``.

    ``word`` is expected to be lower-case letters only.  Words of one or
    two letters (and the empty string) are returned unchanged.  The steps
    follow Porter's suffix-stripping algorithm:

    * 1a  plurals ("caresses" -> "caress", "ponies" -> "poni")
    * 1b  "-eed", "-ed", "-ing" plus the follow-up clean-up rules
    * 1c  terminal "y" -> "i" when the stem has a vowel
    * 2-4 successively shorter derivational suffixes, gated on the measure
    * 5   removal of a final "e" and of a doubled final "l"

    In every step the suffix is removed only from the END of the word, and
    at most one rule of a step is applied.
    """
    # Very short words carry no strippable suffix.
    if len(word) <= 2:
        return word

    ########
    #Step 1a
    ########
    for suffix, replace in (("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")):
        if word.endswith(suffix):
            word = word[:-len(suffix)] + replace
            break

    ########
    #Step 1b
    ########
    if word.endswith("eed"):
        # "eed" matched, so "ed" must not be tried even if the measure fails
        # (keeps "feed" intact).
        if m(word[:-3]) > 0:
            word = word[:-1]
    else:
        stripped = None
        if word.endswith("ed") and has_vowels(word[:-2]):
            stripped = word[:-2]
        elif word.endswith("ing") and has_vowels(word[:-3]):
            stripped = word[:-3]

        if stripped is not None:
            word = stripped
            # Clean-up rules are alternatives: only the first match applies.
            if word.endswith(("at", "bl", "iz")):
                word += "e"
            elif (len(word) > 1 and word[-1] == word[-2] and
                  not has_vowels(word[-1]) and
                  word[-1] not in ('l', 's', 'z')):
                word = word[:-1]  # Strip the doubled consonant.
            elif m(word) == 1 and check_o(word):
                word += "e"

    ########
    #Step 1c
    ########
    if word.endswith("y") and has_vowels(word[:-1]):
        word = word[:-1] + "i"

    #######
    #Step 2
    #######
    step2_rules = (
        ("ational", "ate"), ("tional", "tion"), ("enci", "ence"),
        ("anci", "ance"), ("izer", "ize"), ("abli", "able"),
        ("alli", "al"), ("entli", "ent"), ("eli", "e"), ("ousli", "ous"),
        ("ization", "ize"), ("ation", "ate"), ("ator", "ate"),
        ("alism", "al"), ("iveness", "ive"), ("fulness", "ful"),
        ("ousness", "ous"), ("aliti", "al"), ("iviti", "ive"),
        ("biliti", "ble"),
    )
    word = _apply_suffix_rules(word, step2_rules, 0)

    #######
    #Step 3
    #######
    step3_rules = (
        ("icate", "ic"), ("ative", ""), ("alize", "al"), ("iciti", "ic"),
        ("ical", "ic"), ("ful", ""), ("ness", ""),
    )
    word = _apply_suffix_rules(word, step3_rules, 0)

    #######
    #Step 4
    #######
    step4_suffixes = (
        "al", "ance", "ence", "er", "ic", "able", "ible", "ant",
        "ement", "ment", "ent", "ion", "ou", "ism", "ate", "iti",
        "ous", "ive", "ize",
    )
    for suffix in step4_suffixes:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            # "ion" is only removed after an 's' or a 't'.
            ion_ok = suffix != "ion" or base.endswith(("s", "t"))
            if ion_ok and m(base) > 1:
                word = base
            break

    ########
    #Step 5a
    ########
    if word.endswith("e"):
        base = word[:-1]
        measure = m(base)
        # Drop the 'e' for long stems, or for measure-1 stems that do not
        # end consonant-vowel-consonant.
        if measure > 1 or (measure == 1 and not check_o(base)):
            word = base

    ########
    #Step 5b
    ########
    if word.endswith("ll") and m(word) > 1:
        word = word[:-1]

    return word
def remove_symbols(input_string):
    """Lower-case ``input_string`` and drop every character outside a-z."""
    lowered = input_string.lower()
    # Runs of non a-z characters are replaced by the empty string.
    cleaned = sub('[^a-z]+', '', lowered)
    return cleaned
def conflete(filename, out_path):
    """Run tokenization, stop-word removal, stemming and term counting.

    For an input file ``<base>.txt`` the following files are written into
    ``out_path`` (which must already exist):

    * ``<base>-tokenized.txt``          one line per input line, each
                                        whitespace-separated token reduced
                                        to lower-case a-z
    * ``<base>-stop-word-removed.txt``  tokens not listed in
                                        ``stop_words.txt``
    * ``<base>-stemmed.txt``            the remaining tokens passed
                                        through ``stem``
    * ``<base>-tf.txt``                 one ``term frequency`` row per
                                        distinct stem

    ``stop_words.txt`` is read from the current working directory and is
    expected to hold comma-separated words.

    Returns the name (not the full path) of the ``-tf.txt`` file.
    Prints a message and raises ``SystemExit`` when the input file or the
    stop-word file cannot be opened.
    """
    from ntpath import basename
    from os import path

    ###################################################################
    #Step 1: Tokenization i.e. Remove anything that is not alphabetical
    ###################################################################
    try:
        input_file = open(filename)
    except IOError:
        print("File %s Not Found" % (filename))
        raise SystemExit(1)

    file_base_name = basename(filename)
    name_root = file_base_name.replace(".txt", "")
    token_path = path.join(out_path, name_root + "-tokenized.txt")

    with input_file, open(token_path, "w") as token_file:
        for line in input_file:
            tokenized = " ".join(map(remove_symbols, line.split()))
            token_file.write(tokenized + "\n")

    ##########################
    #Step 2: Stop Word Removal
    ##########################
    stop_words_path = "stop_words.txt"
    try:
        with open(stop_words_path) as stop_words_file:
            stop_words_text = stop_words_file.read()
    except IOError:
        print("Stop Word File %s Not Found" % (stop_words_path))
        raise SystemExit(1)

    # Tokens are bare lower-case words, so normalise the list the same way;
    # otherwise entries written as ", the" or followed by a newline never match.
    stop_words = set(entry.strip().lower() for entry in stop_words_text.split(","))

    stop_word_path = path.join(out_path, name_root + "-stop-word-removed.txt")
    with open(token_path) as token_file, open(stop_word_path, "w") as stop_word_file:
        for line in token_file:
            kept = [word for word in line.split() if word not in stop_words]
            stop_word_file.write(" ".join(kept) + " ")

    ######################################
    #Step 3: Remove suffixes i.e. Stemming
    ######################################
    stemmed_path = path.join(out_path, name_root + "-stemmed.txt")
    with open(stop_word_path) as stop_word_file, open(stemmed_path, "w") as stemmed_file:
        for line in stop_word_file:
            stems = [stem(word) for word in line.split()]
            stemmed_file.write(" ".join(stems) + " ")

    ########################
    #Step 4: Count Frequency
    ########################
    #key = term word, value = term word's frequency
    dict_words = {}
    with open(stemmed_path) as stemmed_file:
        for line in stemmed_file:
            for word in line.split():
                dict_words[word] = dict_words.get(word, 0) + 1

    ##########################################################
    #Step 5: Save the term/frequency table as <base>-tf.txt
    ##########################################################
    result_name = name_root + "-tf.txt"
    with open(path.join(out_path, result_name), "w") as result:
        for word, freq in dict_words.items():
            result.write("%-25s %d" % (word, freq) + "\n")

    return result_name
def idf(out_path, files):
    """Print a term table with per-document frequencies and the IDF.

    ``files`` are names of ``*-tf.txt`` files inside ``out_path``; each
    non-blank line holds a term followed by its integer frequency.  Every
    distinct term is printed once, with a ``[<doc,freq>...]`` list covering
    each document that contains it (``doc`` is the file name without its
    trailing 7 characters, i.e. without "-tf.txt") and with

        idf = log10(1 + N / DF)

    where N is the number of files and DF the number of files containing
    the term.  Nothing is returned.
    """
    from math import log10
    from os import path

    #Populate one {term: frequency} dictionary per file
    conf_dict = []
    for f in files:
        counts = {}
        with open(path.join(out_path, f)) as handle:
            for line in handle:
                spl = line.split()
                if len(spl) < 2:
                    continue  # blank or malformed row
                counts[spl[0]] = int(spl[1])
        conf_dict.append(counts)

    total_documents = len(files)

    print("-" * 96)
    print("Word\t\tFrequency\t\t\t\tIDF")
    print("-" * 96)

    reported = set()  # terms already printed for an earlier document
    for x, counts in enumerate(conf_dict):
        for key, val in counts.items():
            if key in reported:
                continue
            reported.add(key)

            #-7 removes the trailing "-tf.txt" (7 chars) from the file name
            postings = ["<{0},{1}>".format(files[x][:-7], val)]
            # Earlier documents cannot hold an unreported term, so only
            # the later ones need checking.
            for y in range(x + 1, total_documents):
                if key in conf_dict[y]:
                    postings.append(
                        "<{0},{1}>".format(files[y][:-7], conf_dict[y][key]))

            df = float(len(postings))
            in_docs = "[" + "".join(postings) + "]"
            idf_value = log10(1.0 + (total_documents / df))
            print("%-12s\t%-40s%f" % (key, in_docs, idf_value))

    #After loop, conclude the program
    print("-" * 96)
    print("\nThanks for using Python 2.7 implementation of Confletion Algorithm!")


#Path to where all the input files are
input_path = "input" #argv[1]
output_path = "output" #argv[2]

from os import listdir, path

#####################################################################
#Call Confletion (Includes Tokenization, Stop-word Removal, Stemming)
#####################################################################
# listdir() returns names in arbitrary order; sorting keeps the document
# order (and therefore the printed table) stable between runs.
freq_files = []
for each_file in sorted(listdir(input_path)):
    if each_file.startswith("."):
        continue #Do not process hidden files
    #conflete(inputfile, output folder) returns the name of the -tf.txt file
    freq_files.append(conflete(path.join(input_path, each_file), output_path))

##############################
#Perform IDF and print results
##############################

idf(output_path, freq_files)

Comments

Popular posts from this blog

Selenium + Python + UnexpectedAlertPresentException: Dealing with annoying alerts

Handling  UnexpectedAlertPresentException   Alerts who hates them? I Do!  Who doesn't hate an annoying alert causing your tests / scraping job to fail? I must say they are pretty much on point on the Unexpected part!  Fortunately, there are easy ways to mitigate the issue. 1. Disable alerts completely: driver . execute_script( 'window.alert = function(){};' ); execute this script just before where you anticipate the alert and you're golden. 2. You want to see the alert text but not disturb the execution flow. driver . execute_script( 'window.alert = console.info;' ); Now the alerts have been redirected to the console and you don't have to worry about them. (Unless you have to - then you'd have to monitor the console) 3. You know exactly when it comes and want to accept the alert and move on. 1 2 3 4 5 6 7 8 9 10 11 12 13 14 from selenium import webdriver from selenium.webdriver.s

Python Program for Soundex Algorithm

This is a python implementation for Soundex Algorithm. This Program builds a JSON document as a dictionary and is kept on building at every execution. Its constantly appended and referenced while the program  is executed. Program: from re import sub def remove_symbols (input_string): #Convert the characters to lower case and then use #Regular expressions to remove non a-z chars return sub( '[^A-Z]+' , '' , input_string) def clean (input_string): #Convert the characters to lower case and then use #Regular expressions to remove non a-z chars return sub( '[^a-z]+' , '' , input_string . lower()) word = "Input" def soundex (word): #Step 1: Capitalize all letters in the word and drop all punctuation marks. word = remove_symbols(word . upper()) #Step 2: Retain the first letter of the word. first_letter = word[ 0 ] word = word[ 1 :] #Step 3 & 4: Change ( 'A&#

weather report

/* Problem Statement: Create a class named weather report that holds a daily weather report with data members day_of_month,hightemp,lowtemp,amount_rain and amount_snow. The constructor initializes the fields with default values: 99 for day_of_month, 999 for hightemp,-999 for low temp and 0 for amount_rain and amount_snow. Include a function that prompts the user and sets values for each field so that you can override the default values. Write a program that creates a monthly report. */ #include #include class weather { public: int day_of_month[50]; int high_temp[50]; int low_temp[50]; int amount_rain[50]; int amount_snow[50]; weather() //defination { day_of_month[0]=99; high_temp[0]=999; low_temp[0]=-999; amount_rain[0]=amount_snow[0]=0; } void get_data(int n); void put_data(int n); void average(int n); }; void weather:: average(int n) { int min ,max,total_rainfall,total_snowfall; total_rainfall=0; tot