This is my Python implementation of Porter's conflation (stemming) algorithm. The original algorithm can be found on Porter's website.
Code Here:
Code Here:
# Porter stemming ("conflation") pipeline.
#
# For every file in the input folder the script tokenises the text, removes
# stop words, stems the remaining words with Porter's algorithm and writes a
# term-frequency table; it then prints an IDF table across all documents.
#
# NOTE: the code only uses constructs that behave the same on Python 2.7 and
# Python 3 (single-argument print(...), dict.items(), the "in" operator).

import sys
from math import log10
from ntpath import basename
from os import listdir, path
from re import sub

# Comma-separated stop-word list, looked up in the current working directory.
STOP_WORDS_FILE = "stop_words.txt"

# Folders used when the module is run as a script.
input_path = "input"
output_path = "output"

_PLAIN_VOWELS = "aeiou"

# Step 2 rewrite rules, longest suffix first wherever one suffix ends another.
_STEP2_RULES = (
    ('ational', 'ate'), ('tional', 'tion'), ('enci', 'ence'),
    ('anci', 'ance'), ('izer', 'ize'), ('abli', 'able'), ('alli', 'al'),
    ('entli', 'ent'), ('eli', 'e'), ('ousli', 'ous'), ('ization', 'ize'),
    ('ation', 'ate'), ('ator', 'ate'), ('alism', 'al'), ('iveness', 'ive'),
    ('fulness', 'ful'), ('ousness', 'ous'), ('aliti', 'al'),
    ('iviti', 'ive'), ('biliti', 'ble'),
)

# Step 3 rewrite rules.
_STEP3_RULES = (
    ('icate', 'ic'), ('ative', ''), ('alize', 'al'), ('iciti', 'ic'),
    ('ical', 'ic'), ('ful', ''), ('ness', ''),
)

# Step 4 suffixes (all are simply removed); 'ement' > 'ment' > 'ent' ordering
# matters because only the longest matching suffix may be considered.
_STEP4_SUFFIXES = (
    'al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
    'ment', 'ent', 'ion', 'ou', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize',
)


def _is_vowel(word, index):
    """Return True if word[index] acts as a vowel.

    a, e, i, o, u are always vowels; 'y' is a vowel only when it directly
    follows a consonant (so never at the start of the word).
    """
    ch = word[index]
    if ch in _PLAIN_VOWELS:
        return True
    return ch == 'y' and index > 0 and not _is_vowel(word, index - 1)


def m(s):
    """Return the measure of *s*: the number of vowel->consonant transitions.

    The string is lower-cased first. An empty string has measure 0.
    """
    s = s.lower()
    vowel_flags = [_is_vowel(s, i) for i in range(len(s))]
    return sum(
        1 for cur, nxt in zip(vowel_flags, vowel_flags[1:]) if cur and not nxt
    )


def check_o(word):
    """Return True if *word* ends consonant-vowel-consonant (Porter's *o).

    The final consonant must not be 'w', 'x' or 'y'. Words shorter than
    three letters never match.
    """
    if len(word) < 3:
        return False
    last = len(word) - 1
    return (
        not _is_vowel(word, last)
        and word[last] not in 'wxy'
        and _is_vowel(word, last - 1)
        and not _is_vowel(word, last - 2)
    )


def has_vowels(word):
    """Return True if *word* contains at least one vowel (see _is_vowel)."""
    return any(_is_vowel(word, i) for i in range(len(word)))


def _apply_rules(word, rules):
    """Apply the first rule whose suffix matches; rewrite only if m(stem) > 0.

    Only the first (longest) matching suffix is considered: if its measure
    condition fails, the word is returned unchanged.
    """
    for suffix, replacement in rules:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            if m(base) > 0:
                return base + replacement
            return word
    return word


def stem(word):
    """Return the Porter stem of *word*.

    *word* is expected to be lower-case alphabetic (as produced by
    remove_symbols). Words of one or two letters, and the empty string, are
    returned unchanged.
    """
    if len(word) <= 2:
        return word

    # Step 1a: plurals.
    for suffix, replacement in (("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")):
        if word.endswith(suffix):
            word = word[:-len(suffix)] + replacement
            break

    # Step 1b: -eed / -ed / -ing. The rules are alternatives, not a sequence.
    stripped = False
    if word.endswith("eed"):
        # The measure is taken on the stem in front of "eed".
        if m(word[:-3]) > 0:
            word = word[:-1]
    elif word.endswith("ed"):
        if has_vowels(word[:-2]):
            word = word[:-2]
            stripped = True
    elif word.endswith("ing"):
        if has_vowels(word[:-3]):
            word = word[:-3]
            stripped = True

    if stripped:
        # Tidy up the stem left behind by -ed / -ing; exactly one rule applies.
        if word.endswith(("at", "bl", "iz")):
            word += "e"
        elif (len(word) >= 2
              and word[-1] == word[-2]
              and not _is_vowel(word, len(word) - 1)
              and word[-1] not in "lsz"):
            word = word[:-1]  # collapse a double consonant
        elif m(word) == 1 and check_o(word):
            word += "e"

    # Step 1c: terminal y -> i when the stem contains a vowel.
    if word.endswith("y") and has_vowels(word[:-1]):
        word = word[:-1] + "i"

    # Steps 2 and 3: suffix rewrites guarded by m(stem) > 0.
    word = _apply_rules(word, _STEP2_RULES)
    word = _apply_rules(word, _STEP3_RULES)

    # Step 4: drop one suffix when m(stem) > 1 ("ion" also needs s/t before it).
    for suffix in _STEP4_SUFFIXES:
        if word.endswith(suffix):
            base = word[:-len(suffix)]
            if m(base) > 1 and (suffix != 'ion' or base.endswith(('s', 't'))):
                word = base
            break

    # Step 5a: drop a trailing e.
    if word.endswith('e'):
        base = word[:-1]
        base_measure = m(base)
        if base_measure > 1 or (base_measure == 1 and not check_o(base)):
            word = base

    # Step 5b: -ll -> -l for words of measure > 1.
    if word.endswith('ll') and m(word) > 1:
        word = word[:-1]

    return word


def remove_symbols(input_string):
    """Lower-case *input_string* and delete every character outside a-z."""
    return sub('[^a-z]+', '', input_string.lower())


def _tokenize(lines, dst_path):
    """Write *lines* to *dst_path* with every token reduced to a-z letters."""
    with open(dst_path, "w") as dst:
        for line in lines:
            dst.write(" ".join(map(remove_symbols, line.split())) + "\n")


def _drop_stop_words(src_path, dst_path, stop_words):
    """Copy words from *src_path* to *dst_path*, skipping *stop_words*."""
    with open(src_path) as src:
        with open(dst_path, "w") as dst:
            for line in src:
                kept = [word for word in line.split() if word not in stop_words]
                dst.write(" ".join(kept) + " ")


def _stem_file(src_path, dst_path):
    """Write the stem of every word in *src_path* to *dst_path*."""
    with open(src_path) as src:
        with open(dst_path, "w") as dst:
            for line in src:
                dst.write(" ".join(stem(word) for word in line.split()) + " ")


def _count_terms(src_path):
    """Return a dict mapping each word in *src_path* to its frequency."""
    counts = {}
    with open(src_path) as src:
        for line in src:
            for word in line.split():
                counts[word] = counts.get(word, 0) + 1
    return counts


def conflete(filename, out_path):
    """Run the conflation pipeline on one document.

    Writes four files into *out_path*, named after the input file with
    ".txt" removed: "<base>-tokenized.txt", "<base>-stop-word-removed.txt",
    "<base>-stemmed.txt" and "<base>-tf.txt" (one "word frequency" row per
    stemmed term).

    Returns the name (not the full path) of the "-tf.txt" file.

    Prints a message and exits the interpreter if *filename* or the
    stop-word file cannot be opened.
    """
    try:
        with open(filename) as input_file:
            lines = input_file.readlines()
    except IOError:
        print("File %s Not Found" % (filename))
        sys.exit()

    # ntpath.basename accepts both "/" and "\\" as separators.
    base = basename(filename).replace(".txt", "")
    token_path = path.join(out_path, base + "-tokenized.txt")
    stop_word_path = path.join(out_path, base + "-stop-word-removed.txt")
    stemmed_path = path.join(out_path, base + "-stemmed.txt")
    result_name = base + "-tf.txt"

    # Step 1: tokenization, i.e. remove anything that is not alphabetical.
    _tokenize(lines, token_path)

    # Step 2: stop-word removal.
    try:
        with open(STOP_WORDS_FILE) as stop_file:
            stop_words_text = stop_file.read()
    except IOError:
        print("Stop Word File %s Not Found" % (STOP_WORDS_FILE))
        sys.exit()
    # Strip each entry so "a, the\n" yields "the" rather than " the\n".
    stop_words = set(entry.strip() for entry in stop_words_text.split(","))
    _drop_stop_words(token_path, stop_word_path, stop_words)

    # Step 3: stemming.
    _stem_file(stop_word_path, stemmed_path)

    # Step 4: term frequencies.
    dict_words = _count_terms(stemmed_path)

    # Step 5: save the table.
    with open(path.join(out_path, result_name), "w") as result:
        for word, freq in dict_words.items():
            result.write("%-25s %d" % (word, freq) + "\n")
    return result_name


def idf(out_path, files):
    """Print a table of every term with its per-document counts and IDF.

    *files* are names of "-tf.txt" files inside *out_path*, each row holding
    a word and its integer frequency. For each term one row is printed:
    the word, a list of "<document,frequency>" pairs for every document
    containing it, and idf = log10(1 + N / DF), where N is the number of
    documents and DF the number of documents containing the term.
    """
    tables = []
    for name in files:
        table = {}
        with open(path.join(out_path, name)) as handle:
            for line in handle:
                fields = line.split()
                if len(fields) < 2:
                    continue  # blank or malformed row
                table[fields[0]] = int(fields[1])
        tables.append(table)

    total_documents = len(files)
    print("-" * 96)
    print("Word\t\tFrequency\t\t\t\tIDF")
    print("-" * 96)

    reported = set()
    for table in tables:
        for key in sorted(table):
            if key in reported:
                continue  # already printed with an earlier document
            reported.add(key)
            # [:-7] drops the trailing "-tf.txt" to recover the document name.
            postings = [
                "<{0},{1}>".format(files[y][:-7], tables[y][key])
                for y in range(len(tables))
                if key in tables[y]
            ]
            df = float(len(postings))
            idf_value = log10(1.0 + (total_documents / df))
            in_docs = "[" + "".join(postings) + "]"
            print("%-12s\t%-40s%f" % (key, in_docs, idf_value))

    print("-" * 96)
    print("\nThanks for using Python 2.7 implementation of Confletion Algorithm!")


def main():
    """Conflate every non-hidden file in input_path, then print the IDF table."""
    freq_files = []
    for each_file in listdir(input_path):
        if each_file.startswith("."):
            continue  # do not process hidden files
        freq_files.append(conflete(path.join(input_path, each_file), output_path))
    idf(output_path, freq_files)


if __name__ == "__main__":
    main()
Comments
Post a Comment