Python Program for Soundex Algorithm

This is a Python implementation of the Soundex algorithm.

This program stores a JSON document (loaded as a dictionary) that maps each Soundex code to the words seen so far. The document grows with every execution: it is read at start-up, consulted and extended while the program runs, and written back at the end.


from re import sub
def remove_symbols(input_string):
    """Return input_string with everything except uppercase A-Z removed.

    No case conversion happens here: lowercase letters, digits, whitespace
    and punctuation are all stripped, so callers that want to keep every
    letter must upper-case the text before calling.
    """
    not_uppercase_letters = '[^A-Z]+'
    stripped = sub(not_uppercase_letters, '', input_string)
    return stripped

def clean(input_string):
    """Return input_string lower-cased with every non a-z character removed.

    Used to normalise a token read from a text file: punctuation, digits
    and whitespace are dropped and only lowercase ASCII letters remain.
    """
    lowered = input_string.lower()
    not_lowercase_letters = '[^a-z]+'
    return sub(not_lowercase_letters, '', lowered)

# Placeholder value only: the script reassigns `word` from the user's input
# further down, before it is used for any lookup.
word = "Input"

def soundex(word):
    """Return the four-character phonetic code for word.

    The code is the first letter of the word (upper-cased) followed by three
    digits describing the remaining letters, padded with trailing zeros when
    the word is short, e.g. ``soundex("vikram") == "V265"`` and
    ``soundex("vishakha") == "V220"``.

    This is the simplified variant implemented by this file: H, W and Y are
    treated like vowels, and the first letter's own digit is not compared
    with the digit of the letter that follows it.

    Characters other than ASCII letters are ignored. If word contains no
    ASCII letter at all, an empty string is returned.
    """
    # Step 1: Capitalize all letters in the word and drop everything that is
    # not an A-Z letter (punctuation, digits, whitespace).
    word = sub('[^A-Z]+', '', word.upper())
    if not word:
        # Nothing to encode; avoids an IndexError on word[0] below.
        return ""
    # Step 2: Retain the first letter of the word.
    first_letter = word[0]
    word = word[1:]
    # Step 3 & 4: Replace every remaining letter with its digit:
    #   A, E, I, O, U, H, W, Y -> 0
    #   B, F, P, V             -> 1
    #   C, G, J, K, Q, S, X, Z -> 2
    #   D, T                   -> 3
    #   L                      -> 4
    #   M, N                   -> 5
    #   R                      -> 6
    # Together the groups cover all 26 letters, so only digits remain.
    pre = ['[AEIOUWHY]', '[BFPV]', '[CGJKQSXZ]', '[DT]', '[L]', '[MN]', '[R]']
    post = ['0', '1', '2', '3', '4', '5', '6']
    for find, replace in zip(pre, post):
        word = sub(find, replace, word)
    # Step 5: Collapse every run of identical adjacent digits into a single
    # digit. This must happen before the zeros are removed so that equal
    # digits separated by a vowel are both kept.
    word = sub(r'(\d)\1+', r'\1', word)
    # Step 6: Remove the zeros (the vowel placeholders from step 3) and put
    # the retained first letter back in front.
    word = first_letter + word.replace('0', '')
    # Step 7: Truncate to four characters and pad short results with trailing
    # zeros, giving <uppercase letter> <digit> <digit> <digit>.
    return word[:4].ljust(4, '0')

import json
from os import listdir, path

# Python 2 names the line-reading prompt raw_input; Python 3 renamed it to
# input. Pick whichever exists so the script runs on both.
try:
    read_input = raw_input
except NameError:
    read_input = input

# The repository is a JSON object mapping a Soundex code to the list of words
# that produced that code in earlier runs. It is read from and written back to
# the same file so that every execution extends it.
REPOSITORY_PATH = "repository.txt"
# Directory holding the text files that are searched for similar words.
DATA_DIR = "data"

try:
    with open(REPOSITORY_PATH) as fp:
        dic = json.load(fp)
except IOError:
    # No repository yet (first run): start with an empty one.
    dic = {}

# file_words maps each data file name to the set of cleaned words it contains;
# files keeps the names in a stable (sorted) order for reporting.
file_words = {}
files = []
for fle in sorted(listdir(DATA_DIR)):
    # Ignore the ~ and . i.e. hidden / system files
    if fle.startswith(('~', '.')):
        continue
    full_path = path.join(DATA_DIR, fle)
    if not path.isfile(full_path):
        continue
    words = set()
    with open(full_path) as f:
        for line in f:
            words.update(clean(token) for token in line.split())
    file_words[fle] = words
    files.append(fle)

word = read_input("Enter a word: ").lower()
if not clean(word):
    # Without at least one letter there is nothing to encode.
    raise SystemExit("Please enter a word containing at least one letter.")
code = soundex(word)

if code in dic:
    print(word + " has  following similar words: ")
    print(dic[code])
else:
    print(word + " has no phonetically similar words.")
    dic[code] = list()

# Remember the word under its code so later runs can report it as similar.
if word not in dic[code]:
    dic[code].append(word)

print("And it's present in following files: ")
for fle in files:
    # A separate loop name keeps the user's input in `word` intact.
    for similar in dic[code]:
        if similar in file_words[fle]:
            print(similar + " is found in --> " + fle)

# Persist the (possibly extended) repository for the next execution.
with open(REPOSITORY_PATH, 'w') as fp:
    json.dump(dic, fp)

"C416": ["calpurnia", "calpoornia","calpornia"],
"V265": ["vikrant","vikramjeet", "veekram", "vikram"], "A123": ["abheejit"], "V220": ["vishakha"], "M622": ["markus", "markoos","merkus"],
"J310": ["jaydeep","jaydip"],
"V240": ["vishal"], 
"V625": ["virkam"],
"V230": ["viksto"],
"B632": ["brutus", "brutoos"]

Input Files:
Brutus killed calpurnia, brutoos the evil brother of calpornia,
avenged her death. vikram gets angry when called veekram. 

Markus is the step brother of brutus. Brutus and markus are each others good friends. calpoornia, is also a friend.
Vikram is a friend of Markus.

brutus and markus were classmates but they changed roads after college. They do not have any similar interests now. brutoos is  a butcher and merkus weaves. 

veekram and markus were also 

classmates but they rarely spoke.
Enter a word: merkus
merkus has  following similar words:  

[u'markus', u'markoos']
And it's present in following files:
markus is found in --> f2.txt
markus is found in --> f3.txt
merkus is found in --> f3.txt
Enter a word: vikram
vikram has  following similar words:
[u'vikrant', u'vikramjeet', u'veekram', u'vikram']
And it's present in following files:
veekram is found in --> f1.txt
vikram is found in --> f1.txt
vikram is found in --> f2.txt
veekram is found in --> f3.txt
Enter a word: santiago
santiago has no phonetically similar words.
And it's present in following files:


