From fad0dd2902bda2163bff384ef89b8b5700981a96 Mon Sep 17 00:00:00 2001 From: timmahrt Date: Thu, 29 Jan 2015 23:02:13 -0600 Subject: [PATCH] SPEED BOOST: Now word lookup ~65 times faster. Used to iterate through the isle text file for each search. Now builds a dictionary of the form{word:pronunciation list,} --- pysle/isletool.py | 102 ++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/pysle/isletool.py b/pysle/isletool.py index 652982f..9d2c271 100644 --- a/pysle/isletool.py +++ b/pysle/isletool.py @@ -17,70 +17,75 @@ class WordNotInISLE(Exception): return "Word '%s' not in ISLE dictionary. Please add it to continue." % self.word - class LexicalTool(): def __init__(self, islePath): self.islePath = islePath - self.data = None - self.pronDict = None + self.data = self._buildDict() + + + def _buildDict(self): + ''' + Builds the isle textfile into a dictionary for fast searching + ''' + dict = {} + wordList = open(self.islePath, "r").read().split("\n") + for row in wordList: + word, pronunciation = row.split(" ", 1) + word = word.split("(")[0] + + dict.setdefault(word, []) + dict[word].append(pronunciation) + + return dict def lookup(self, word): + ''' + Lookup a word and receive a list of syllables and stressInfo + ''' # All words must be lowercase with no extraneous whitespace word = word.lower() word = word.strip() - # Find indicies in the dictionary + pronList = self.data.get(word, None) - if self.data == None: - self.data = open(self.islePath, "r").read() + if pronList == None: + raise WordNotInISLE(word) + else: + pronList = [_parsePronunciation(pronunciationStr) + for pronunciationStr in pronList] - wordList = [] - searchIndex = 0 - while True: - # (The +1 skips over the "\n" which marks the start of every word) - startIndex = self.data.find("\n"+word + "(", searchIndex) + 1 - - # find() returns -1 if it does not find anything, but - # note that we added 1 to the return value - try: - assert(startIndex != 0) - except AssertionError: - if searchIndex == 0: - raise WordNotInISLE(word) - else: - break - - endIndex = self.data.find("\n", startIndex) - - searchIndex = endIndex - wordList.append((startIndex, endIndex)) - - returnList = [] - for startIndex, endIndex in wordList: - isleWord = self.data[startIndex:endIndex] - syllableTxt = isleWord.split("#")[1].strip() - syllableList = [x for x in syllableTxt.split(' . ')] - - # Find stress - stressList = [] - for i, syllable in enumerate(syllableList): - # Primary stress - if "'" in syllable: - stressList.insert(0, i) - # Secondary stress - elif '"' in syllable: - stressList.append(i) - - syllableList = [x.split(" ") for x in syllableList] - returnList.append((syllableList, stressList)) - - return returnList + return pronList +def _parsePronunciation(pronunciationStr): + ''' + Parses the pronunciation string + + Returns the list of syllables and a list of primary and + secondary stress locations + ''' + syllableTxt = pronunciationStr.split("#")[1].strip() + syllableList = [x for x in syllableTxt.split(' . ')] + + # Find stress + stressList = [] + for i, syllable in enumerate(syllableList): + # Primary stress + if "'" in syllable: + stressList.insert(0, i) + # Secondary stress + elif '"' in syllable: + stressList.append(i) + + syllableList = [x.split(" ") for x in syllableList] + + return syllableList, stressList + + def getNumPhones(isleDict, label, maxFlag): ''' @@ -102,7 +107,8 @@ def getNumPhones(isleDict, label, maxFlag): # likely than another, we take the average of all of them phoneCountList = [] for syllableList, stressIndex in phoneListOfLists: - phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList])) + phoneCountList.append(len([phon for phoneList in syllableList for + phon in phoneList])) # The average number of phones for all possible pronunciations # of this word