SPEED BOOST: Now word lookup ~65 times faster.

Previously, every lookup iterated through the entire ISLE text file. Now the file is read once and built into a dictionary of the form {word: [pronunciation, ...]}.
DOCUMENTATION: Moved the project description up.
2026-06-27 16:10:05 +08:00 · 2015-01-29 23:02:13 -06:00 · 2014-10-23 15:53:57 -05:00 · 2014-10-23 15:51:35 -05:00 · 2014-10-23 15:41:02 -05:00
2 changed files with 87 additions and 51 deletions
@@ -11,6 +11,32 @@ pronunciations (e.g. a list of phones someone said versus a standard or
 canonical dictionary pronunciation). 


+.. sectnum::
+.. contents::
+
+
+Common Use Cases
+================
+
+What can you do with this library?
+
+- look up the list of phones and syllables for canonical pronunciations 
+  of a word::
+  
+    pysle.isletool.LexicalTool(islePath).lookup('cat')
+
+- map an actual pronunciation to a dictionary pronunciation (can be used 
+  to automatically find speech errors)::
+  
+    pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',]) 
+
+- automatically syllabify a praat textgrid containing words and phones 
+  (e.g. force-aligned text) -- requires my 
+  `praatIO <https://github.com/timmahrt/praatIO>`_ library::
+  
+    pysle.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
+
+
 Requirements
 ================

@@ -20,10 +46,15 @@ Requirements
  `ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_

  `Direct link to the ISLEX file used in this project
-  <http://www.isle.illinois.edu/sst/data/dict/islev2.txt)>`_
+  <http://www.isle.illinois.edu/sst/data/dict/islev2.txt>`_ (islev2.txt)

 - ``Python 2.7.*`` or above

+- The `praatIO <https://github.com/timmahrt/praatIO>`_ library is required IF 
+  you want to use the textgrid functionality.  It is not required 
+  for normal use.
+
+
 Installation
 ================

@@ -61,7 +92,6 @@ and another::
    print syllableList
    >> [["''"], ['n', '@'], ['th', 'r']]
    
-stressedSyllable, syllableList, syllabification, stressedIndex = returnList

-Please see \test for example usage
+Please see \\test for example usage

@@ -17,70 +17,75 @@ class WordNotInISLE(Exception):
        return "Word '%s' not in ISLE dictionary.  Please add it to continue." % self.word


-
 class LexicalTool():
    
    
    def __init__(self, islePath):
        self.islePath = islePath
-        self.data = None
-        self.pronDict = None
+        self.data = self._buildDict()
+    
+    
+    def _buildDict(self):
+        '''
+        Builds the isle textfile into a dictionary for fast searching
+        '''
+        dict = {}
+        wordList = open(self.islePath, "r").read().split("\n")
+        for row in wordList:
+            word, pronunciation = row.split(" ", 1)
+            word = word.split("(")[0]
+            
+            dict.setdefault(word, [])
+            dict[word].append(pronunciation)
+        
+        return dict
    
    
    def lookup(self, word):
+        '''
+        Lookup a word and receive a list of syllables and stressInfo
+        '''
        
        # All words must be lowercase with no extraneous whitespace
        word = word.lower()
        word = word.strip()
        
-        # Find indicies in the dictionary
+        pronList = self.data.get(word, None)
        
-        if self.data == None:
-            self.data = open(self.islePath, "r").read()
+        if pronList == None:
+            raise WordNotInISLE(word)
+        else:
+            pronList = [_parsePronunciation(pronunciationStr) 
+                        for pronunciationStr in pronList]
        
-        wordList = []
-        searchIndex = 0
-        while True:
-            # (The +1 skips over the "\n" which marks the start of every word)
-            startIndex = self.data.find("\n"+word + "(", searchIndex) + 1
-            
-            # find() returns -1 if it does not find anything, but
-            #    note that we added 1 to the return value
-            try:
-                assert(startIndex != 0)
-            except AssertionError:
-                if searchIndex == 0:
-                    raise WordNotInISLE(word)
-                else:
-                    break
-            
-            endIndex = self.data.find("\n", startIndex)
-            
-            searchIndex = endIndex
-            wordList.append((startIndex, endIndex))
-            
-        returnList = []
-        for startIndex, endIndex in wordList:
-            isleWord = self.data[startIndex:endIndex]
-            syllableTxt = isleWord.split("#")[1].strip()
-            syllableList = [x for x in syllableTxt.split(' . ')]
-            
-            # Find stress
-            stressList = []
-            for i, syllable in enumerate(syllableList):
-                # Primary stress
-                if "'" in syllable:
-                    stressList.insert(0, i)
-                # Secondary stress
-                elif '"' in syllable:
-                    stressList.append(i)
-            
-            syllableList = [x.split(" ") for x in syllableList]
-            returnList.append((syllableList, stressList))
-        
-        return returnList
+        return pronList


+def _parsePronunciation(pronunciationStr):
+    '''
+    Parses the pronunciation string
+    
+    Returns the list of syllables and a list of primary and 
+    secondary stress locations 
+    '''
+    syllableTxt = pronunciationStr.split("#")[1].strip()
+    syllableList = [x for x in syllableTxt.split(' . ')]
+    
+    # Find stress
+    stressList = []
+    for i, syllable in enumerate(syllableList):
+        # Primary stress
+        if "'" in syllable:
+            stressList.insert(0, i)
+        # Secondary stress
+        elif '"' in syllable:
+            stressList.append(i)
+    
+    syllableList = [x.split(" ") for x in syllableList]
+    
+    return syllableList, stressList
+            
+            
 def getNumPhones(isleDict, label, maxFlag):
    '''
    
@@ -102,7 +107,8 @@ def getNumPhones(isleDict, label, maxFlag):
        # likely than another, we take the average of all of them
        phoneCountList = []
        for syllableList, stressIndex in phoneListOfLists:
-            phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList]))
+            phoneCountList.append(len([phon for phoneList in syllableList for 
+                                       phon in phoneList]))
        
        # The average number of phones for all possible pronunciations
        #    of this word
Author	SHA1	Message	Date
timmahrt	fad0dd2902	SPEED BOOST: Now word lookup ~65 times faster. Used to iterate through the isle text file for each search. Now builds a dictionary of the form{word:pronunciation list,}	2015-01-29 23:02:13 -06:00
timmahrt	475053eee2	DOCUMENTATION: Moved the project description up.	2014-10-23 15:53:57 -05:00
timmahrt	08f8e859cc	DOCUMENTATION: Added link to praatio. Added table of contents. Also added some clarification about the requirements.	2014-10-23 15:51:35 -05:00
timmahrt	9cd6a7e68b	DOCUMENTATION: Added/cleaned up the readme file Added a new section 'common use cases' since I get that question a lot.	2014-10-23 15:41:02 -05:00