REFACTOR: Change print statement to print function

REFACTOR: PEP 8 compliance and minor bugfix
For bugfix, see last change in pronunciationtools.py
2026-06-27 16:10:05 +08:00 · 2015-06-19 17:29:19 -05:00 · 2015-06-18 19:56:15 -05:00 · 2015-06-16 02:27:46 -05:00 · 2015-06-11 15:46:36 -05:00 · 2015-06-11 15:43:27 -05:00
6 changed files with 167 additions and 135 deletions
@@ -11,6 +11,32 @@ pronunciations (e.g. a list of phones someone said versus a standard or
 canonical dictionary pronunciation). 


+.. sectnum::
+.. contents::
+
+
+Common Use Cases
+================
+
+What can you do with this library?
+
+- look up the list of phones and syllables for canonical pronunciations 
+  of a word::
+  
+    pysle.isletool.LexicalTool(islePath).lookup('cat')
+
+- map an actual pronunciation to a dictionary pronunciation (can be used 
+  to automatically find speech errors)::
+  
+    pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',]) 
+
+- automatically syllabify a Praat TextGrid containing words and phones 
+  (e.g. force-aligned text) -- requires my 
+  `praatIO <https://github.com/timmahrt/praatIO>`_ library::
+  
+    pysle.praattools.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
+
+
 Requirements
 ================

@@ -20,10 +46,15 @@ Requirements
  `ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_

  `Direct link to the ISLEX file used in this project
-  <http://www.isle.illinois.edu/sst/data/dict/islev2.txt)>`_
+  <http://www.isle.illinois.edu/sst/data/dict/islex/islev2.txt>`_ (islev2.txt)

 - ``Python 2.7.*`` or above

+- The `praatIO <https://github.com/timmahrt/praatIO>`_ library is required IF 
+  you want to use the textgrid functionality.  It is not required 
+  for normal use.
+
+
 Installation
 ================

@@ -61,7 +92,6 @@ and another::
    print syllableList
    >> [["''"], ['n', '@'], ['th', 'r']]
    
-stressedSyllable, syllableList, syllabification, stressedIndex = returnList

-Please see \test for example usage
+Please see the ``test`` folder for example usage.

@@ -5,86 +5,91 @@ Created on Oct 11, 2012
 '''


-vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>',]
+vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ]


 class WordNotInISLE(Exception):
    
    def __init__(self, word):
+        super(WordNotInISLE, self).__init__()
        self.word = word
        
    def __str__(self):
-        return "Word '%s' not in ISLE dictionary.  Please add it to continue." % self.word
-
+        return ("Word '%s' not in ISLE dictionary.  "
+                "Please add it to continue." % self.word)


 class LexicalTool():
    
-    
    def __init__(self, islePath):
        self.islePath = islePath
-        self.data = None
-        self.pronDict = None
+        self.data = self._buildDict()
    
+    def _buildDict(self):
+        '''
+        Builds the isle textfile into a dictionary for fast searching
+        '''
+        lexDict = {}
+        wordList = [line.rstrip('\n') for line in open(self.islePath, "rU")]
+            
+        for row in wordList:
+            word, pronunciation = row.split(" ", 1)
+            word = word.split("(")[0]
+            
+            lexDict.setdefault(word, [])
+            lexDict[word].append(pronunciation)
+        
+        return lexDict
    
    def lookup(self, word):
+        '''
+        Lookup a word and receive a list of syllables and stressInfo
+        '''
        
        # All words must be lowercase with no extraneous whitespace
        word = word.lower()
        word = word.strip()
        
-        # Find indicies in the dictionary
+        pronList = self.data.get(word, None)
        
-        if self.data == None:
-            self.data = open(self.islePath, "r").read()
+        if pronList is None:
+            raise WordNotInISLE(word)
+        else:
+            pronList = [_parsePronunciation(pronunciationStr)
+                        for pronunciationStr in pronList]
        
-        wordList = []
-        searchIndex = 0
-        while True:
-            # (The +1 skips over the "\n" which marks the start of every word)
-            startIndex = self.data.find("\n"+word + "(", searchIndex) + 1
-            
-            # find() returns -1 if it does not find anything, but
-            #    note that we added 1 to the return value
-            try:
-                assert(startIndex != 0)
-            except AssertionError:
-                if searchIndex == 0:
-                    raise WordNotInISLE(word)
-                else:
-                    break
-            
-            endIndex = self.data.find("\n", startIndex)
-            
-            searchIndex = endIndex
-            wordList.append((startIndex, endIndex))
-            
-        returnList = []
-        for startIndex, endIndex in wordList:
-            isleWord = self.data[startIndex:endIndex]
-            syllableTxt = isleWord.split("#")[1].strip()
-            syllableList = [x for x in syllableTxt.split(' . ')]
-            
-            # Find stress
-            stressList = []
-            for i, syllable in enumerate(syllableList):
-                # Primary stress
-                if "'" in syllable:
-                    stressList.insert(0, i)
-                # Secondary stress
-                elif '"' in syllable:
-                    stressList.append(i)
-            
-            syllableList = [x.split(" ") for x in syllableList]
-            returnList.append((syllableList, stressList))
-        
-        return returnList
+        return pronList


+def _parsePronunciation(pronunciationStr):
+    '''
+    Parses the pronunciation string
+    
+    Returns the list of syllables and a list of primary and
+    secondary stress locations
+    '''
+    syllableTxt = pronunciationStr.split("#")[1].strip()
+    syllableList = [x for x in syllableTxt.split(' . ')]
+    
+    # Find stress
+    stressList = []
+    for i, syllable in enumerate(syllableList):
+        # Primary stress
+        if "'" in syllable:
+            stressList.insert(0, i)
+        # Secondary stress
+        elif '"' in syllable:
+            stressList.append(i)
+    
+    syllableList = [x.split(" ") for x in syllableList]
+    
+    return syllableList, stressList
+            
+            
 def getNumPhones(isleDict, label, maxFlag):
    '''
    
-    If maxFlag=True, use the longest pronunciation.  Otherwise, take the 
+    If maxFlag=True, use the longest pronunciation.  Otherwise, take the
    average length.
    '''
    phoneCount = 0
@@ -94,24 +99,28 @@ def getNumPhones(isleDict, label, maxFlag):
        phoneListOfLists = isleDict.lookup(word)
        
        syllableCountList = []
-        for syllableList, stressIndex in phoneListOfLists:
+        for row in phoneListOfLists:
+            syllableList = row[0]
            syllableCountList.append(len(syllableList))
        
        # In ISLE, there can be multiple pronunciations for each word
        # as we have no reason to believe one pronunciation is more
        # likely than another, we take the average of all of them
        phoneCountList = []
-        for syllableList, stressIndex in phoneListOfLists:
-            phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList]))
+        for row in phoneListOfLists:
+            syllableList = row[0]
+            phoneCountList.append(len([phon for phoneList in syllableList for
+                                       phon in phoneList]))
        
        # The average number of phones for all possible pronunciations
        #    of this word
-        if maxFlag == True:
+        if maxFlag is True:
            syllableCount += max(syllableCountList)
            phoneCount += max(phoneCountList)
        else:
-            syllableCount += sum(syllableCountList) / float(len(syllableCountList))
-            phoneCount += sum(phoneCountList) / float(len(phoneCountList))    
+            syllableCount += (sum(syllableCountList) /
+                              float(len(syllableCountList)))
+            phoneCount += sum(phoneCountList) / float(len(phoneCountList))
    
    return syllableCount, phoneCount

@@ -131,6 +140,3 @@ def findOODWords(isleDict, wordList):
    oodList.sort()
    
    return oodList
-
-        
-        
@@ -4,6 +4,7 @@ Created on Oct 22, 2014
@author: tmahrt
 '''

+
 class OptionalFeatureError(ImportError):
    
    def __str__(self):
@@ -18,7 +19,7 @@ from pysle import isletool
 from pysle import pronunciationtools


-def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName, 
+def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
                      skipLabelList=None):
    '''
    Given a textgrid, syllabifies the phones in the textgrid
@@ -34,7 +35,7 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
    wordTier = tg.tierDict[wordTierName]
    phoneTier = tg.tierDict[phoneTierName]
    
-    if skipLabelList == None:
+    if skipLabelList is None:
        skipLabelList = []
    
    syllableEntryList = []
@@ -46,28 +47,31 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
        
        subPhoneTier = phoneTier.crop(start, stop, True, False)[0]
        
-        phoneList = [phone for startP, endP, phone in subPhoneTier.entryList if phone != '']
+        # entry = (start, stop, phone)
+        phoneList = [entry[2] for entry in subPhoneTier.entryList
+                     if entry[2] != '']
        
        try:
-            returnList = pronunciationtools.findBestSyllabification(isleDict, 
-                                                                    word, 
+            returnList = pronunciationtools.findBestSyllabification(isleDict,
+                                                                    word,
                                                                    phoneList)
        except isletool.WordNotInISLE:
-            print "Word ('%s') not is isle -- skipping syllabification" % word
+            print("Word ('%s') not is isle -- skipping syllabification" % word)
            continue
        except (pronunciationtools.NullPronunciationError):
-            print "Word ('%s') has no provided pronunciation" % word
+            print("Word ('%s') has no provided pronunciation" % word)
            continue
-    
-        stressedSyllable, syllableList, syllabification, stressIndexList = returnList
+        
+        syllableList = returnList[1]
+        stressIndexList = returnList[3]
        
        i = 0
-#         print syllableList
+#         print(syllableList)
        for k, syllable in enumerate(syllableList):
            
            # Create the syllable tier entry
            j = len(syllable)
-            stubEntryList = subPhoneTier.entryList[i:i+j]
+            stubEntryList = subPhoneTier.entryList[i:i + j]
            i += j
            
            # The whole syllable was deleted
@@ -76,29 +80,28 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
            
            syllableStart = stubEntryList[0][0]
            syllableEnd = stubEntryList[-1][1]
-            label = "-".join([phone for start, end, phone in stubEntryList])
+            label = "-".join([entry[2] for entry in stubEntryList])
        
-            syllableEntryList.append( (syllableStart, syllableEnd, label) )
+            syllableEntryList.append((syllableStart, syllableEnd, label))
            
            # Create the tonic tier entry
            try:
                stressIndex = stressIndexList[0]
            except IndexError:
-                stressIndex = None # Function word probably
+                stressIndex = None  # Function word probably
                
            tonicLabel = ''
            if k == stressIndex:
                tonicLabel = 'T'
                
-            tonicEntryList.append( (syllableStart, syllableEnd, tonicLabel) )
+            tonicEntryList.append((syllableStart, syllableEnd, tonicLabel))
    
    # Create a textgrid with the two syllable-level tiers
-    syllableTier = praatio.TextgridTier("syllable", syllableEntryList, praatio.INTERVAL_TIER)
-    tonicTier = praatio.TextgridTier('tonic', tonicEntryList, praatio.INTERVAL_TIER)
+    syllableTier = praatio.IntervalTier("syllable", syllableEntryList)
+    tonicTier = praatio.IntervalTier('tonic', tonicEntryList)
    
    syllableTG = praatio.Textgrid()
    syllableTG.addTier(syllableTier)
    syllableTG.addTier(tonicTier)

    return syllableTG
-
@@ -9,10 +9,10 @@ import itertools
 from pysle import isletool


-
 class NullPronunciationError(Exception):
    
    def __init__(self, word):
+        super(NullPronunciationError, self).__init__()
        self.word = word
    
    def __str__(self):
@@ -49,7 +49,7 @@ def _lcs(xs, ys):
        ll_b = _lcs_lens(xb, ys)
        ll_e = _lcs_lens(xe[::-1], ys[::-1])
        _, k = max((ll_b[j] + ll_e[ny - j], j)
-                    for j in range(ny + 1))
+                   for j in range(ny + 1))
        yb, ye = ys[:k], ys[k:]
        return _lcs(xb, yb) + _lcs(xe, ye)

@@ -58,14 +58,13 @@ def _prepPronunciation(phoneList):
    retList = []
    for phone in phoneList:
        if 'r' in phone:
-            phone = ['r',]
+            phone = ['r', ]
        try:
-            phone = phone[0] # Only represent the str by its first letter
+            phone = phone[0]  # Only represent the string by its first letter
+            phone = phone.lower()
        except IndexError:
            raise NullPhoneError()
        
-        phone = phone.lower()
-        
        if phone in isletool.vowelList:
            phone = 'V'
        retList.append(phone)
@@ -85,14 +84,14 @@ def _adjustSyllabification(adjustedPhoneList, syllableList):
    retSyllableList = []
    for syllable in syllableList:
        j = len(syllable)
-        tmpPhoneList = adjustedPhoneList[i:i+j]
+        tmpPhoneList = adjustedPhoneList[i:i + j]
        numBlanks = -1
        phoneList = tmpPhoneList[:]
        while numBlanks != 0:
            
            numBlanks = tmpPhoneList.count("''")
            if numBlanks > 0:
-                tmpPhoneList = adjustedPhoneList[i+j:i+j+numBlanks]
+                tmpPhoneList = adjustedPhoneList[i + j:i + j + numBlanks]
                phoneList.extend(tmpPhoneList)
                j += numBlanks
        
@@ -116,27 +115,32 @@ def _findBestPronunciation(isleDict, wordText, aPron):
    
    isleWordList = isleDict.lookup(wordText)
    
-    aP = _prepPronunciation(aPron) # Mapping to simplified phone inventory
+    aP = _prepPronunciation(aPron)  # Mapping to simplified phone inventory
    
-    origPronDict = dict((newPron,oldPron) for newPron, oldPron in zip(aP, aPron))
+    origPronDict = dict((newPron, oldPron)
+                        for newPron, oldPron in zip(aP, aPron))
    
    numDiffList = []
    withStress = []
    i = 0
    alignedSyllabificationList = []
    alignedActualPronunciationList = []
-    for syllableList, stressList in isleWordList:
+    for wordTuple in isleWordList:
+        syllableList = wordTuple[0]  # syllableList, stressList
        
        iP = [phone for phoneList in syllableList for phone in phoneList]
        iP = _prepPronunciation(iP)

        alignedIP, alignedAP = alignPronunciations(iP, aP)
-        alignedAP = [origPronDict.get(phon, "''") for phon in alignedAP] # Remapping to actual phones
+        
+        # Remapping to actual phones
+        alignedAP = [origPronDict.get(phon, "''") for phon in alignedAP]
        alignedActualPronunciationList.append(alignedAP)
        
        # Adjusting the syllabification for differences between the dictionary
        # pronunciation and the actual pronunciation
-        alignedSyllabification = _adjustSyllabification(alignedIP, syllableList)
+        alignedSyllabification = _adjustSyllabification(alignedIP,
+                                                        syllableList)
        alignedSyllabificationList.append(alignedSyllabification)
        
        # Count the number of misalignments between the two
@@ -147,7 +151,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
        hasStress = False
        for syllable in syllableList:
            for phone in syllable:
-                hasStress = "'" in phone or hasStress 
+                hasStress = "'" in phone or hasStress
        
        if hasStress:
            withStress.append(i)
@@ -164,16 +168,16 @@ def _findBestPronunciation(isleDict, wordText, aPron):
    for i, numDiff in enumerate(numDiffList):
        if numDiff != minDiff:
            continue
-        if bestIndex == None:
+        if bestIndex is None:
            bestIndex = i
            bestIsStressed = i in withStress
        else:
            if not bestIsStressed and i in withStress:
                bestIndex = i
                bestIsStressed = True
-        
    
-    return isleWordList, alignedActualPronunciationList, alignedSyllabificationList, bestIndex
+    return (isleWordList, alignedActualPronunciationList,
+            alignedSyllabificationList, bestIndex)


 def _syllabifyPhones(phoneList, syllableList, isleStressList):
@@ -193,9 +197,9 @@ def _syllabifyPhones(phoneList, syllableList, isleStressList):
    
    start = 0
    syllabifiedList = []
-    for i, end in enumerate(numPhoneList):
+    for end in numPhoneList:
        
-        syllable = phoneList[start:start+end]
+        syllable = phoneList[start:start + end]
        syllabifiedList.append(syllable)
        
        start += end
@@ -212,21 +216,6 @@ def alignPronunciations(pronI, pronA):
    pronI = [char for char in pronI]
    pronA = [char for char in pronA]
    
-    # -- allow for some flexibility in pronunciation
-    correctionsTuple = (('d', 't'), ('t', 'd'), ('s', 'z'), ('z', 's'),
-                        ('m', 'n'), ('n', 'm'),)
-    
-    doMatch = lambda i, a: ((i == a) or 
-                            ((i, a) in correctionsTuple))
-    
-    def matchExists(targetPhone, pron):
-        match = False
-        for phone in pron:
-            match = match or doMatch(targetPhone, phone)
-        return match
-    
-    # Remove vowels
-    
    # Remove any elements not in the other list (but maintain order)
    pronITmp = pronI
    pronATmp = pronA
@@ -244,7 +233,7 @@ def alignPronunciations(pronI, pronA):
        startA = pronA.index(phone, startA)
        startI = pronI.index(phone, startI)
        
-        sequenceIndexListA.append(startA) 
+        sequenceIndexListA.append(startA)
        sequenceIndexListI.append(startI)
    
    # An index on the tail of both will be used to create output strings
@@ -257,14 +246,16 @@ def alignPronunciations(pronI, pronA):
    for x in xrange(len(sequenceIndexListA)):
        indexA = sequenceIndexListA[x]
        indexI = sequenceIndexListI[x]
-        if indexA < indexI :
+        if indexA < indexI:
            for x in xrange(indexI - indexA):
                pronA.insert(indexA, "''")
-            sequenceIndexListA = [val + indexI - indexA for val in sequenceIndexListA]
+            sequenceIndexListA = [val + indexI - indexA
+                                  for val in sequenceIndexListA]
        elif indexA > indexI:
            for x in xrange(indexA - indexI):
                pronI.insert(indexI, "''")
-            sequenceIndexListI = [val + indexA - indexI for val in sequenceIndexListI]
+            sequenceIndexListI = [val + indexA - indexI
+                                  for val in sequenceIndexListI]
    
    return pronI, pronA
   
@@ -273,11 +264,12 @@ def findBestSyllabification(isleDict, wordText, actualPronunciationList):
    '''
    Find the best syllabification for a word
    
-    First find the closest pronunciation to a given pronunciation. Then take 
-    the syllabification for that pronunciation and map it onto the 
+    First find the closest pronunciation to a given pronunciation. Then take
+    the syllabification for that pronunciation and map it onto the
    input pronunciation.
    '''
-    retList = _findBestPronunciation(isleDict, wordText, actualPronunciationList)
+    retList = _findBestPronunciation(isleDict, wordText,
+                                     actualPronunciationList)
    isleWordList, alignedAPronList, alignedSyllableList, bestIndex = retList
    
    alignedPhoneList = alignedAPronList[bestIndex]
@@ -285,8 +277,8 @@ def findBestSyllabification(isleDict, wordText, actualPronunciationList):
    syllabification = isleWordList[bestIndex][0]
    stressedIndex = isleWordList[bestIndex][1]
    
-    stressedSyllable, syllableList = _syllabifyPhones(alignedPhoneList, 
-                                                      alignedSyllables, 
+    stressedSyllable, syllableList = _syllabifyPhones(alignedPhoneList,
+                                                      alignedSyllables,
                                                      stressedIndex)
    
    return stressedSyllable, syllableList, syllabification, stressedIndex
@@ -298,9 +290,7 @@ def findClosestPronunciation(isleDict, wordText, aPron):
    '''
    
    retList = _findBestPronunciation(isleDict, wordText, aPron)
-    isleWordList, actualPronunciationList, bestIndex = retList
+    isleWordList = retList[0]
+    bestIndex = retList[3]
    
    return isleWordList[bestIndex]
-
-
-
@@ -20,13 +20,13 @@ firstEntry = lookupResults[0]
 firstSyllableList = firstEntry[0] 
 firstStressList = firstEntry[1]

-print searchWord
-print firstSyllableList, firstStressList # 3rd syllable carries stress
+print(searchWord)
+print(firstSyllableList, firstStressList) # 3rd syllable carries stress


 # Here we determine the syllabification of a word, as it was said.
 # (Of course, this is just a guess)
-print '-'*50
+print('-'*50)

 searchWord = 'another'
 anotherPhoneList = ['n', '@', 'th', 'r']
@@ -37,8 +37,8 @@ returnList = pronunciationtools.findBestSyllabification(isleDict,

 stressedSyllable, syllableList, syllabification, stressedIndex = returnList

-print searchWord
-print anotherPhoneList
-print syllableList # We can see the first syllable was elided
+print(searchWord)
+print(anotherPhoneList)
+print(syllableList) # We can see the first syllable was elided


@@ -20,7 +20,10 @@ path = join('.', 'files')
 path = "/Users/tmahrt/Dropbox/workspace/pysle/test/files"

 tg = praatio.openTextGrid(join(path, "pumpkins.TextGrid"))
-isleDict = isletool.LexicalTool('/Users/tmahrt/Dropbox/workspace/pysle/test/islev2.txt') # Needs the full path to the file
+
+# Needs the full path to the file
+islevPath = '/Users/tmahrt/Dropbox/workspace/pysle/test/islev2.txt'
+isleDict = isletool.LexicalTool(islevPath)

 # Get the syllabification tiers and add it to the textgrid
 syllableTG = praattools.syllabifyTextgrid(isleDict, tg, "word", "phone",
Author	SHA1	Message	Date
timmahrt	985d68da6c	REFACTOR: Change print statement to print function	2015-06-19 17:29:19 -05:00
timmahrt	0e53ed654e	REFACTOR: PEP 8 compliance and minor bugfix For bugfix, see last change in pronunciationtools.py	2015-06-18 19:56:15 -05:00
timmahrt	ce633d0590	BUGFIX: Reflect changes in praatio library	2015-06-16 02:27:46 -05:00
timmahrt	e2a2025f5b	Merge remote-tracking branch 'origin/master'	2015-06-11 15:46:36 -05:00
timmahrt	c10e3cf05f	BUGFIX: Was unable to read islev2.txt with trailing newline My custom islev2.txt did not have a trailing newline.	2015-06-11 15:43:27 -05:00
timmahrt	06222bf176	REFACTOR: PEP 8 compliance	2015-06-11 15:00:26 -05:00
Tim	6353e0172e	Update README.rst	2015-06-01 15:01:29 -05:00
timmahrt	fad0dd2902	SPEED BOOST: Now word lookup ~65 times faster. Used to iterate through the isle text file for each search. Now builds a dictionary of the form{word:pronunciation list,}	2015-01-29 23:02:13 -06:00
timmahrt	475053eee2	DOCUMENTATION: Moved the project description up.	2014-10-23 15:53:57 -05:00
timmahrt	08f8e859cc	DOCUMENTATION: Added link to praatio. Added table of contents. Also added some clarification about the requirements.	2014-10-23 15:51:35 -05:00
timmahrt	9cd6a7e68b	DOCUMENTATION: Added/cleaned up the readme file Added a new section 'common use cases' since I get that question a lot.	2014-10-23 15:41:02 -05:00