11 Commits

Author SHA1 Message Date
timmahrt 985d68da6c REFACTOR: Change print statement to print function 2015-06-19 17:29:19 -05:00
timmahrt 0e53ed654e REFACTOR: PEP 8 compliance and minor bugfix
For bugfix, see last change in pronunciationtools.py
2015-06-18 19:56:15 -05:00
timmahrt ce633d0590 BUGFIX: Reflect changes in praatio library 2015-06-16 02:27:46 -05:00
timmahrt e2a2025f5b Merge remote-tracking branch 'origin/master' 2015-06-11 15:46:36 -05:00
timmahrt c10e3cf05f BUGFIX: Was unable to read islev2.txt with trailing newline
My custom islev2.txt did not have a trailing newline.
2015-06-11 15:43:27 -05:00
timmahrt 06222bf176 REFACTOR: PEP 8 compliance 2015-06-11 15:00:26 -05:00
Tim 6353e0172e Update README.rst 2015-06-01 15:01:29 -05:00
timmahrt fad0dd2902 SPEED BOOST: Now word lookup ~65 times faster.
Used to iterate through the isle text file for each search.
Now builds a dictionary of the form{word:pronunciation list,}
2015-01-29 23:02:13 -06:00
timmahrt 475053eee2 DOCUMENTATION: Moved the project description up. 2014-10-23 15:53:57 -05:00
timmahrt 08f8e859cc DOCUMENTATION: Added link to praatio. Added table of contents.
Also added some clarification about the requirements.
2014-10-23 15:51:35 -05:00
timmahrt 9cd6a7e68b DOCUMENTATION: Added/cleaned up the readme file
Added a new section 'common use cases' since I
get that question a lot.
2014-10-23 15:41:02 -05:00
6 changed files with 167 additions and 135 deletions
+33 -3
View File
@@ -11,6 +11,32 @@ pronunciations (e.g. a list of phones someone said versus a standard or
canonical dictionary pronunciation).
.. sectnum::
.. contents::
Common Use Cases
================
What can you do with this library?
- look up the list of phones and syllables for canonical pronunciations
of a word::
pysle.isletool.LexicalTool(islePath).lookup('cat')
- map an actual pronunciation to a dictionary pronunciation (can be used
to automatically find speech errors)::
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',])
- automatically syllabify a praat textgrid containing words and phones
(e.g. force-aligned text) -- requires my
`praatIO <https://github.com/timmahrt/praatIO>`_ library::
pysle.praattools.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
Requirements
================
@@ -20,10 +46,15 @@ Requirements
`ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_
`Direct link to the ISLEX file used in this project
<http://www.isle.illinois.edu/sst/data/dict/islev2.txt)>`_
<http://www.isle.illinois.edu/sst/data/dict/islex/islev2.txt>`_ (islev2.txt)
- ``Python 2.7.*`` or above
- The `praatIO <https://github.com/timmahrt/praatIO>`_ library is required IF
you want to use the textgrid functionality. It is not required
for normal use.
Installation
================
@@ -61,7 +92,6 @@ and another::
print syllableList
>> [["''"], ['n', '@'], ['th', 'r']]
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
Please see \test for example usage
Please see \\test for example usage
+66 -60
View File
@@ -5,86 +5,91 @@ Created on Oct 11, 2012
'''
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>',]
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ]
class WordNotInISLE(Exception):
def __init__(self, word):
super(WordNotInISLE, self).__init__()
self.word = word
def __str__(self):
return "Word '%s' not in ISLE dictionary. Please add it to continue." % self.word
return ("Word '%s' not in ISLE dictionary. "
"Please add it to continue." % self.word)
class LexicalTool():
def __init__(self, islePath):
self.islePath = islePath
self.data = None
self.pronDict = None
self.data = self._buildDict()
def _buildDict(self):
'''
Builds the isle textfile into a dictionary for fast searching
'''
lexDict = {}
wordList = [line.rstrip('\n') for line in open(self.islePath, "rU")]
for row in wordList:
word, pronunciation = row.split(" ", 1)
word = word.split("(")[0]
lexDict.setdefault(word, [])
lexDict[word].append(pronunciation)
return lexDict
def lookup(self, word):
'''
Lookup a word and receive a list of syllables and stressInfo
'''
# All words must be lowercase with no extraneous whitespace
word = word.lower()
word = word.strip()
# Find indicies in the dictionary
pronList = self.data.get(word, None)
if self.data == None:
self.data = open(self.islePath, "r").read()
if pronList is None:
raise WordNotInISLE(word)
else:
pronList = [_parsePronunciation(pronunciationStr)
for pronunciationStr in pronList]
wordList = []
searchIndex = 0
while True:
# (The +1 skips over the "\n" which marks the start of every word)
startIndex = self.data.find("\n"+word + "(", searchIndex) + 1
# find() returns -1 if it does not find anything, but
# note that we added 1 to the return value
try:
assert(startIndex != 0)
except AssertionError:
if searchIndex == 0:
raise WordNotInISLE(word)
else:
break
endIndex = self.data.find("\n", startIndex)
searchIndex = endIndex
wordList.append((startIndex, endIndex))
returnList = []
for startIndex, endIndex in wordList:
isleWord = self.data[startIndex:endIndex]
syllableTxt = isleWord.split("#")[1].strip()
syllableList = [x for x in syllableTxt.split(' . ')]
# Find stress
stressList = []
for i, syllable in enumerate(syllableList):
# Primary stress
if "'" in syllable:
stressList.insert(0, i)
# Secondary stress
elif '"' in syllable:
stressList.append(i)
syllableList = [x.split(" ") for x in syllableList]
returnList.append((syllableList, stressList))
return returnList
return pronList
def _parsePronunciation(pronunciationStr):
'''
Parses the pronunciation string
Returns the list of syllables and a list of primary and
secondary stress locations
'''
syllableTxt = pronunciationStr.split("#")[1].strip()
syllableList = [x for x in syllableTxt.split(' . ')]
# Find stress
stressList = []
for i, syllable in enumerate(syllableList):
# Primary stress
if "'" in syllable:
stressList.insert(0, i)
# Secondary stress
elif '"' in syllable:
stressList.append(i)
syllableList = [x.split(" ") for x in syllableList]
return syllableList, stressList
def getNumPhones(isleDict, label, maxFlag):
'''
If maxFlag=True, use the longest pronunciation. Otherwise, take the
If maxFlag=True, use the longest pronunciation. Otherwise, take the
average length.
'''
phoneCount = 0
@@ -94,24 +99,28 @@ def getNumPhones(isleDict, label, maxFlag):
phoneListOfLists = isleDict.lookup(word)
syllableCountList = []
for syllableList, stressIndex in phoneListOfLists:
for row in phoneListOfLists:
syllableList = row[0]
syllableCountList.append(len(syllableList))
# In ISLE, there can be multiple pronunciations for each word
# as we have no reason to believe one pronunciation is more
# likely than another, we take the average of all of them
phoneCountList = []
for syllableList, stressIndex in phoneListOfLists:
phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList]))
for row in phoneListOfLists:
syllableList = row[0]
phoneCountList.append(len([phon for phoneList in syllableList for
phon in phoneList]))
# The average number of phones for all possible pronunciations
# of this word
if maxFlag == True:
if maxFlag is True:
syllableCount += max(syllableCountList)
phoneCount += max(phoneCountList)
else:
syllableCount += sum(syllableCountList) / float(len(syllableCountList))
phoneCount += sum(phoneCountList) / float(len(phoneCountList))
syllableCount += (sum(syllableCountList) /
float(len(syllableCountList)))
phoneCount += sum(phoneCountList) / float(len(phoneCountList))
return syllableCount, phoneCount
@@ -131,6 +140,3 @@ def findOODWords(isleDict, wordList):
oodList.sort()
return oodList
+21 -18
View File
@@ -4,6 +4,7 @@ Created on Oct 22, 2014
@author: tmahrt
'''
class OptionalFeatureError(ImportError):
def __str__(self):
@@ -18,7 +19,7 @@ from pysle import isletool
from pysle import pronunciationtools
def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
skipLabelList=None):
'''
Given a textgrid, syllabifies the phones in the textgrid
@@ -34,7 +35,7 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
wordTier = tg.tierDict[wordTierName]
phoneTier = tg.tierDict[phoneTierName]
if skipLabelList == None:
if skipLabelList is None:
skipLabelList = []
syllableEntryList = []
@@ -46,28 +47,31 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
subPhoneTier = phoneTier.crop(start, stop, True, False)[0]
phoneList = [phone for startP, endP, phone in subPhoneTier.entryList if phone != '']
# entry = (start, stop, phone)
phoneList = [entry[2] for entry in subPhoneTier.entryList
if entry[2] != '']
try:
returnList = pronunciationtools.findBestSyllabification(isleDict,
word,
returnList = pronunciationtools.findBestSyllabification(isleDict,
word,
phoneList)
except isletool.WordNotInISLE:
print "Word ('%s') not is isle -- skipping syllabification" % word
print("Word ('%s') not is isle -- skipping syllabification" % word)
continue
except (pronunciationtools.NullPronunciationError):
print "Word ('%s') has no provided pronunciation" % word
print("Word ('%s') has no provided pronunciation" % word)
continue
stressedSyllable, syllableList, syllabification, stressIndexList = returnList
syllableList = returnList[1]
stressIndexList = returnList[3]
i = 0
# print syllableList
# print(syllableList)
for k, syllable in enumerate(syllableList):
# Create the syllable tier entry
j = len(syllable)
stubEntryList = subPhoneTier.entryList[i:i+j]
stubEntryList = subPhoneTier.entryList[i:i + j]
i += j
# The whole syllable was deleted
@@ -76,29 +80,28 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
syllableStart = stubEntryList[0][0]
syllableEnd = stubEntryList[-1][1]
label = "-".join([phone for start, end, phone in stubEntryList])
label = "-".join([entry[2] for entry in stubEntryList])
syllableEntryList.append( (syllableStart, syllableEnd, label) )
syllableEntryList.append((syllableStart, syllableEnd, label))
# Create the tonic tier entry
try:
stressIndex = stressIndexList[0]
except IndexError:
stressIndex = None # Function word probably
stressIndex = None # Function word probably
tonicLabel = ''
if k == stressIndex:
tonicLabel = 'T'
tonicEntryList.append( (syllableStart, syllableEnd, tonicLabel) )
tonicEntryList.append((syllableStart, syllableEnd, tonicLabel))
# Create a textgrid with the two syllable-level tiers
syllableTier = praatio.TextgridTier("syllable", syllableEntryList, praatio.INTERVAL_TIER)
tonicTier = praatio.TextgridTier('tonic', tonicEntryList, praatio.INTERVAL_TIER)
syllableTier = praatio.IntervalTier("syllable", syllableEntryList)
tonicTier = praatio.IntervalTier('tonic', tonicEntryList)
syllableTG = praatio.Textgrid()
syllableTG.addTier(syllableTier)
syllableTG.addTier(tonicTier)
return syllableTG
+37 -47
View File
@@ -9,10 +9,10 @@ import itertools
from pysle import isletool
class NullPronunciationError(Exception):
def __init__(self, word):
super(NullPronunciationError, self).__init__()
self.word = word
def __str__(self):
@@ -49,7 +49,7 @@ def _lcs(xs, ys):
ll_b = _lcs_lens(xb, ys)
ll_e = _lcs_lens(xe[::-1], ys[::-1])
_, k = max((ll_b[j] + ll_e[ny - j], j)
for j in range(ny + 1))
for j in range(ny + 1))
yb, ye = ys[:k], ys[k:]
return _lcs(xb, yb) + _lcs(xe, ye)
@@ -58,14 +58,13 @@ def _prepPronunciation(phoneList):
retList = []
for phone in phoneList:
if 'r' in phone:
phone = ['r',]
phone = ['r', ]
try:
phone = phone[0] # Only represent the str by its first letter
phone = phone[0] # Only represent the string by its first letter
phone = phone.lower()
except IndexError:
raise NullPhoneError()
phone = phone.lower()
if phone in isletool.vowelList:
phone = 'V'
retList.append(phone)
@@ -85,14 +84,14 @@ def _adjustSyllabification(adjustedPhoneList, syllableList):
retSyllableList = []
for syllable in syllableList:
j = len(syllable)
tmpPhoneList = adjustedPhoneList[i:i+j]
tmpPhoneList = adjustedPhoneList[i:i + j]
numBlanks = -1
phoneList = tmpPhoneList[:]
while numBlanks != 0:
numBlanks = tmpPhoneList.count("''")
if numBlanks > 0:
tmpPhoneList = adjustedPhoneList[i+j:i+j+numBlanks]
tmpPhoneList = adjustedPhoneList[i + j:i + j + numBlanks]
phoneList.extend(tmpPhoneList)
j += numBlanks
@@ -116,27 +115,32 @@ def _findBestPronunciation(isleDict, wordText, aPron):
isleWordList = isleDict.lookup(wordText)
aP = _prepPronunciation(aPron) # Mapping to simplified phone inventory
aP = _prepPronunciation(aPron) # Mapping to simplified phone inventory
origPronDict = dict((newPron,oldPron) for newPron, oldPron in zip(aP, aPron))
origPronDict = dict((newPron, oldPron)
for newPron, oldPron in zip(aP, aPron))
numDiffList = []
withStress = []
i = 0
alignedSyllabificationList = []
alignedActualPronunciationList = []
for syllableList, stressList in isleWordList:
for wordTuple in isleWordList:
syllableList = wordTuple[0] # syllableList, stressList
iP = [phone for phoneList in syllableList for phone in phoneList]
iP = _prepPronunciation(iP)
alignedIP, alignedAP = alignPronunciations(iP, aP)
alignedAP = [origPronDict.get(phon, "''") for phon in alignedAP] # Remapping to actual phones
# Remapping to actual phones
alignedAP = [origPronDict.get(phon, "''") for phon in alignedAP]
alignedActualPronunciationList.append(alignedAP)
# Adjusting the syllabification for differences between the dictionary
# pronunciation and the actual pronunciation
alignedSyllabification = _adjustSyllabification(alignedIP, syllableList)
alignedSyllabification = _adjustSyllabification(alignedIP,
syllableList)
alignedSyllabificationList.append(alignedSyllabification)
# Count the number of misalignments between the two
@@ -147,7 +151,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
hasStress = False
for syllable in syllableList:
for phone in syllable:
hasStress = "'" in phone or hasStress
hasStress = "'" in phone or hasStress
if hasStress:
withStress.append(i)
@@ -164,16 +168,16 @@ def _findBestPronunciation(isleDict, wordText, aPron):
for i, numDiff in enumerate(numDiffList):
if numDiff != minDiff:
continue
if bestIndex == None:
if bestIndex is None:
bestIndex = i
bestIsStressed = i in withStress
else:
if not bestIsStressed and i in withStress:
bestIndex = i
bestIsStressed = True
return isleWordList, alignedActualPronunciationList, alignedSyllabificationList, bestIndex
return (isleWordList, alignedActualPronunciationList,
alignedSyllabificationList, bestIndex)
def _syllabifyPhones(phoneList, syllableList, isleStressList):
@@ -193,9 +197,9 @@ def _syllabifyPhones(phoneList, syllableList, isleStressList):
start = 0
syllabifiedList = []
for i, end in enumerate(numPhoneList):
for end in numPhoneList:
syllable = phoneList[start:start+end]
syllable = phoneList[start:start + end]
syllabifiedList.append(syllable)
start += end
@@ -212,21 +216,6 @@ def alignPronunciations(pronI, pronA):
pronI = [char for char in pronI]
pronA = [char for char in pronA]
# -- allow for some flexibility in pronunciation
correctionsTuple = (('d', 't'), ('t', 'd'), ('s', 'z'), ('z', 's'),
('m', 'n'), ('n', 'm'),)
doMatch = lambda i, a: ((i == a) or
((i, a) in correctionsTuple))
def matchExists(targetPhone, pron):
match = False
for phone in pron:
match = match or doMatch(targetPhone, phone)
return match
# Remove vowels
# Remove any elements not in the other list (but maintain order)
pronITmp = pronI
pronATmp = pronA
@@ -244,7 +233,7 @@ def alignPronunciations(pronI, pronA):
startA = pronA.index(phone, startA)
startI = pronI.index(phone, startI)
sequenceIndexListA.append(startA)
sequenceIndexListA.append(startA)
sequenceIndexListI.append(startI)
# An index on the tail of both will be used to create output strings
@@ -257,14 +246,16 @@ def alignPronunciations(pronI, pronA):
for x in xrange(len(sequenceIndexListA)):
indexA = sequenceIndexListA[x]
indexI = sequenceIndexListI[x]
if indexA < indexI :
if indexA < indexI:
for x in xrange(indexI - indexA):
pronA.insert(indexA, "''")
sequenceIndexListA = [val + indexI - indexA for val in sequenceIndexListA]
sequenceIndexListA = [val + indexI - indexA
for val in sequenceIndexListA]
elif indexA > indexI:
for x in xrange(indexA - indexI):
pronI.insert(indexI, "''")
sequenceIndexListI = [val + indexA - indexI for val in sequenceIndexListI]
sequenceIndexListI = [val + indexA - indexI
for val in sequenceIndexListI]
return pronI, pronA
@@ -273,11 +264,12 @@ def findBestSyllabification(isleDict, wordText, actualPronunciationList):
'''
Find the best syllabification for a word
First find the closest pronunciation to a given pronunciation. Then take
the syllabification for that pronunciation and map it onto the
First find the closest pronunciation to a given pronunciation. Then take
the syllabification for that pronunciation and map it onto the
input pronunciation.
'''
retList = _findBestPronunciation(isleDict, wordText, actualPronunciationList)
retList = _findBestPronunciation(isleDict, wordText,
actualPronunciationList)
isleWordList, alignedAPronList, alignedSyllableList, bestIndex = retList
alignedPhoneList = alignedAPronList[bestIndex]
@@ -285,8 +277,8 @@ def findBestSyllabification(isleDict, wordText, actualPronunciationList):
syllabification = isleWordList[bestIndex][0]
stressedIndex = isleWordList[bestIndex][1]
stressedSyllable, syllableList = _syllabifyPhones(alignedPhoneList,
alignedSyllables,
stressedSyllable, syllableList = _syllabifyPhones(alignedPhoneList,
alignedSyllables,
stressedIndex)
return stressedSyllable, syllableList, syllabification, stressedIndex
@@ -298,9 +290,7 @@ def findClosestPronunciation(isleDict, wordText, aPron):
'''
retList = _findBestPronunciation(isleDict, wordText, aPron)
isleWordList, actualPronunciationList, bestIndex = retList
isleWordList = retList[0]
bestIndex = retList[3]
return isleWordList[bestIndex]
+6 -6
View File
@@ -20,13 +20,13 @@ firstEntry = lookupResults[0]
firstSyllableList = firstEntry[0]
firstStressList = firstEntry[1]
print searchWord
print firstSyllableList, firstStressList # 3rd syllable carries stress
print(searchWord)
print(firstSyllableList, firstStressList) # 3rd syllable carries stress
# Here we determine the syllabification of a word, as it was said.
# (Of course, this is just a guess)
print '-'*50
print('-'*50)
searchWord = 'another'
anotherPhoneList = ['n', '@', 'th', 'r']
@@ -37,8 +37,8 @@ returnList = pronunciationtools.findBestSyllabification(isleDict,
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
print searchWord
print anotherPhoneList
print syllableList # We can see the first syllable was elided
print(searchWord)
print(anotherPhoneList)
print(syllableList) # We can see the first syllable was elided
+4 -1
View File
@@ -20,7 +20,10 @@ path = join('.', 'files')
path = "/Users/tmahrt/Dropbox/workspace/pysle/test/files"
tg = praatio.openTextGrid(join(path, "pumpkins.TextGrid"))
isleDict = isletool.LexicalTool('/Users/tmahrt/Dropbox/workspace/pysle/test/islev2.txt') # Needs the full path to the file
# Needs the full path to the file
islevPath = '/Users/tmahrt/Dropbox/workspace/pysle/test/islev2.txt'
isleDict = isletool.LexicalTool(islevPath)
# Get the syllabification tiers and add it to the textgrid
syllableTG = praattools.syllabifyTextgrid(isleDict, tg, "word", "phone",