4 Commits

Author SHA1 Message Date
timmahrt fad0dd2902 SPEED BOOST: Now word lookup ~65 times faster.
Used to iterate through the isle text file for each search.
Now builds a dictionary of the form{word:pronunciation list,}
2015-01-29 23:02:13 -06:00
timmahrt 475053eee2 DOCUMENTATION: Moved the project description up. 2014-10-23 15:53:57 -05:00
timmahrt 08f8e859cc DOCUMENTATION: Added link to praatio. Added table of contents.
Also added some clarification about the requirements.
2014-10-23 15:51:35 -05:00
timmahrt 9cd6a7e68b DOCUMENTATION: Added/cleaned up the readme file
Added a new section 'common use cases' since I
get that question a lot.
2014-10-23 15:41:02 -05:00
2 changed files with 87 additions and 51 deletions
+33 -3
View File
@@ -11,6 +11,32 @@ pronunciations (e.g. a list of phones someone said versus a standard or
canonical dictionary pronunciation).
.. sectnum::
.. contents::
Common Use Cases
================
What can you do with this library?
- look up the list of phones and syllables for canonical pronunciations
of a word::
pysle.isletool.LexicalTool.lookup('cat')
- map an actual pronunciation to a dictionary pronunciation (can be used
to automatically find speech errors)::
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',])
- automatically syllabify a praat textgrid containing words and phones
(e.g. force-aligned text) -- requires my
`praatIO <https://github.com/timmahrt/praatIO>`_ library::
pysle.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
Requirements
================
@@ -20,10 +46,15 @@ Requirements
`ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_
`Direct link to the ISLEX file used in this project
<http://www.isle.illinois.edu/sst/data/dict/islev2.txt)>`_
<http://www.isle.illinois.edu/sst/data/dict/islev2.txt>`_ (islev2.txt)
- ``Python 2.7.*`` or above
- The `praatIO <https://github.com/timmahrt/praatIO>`_ library is required IF
you want to use the textgrid functionality. It is not required
for normal use.
Installation
================
@@ -61,7 +92,6 @@ and another::
print syllableList
>> [["''"], ['n', '@'], ['th', 'r']]
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
Please see \test for example usage
Please see \\test for example usage
+54 -48
View File
@@ -17,70 +17,75 @@ class WordNotInISLE(Exception):
return "Word '%s' not in ISLE dictionary. Please add it to continue." % self.word
class LexicalTool():
    '''
    Looks up the pronunciations of words listed in an ISLE text file

    The file is read once, when the object is created, into a dictionary
    of the form {word: [pronunciation string, ...]} so that each lookup
    is a single dictionary access rather than a scan of the file.
    '''

    def __init__(self, islePath):
        '''
        islePath - path to the ISLE dictionary text file
        '''
        self.islePath = islePath
        # Not populated anywhere in this class; kept only so that code
        # which reads the attribute does not break
        self.pronDict = None
        self.data = self._buildDict()

    def _buildDict(self):
        '''
        Builds the isle textfile into a dictionary for fast searching

        Every non-blank row is split at its first space into a word and
        the remainder of the row (the pronunciation string).  Anything
        from the first "(" onwards is dropped from the word, so all
        rows sharing the same base word are collected, in file order,
        under a single key.

        Raises ValueError if a non-blank row contains no space.
        '''
        pronDict = {}
        with open(self.islePath, "r") as fd:
            for row in fd:
                row = row.rstrip("\n")

                # Blank rows (e.g. from a trailing newline at the end of
                # the file) carry no entry and cannot be unpacked below
                if row.strip() == "":
                    continue

                word, pronunciation = row.split(" ", 1)
                word = word.split("(")[0]
                pronDict.setdefault(word, []).append(pronunciation)

        return pronDict

    def lookup(self, word):
        '''
        Lookup a word and receive a list of syllables and stressInfo

        Returns one parsed entry (as produced by _parsePronunciation)
        for each pronunciation stored for the word.

        Raises WordNotInISLE if the word is not in the dictionary.
        '''
        # All words must be lowercase with no extraneous whitespace
        word = word.lower().strip()

        # Find the pronunciation strings stored for this word
        pronList = self.data.get(word)
        if pronList is None:
            raise WordNotInISLE(word)

        return [_parsePronunciation(pronunciationStr)
                for pronunciationStr in pronList]
def _parsePronunciation(pronunciationStr):
    '''
    Parses the pronunciation string

    Returns the list of syllables and a list of primary and
    secondary stress locations

    The text between the first and second "#" is split into syllables
    on " . " and each syllable is split into its items on single
    spaces.  The stress list holds syllable indices: a syllable that
    contains "'" (primary stress) is put at the front of the list and a
    syllable that contains '"' (secondary stress) is put at the end.

    Raises IndexError if the string contains no "#".
    '''
    syllableTxt = pronunciationStr.split("#")[1].strip()
    syllableList = syllableTxt.split(' . ')

    # Find stress
    stressList = []
    for i, syllable in enumerate(syllableList):
        # Primary stress
        if "'" in syllable:
            stressList.insert(0, i)
        # Secondary stress
        elif '"' in syllable:
            stressList.append(i)

    syllableList = [x.split(" ") for x in syllableList]

    return syllableList, stressList
def getNumPhones(isleDict, label, maxFlag):
'''
@@ -102,7 +107,8 @@ def getNumPhones(isleDict, label, maxFlag):
# likely than another, we take the average of all of them
phoneCountList = []
for syllableList, stressIndex in phoneListOfLists:
phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList]))
phoneCountList.append(len([phon for phoneList in syllableList for
phon in phoneList]))
# The average number of phones for all possible pronunciations
# of this word