mirror of
https://github.com/wassname/pysle.git
synced 2026-06-27 16:10:05 +08:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fad0dd2902 | |||
| 475053eee2 | |||
| 08f8e859cc | |||
| 9cd6a7e68b |
+33
-3
@@ -11,6 +11,32 @@ pronunciations (e.g. a list of phones someone said versus a standard or
|
||||
canonical dictionary pronunciation).
|
||||
|
||||
|
||||
.. sectnum::
|
||||
.. contents::
|
||||
|
||||
|
||||
Common Use Cases
|
||||
================
|
||||
|
||||
What can you do with this library?
|
||||
|
||||
- look up the list of phones and syllables for canonical pronunciations
|
||||
of a word::
|
||||
|
||||
pysle.isletool.LexicalTool(islePath).lookup('cat')
|
||||
|
||||
- map an actual pronunciation to a dictionary pronunciation (can be used
|
||||
to automatically find speech errors)::
|
||||
|
||||
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',])
|
||||
|
||||
- automatically syllabify a praat textgrid containing words and phones
|
||||
(e.g. force-aligned text) -- requires my
|
||||
`praatIO <https://github.com/timmahrt/praatIO>`_ library::
|
||||
|
||||
pysle.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
|
||||
|
||||
|
||||
Requirements
|
||||
================
|
||||
|
||||
@@ -20,10 +46,15 @@ Requirements
|
||||
`ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_
|
||||
|
||||
`Direct link to the ISLEX file used in this project
|
||||
<http://www.isle.illinois.edu/sst/data/dict/islev2.txt)>`_
|
||||
<http://www.isle.illinois.edu/sst/data/dict/islev2.txt>`_ (islev2.txt)
|
||||
|
||||
- ``Python 2.7.*`` or above
|
||||
|
||||
- The `praatIO <https://github.com/timmahrt/praatIO>`_ library is required IF
|
||||
you want to use the textgrid functionality. It is not required
|
||||
for normal use.
|
||||
|
||||
|
||||
Installation
|
||||
================
|
||||
|
||||
@@ -61,7 +92,6 @@ and another::
|
||||
print syllableList
|
||||
>> [["''"], ['n', '@'], ['th', 'r']]
|
||||
|
||||
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
|
||||
|
||||
Please see \test for example usage
|
||||
Please see \\test for example usage
|
||||
|
||||
|
||||
+54
-48
@@ -17,70 +17,75 @@ class WordNotInISLE(Exception):
|
||||
return "Word '%s' not in ISLE dictionary. Please add it to continue." % self.word
|
||||
|
||||
|
||||
|
||||
class LexicalTool():
    '''
    Loads an ISLE dictionary file and serves pronunciation lookups from it

    The file at islePath is read once, when the instance is created, and
    indexed by word in self.data.
    '''

    def __init__(self, islePath):
        '''
        islePath - path to the ISLE dictionary text file to load
        '''
        self.islePath = islePath

        # Not populated anywhere in this class; kept so the attribute
        # still exists for any code that reads it
        self.pronDict = None

        self.data = self._buildDict()

    def _buildDict(self):
        '''
        Builds the isle textfile into a dictionary for fast searching

        Returns a dict that maps each word to the list of its pronunciation
        strings (the remainder of the row after the first space), in the
        order in which they appear in the file.

        Raises ValueError if a non-blank row contains no space.
        '''
        lexicon = {}

        # The context manager guarantees that the file handle is released
        with open(self.islePath, "r") as isleFile:
            rowList = isleFile.read().split("\n")

        for row in rowList:
            # Blank rows (e.g. the empty row produced by a trailing
            # newline) carry no entry and cannot be unpacked below
            if not row.strip():
                continue

            word, pronunciation = row.split(" ", 1)

            # Everything from the first "(" onward is dropped so that all
            # rows for the same word share a single key
            word = word.split("(")[0]

            lexicon.setdefault(word, []).append(pronunciation)

        return lexicon

    def lookup(self, word):
        '''
        Lookup a word and receive a list of syllables and stressInfo

        word - the word to look up; it is lowercased and stripped of
               surrounding whitespace before the search

        Returns a list holding one _parsePronunciation() result for each
        pronunciation stored for the word.

        Raises WordNotInISLE if the word has no entry in the dictionary.
        '''
        # All words must be lowercase with no extraneous whitespace
        word = word.lower().strip()

        pronList = self.data.get(word)
        if pronList is None:
            raise WordNotInISLE(word)

        return [_parsePronunciation(pronunciationStr)
                for pronunciationStr in pronList]
|
||||
|
||||
|
||||
def _parsePronunciation(pronunciationStr):
|
||||
'''
|
||||
Parses the pronunciation string
|
||||
|
||||
Returns the list of syllables and a list of primary and
|
||||
secondary stress locations
|
||||
'''
|
||||
syllableTxt = pronunciationStr.split("#")[1].strip()
|
||||
syllableList = [x for x in syllableTxt.split(' . ')]
|
||||
|
||||
# Find stress
|
||||
stressList = []
|
||||
for i, syllable in enumerate(syllableList):
|
||||
# Primary stress
|
||||
if "'" in syllable:
|
||||
stressList.insert(0, i)
|
||||
# Secondary stress
|
||||
elif '"' in syllable:
|
||||
stressList.append(i)
|
||||
|
||||
syllableList = [x.split(" ") for x in syllableList]
|
||||
|
||||
return syllableList, stressList
|
||||
|
||||
|
||||
def getNumPhones(isleDict, label, maxFlag):
|
||||
'''
|
||||
|
||||
@@ -102,7 +107,8 @@ def getNumPhones(isleDict, label, maxFlag):
|
||||
# likely than another, we take the average of all of them
|
||||
phoneCountList = []
|
||||
for syllableList, stressIndex in phoneListOfLists:
|
||||
phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList]))
|
||||
phoneCountList.append(len([phon for phoneList in syllableList for
|
||||
phon in phoneList]))
|
||||
|
||||
# The average number of phones for all possible pronunciations
|
||||
# of this word
|
||||
|
||||
Reference in New Issue
Block a user