mirror of
https://github.com/wassname/pysle.git
synced 2026-07-05 17:30:28 +08:00
Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9e212125b1 | |||
| 1b1903bc0b | |||
| 5e64deebe6 | |||
| bce3c8ff23 | |||
| 4056b105c9 | |||
| d88ff7d8d9 | |||
| 4cc4bf85ec | |||
| 4c1a26ed03 | |||
| ac8643678b | |||
| b76454f626 | |||
| ea0bc5c5cd | |||
| 5d70367bfc | |||
| 81257bdfaf | |||
| 88f79d63e8 | |||
| 2dcb92217d | |||
| a36d7c8d17 | |||
| 65ac652dea | |||
| ee08c347d5 | |||
| c16c68a6ac | |||
| bc4f19c74c | |||
| c19cde7165 | |||
| 38ebc7f3f9 | |||
| 102e8a7488 | |||
| 6b786cd00a | |||
| fb1e638cb8 | |||
| e5acdfce30 | |||
| d47c312de7 | |||
| 303d9bfcf2 | |||
| 9c0ccd5748 | |||
| 393182500e | |||
| 985d68da6c | |||
| 0e53ed654e | |||
| ce633d0590 | |||
| e2a2025f5b | |||
| c10e3cf05f | |||
| 06222bf176 | |||
| 6353e0172e | |||
| fad0dd2902 | |||
| 475053eee2 | |||
| 08f8e859cc | |||
| 9cd6a7e68b |
+103
-10
@@ -3,6 +3,9 @@
|
|||||||
pysle
|
pysle
|
||||||
---------
|
---------
|
||||||
|
|
||||||
|
.. image:: https://img.shields.io/badge/license-MIT-blue.svg?
|
||||||
|
:target: http://opensource.org/licenses/MIT
|
||||||
|
|
||||||
Pronounced like 'p' + 'isle'.
|
Pronounced like 'p' + 'isle'.
|
||||||
|
|
||||||
An interface for the ILSEX (international speech lexicon) dictionary,
|
An interface for the ILSEX (international speech lexicon) dictionary,
|
||||||
@@ -11,26 +14,96 @@ pronunciations (e.g. a list of phones someone said versus a standard or
|
|||||||
canonical dictionary pronunciation).
|
canonical dictionary pronunciation).
|
||||||
|
|
||||||
|
|
||||||
|
.. sectnum::
|
||||||
|
.. contents::
|
||||||
|
|
||||||
|
|
||||||
|
Common Use Cases
|
||||||
|
================
|
||||||
|
|
||||||
|
What can you do with this library?
|
||||||
|
|
||||||
|
- look up the list of phones and syllables for canonical pronunciations
|
||||||
|
of a word::
|
||||||
|
|
||||||
|
pysle.isletool.LexicalTool.lookup('cat')
|
||||||
|
|
||||||
|
- map an actual pronunciation to a dictionary pronunciation (can be used
|
||||||
|
to automatically find speech errors)::
|
||||||
|
|
||||||
|
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['k', 'æ',])
|
||||||
|
|
||||||
|
- automatically syllabify a praat textgrid containing words and phones
|
||||||
|
(e.g. force-aligned text) -- requires my
|
||||||
|
`praatIO <https://github.com/timmahrt/praatIO>`_ library::
|
||||||
|
|
||||||
|
pysle.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
|
||||||
|
|
||||||
|
- search for words based on pronunciation::
|
||||||
|
|
||||||
|
e.g. Words that start with a sound, or have a sound word medially, or
|
||||||
|
in stressed vowel position, etc.
|
||||||
|
|
||||||
|
see /tests/dictionary_search.py
|
||||||
|
|
||||||
|
Major revisions
|
||||||
|
================
|
||||||
|
|
||||||
|
Ver 1.4 (July 9, 2016)
|
||||||
|
|
||||||
|
- added search functionality
|
||||||
|
|
||||||
|
- ported code to use the new unicode IPA-based isledict
|
||||||
|
(the old one was ascii)
|
||||||
|
|
||||||
|
Ver 1.3 (March 15, 2016)
|
||||||
|
|
||||||
|
- added indices for stressed vowels
|
||||||
|
|
||||||
|
Ver 1.2 (June 20, 2015)
|
||||||
|
|
||||||
|
- Python 3.x support
|
||||||
|
|
||||||
|
Ver 1.1 (January 30, 2015)
|
||||||
|
|
||||||
|
- word lookup ~65 times faster
|
||||||
|
|
||||||
|
Ver 1.0 (October 23, 2014)
|
||||||
|
|
||||||
|
- first public release.
|
||||||
|
|
||||||
|
|
||||||
Requirements
|
Requirements
|
||||||
================
|
================
|
||||||
|
|
||||||
- Before you use this library (before or after installing it) you will need
|
- Before you use this library (before or after installing it) you will need
|
||||||
to download the ILSEX dictionary. It can be downloaded here:
|
to download the ISLEX dictionary. It can be downloaded here under the
|
||||||
|
section 'English' linked under the text 'English Pronlex'
|
||||||
|
(with a file name of ISLEdict.txt):
|
||||||
|
|
||||||
`ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_
|
`ISLEX project page <http://isle.illinois.edu/sst/data/g2ps/>`_
|
||||||
|
|
||||||
`Direct link to the ISLEX file used in this project
|
`Direct link to the ISLEX file used in this project
|
||||||
<http://www.isle.illinois.edu/sst/data/dict/islev2.txt)>`_
|
<http://isle.illinois.edu/sst/data/g2ps/English/ISLEdict.txt>`_ (ISLEdict.txt)
|
||||||
|
|
||||||
- ``Python 2.7.*`` or above
|
- ``Python 2.7.*`` or above
|
||||||
|
|
||||||
|
- ``Python 3.3.*`` or above
|
||||||
|
|
||||||
|
- The `praatIO <https://github.com/timmahrt/praatIO>`_ library is required IF
|
||||||
|
you want to use the textgrid functionality. It is not required
|
||||||
|
for normal use.
|
||||||
|
|
||||||
|
|
||||||
Installation
|
Installation
|
||||||
================
|
================
|
||||||
|
|
||||||
From a command-line shell, navigate to the directory this is located in
|
If you are on Windows, you can use the installer found here (check that it is up to date though)
|
||||||
and type::
|
`Windows installer <http://www.timmahrt.com/python_installers>`_
|
||||||
|
|
||||||
python setup.py install
|
Otherwise, to manually install, after downloading the source from github, from a command-line shell, navigate to the directory containing setup.py and type::
|
||||||
|
|
||||||
|
python setup.py install
|
||||||
|
|
||||||
If python is not in your path, you'll need to enter the full path e.g.::
|
If python is not in your path, you'll need to enter the full path e.g.::
|
||||||
|
|
||||||
@@ -45,7 +118,7 @@ Here is a typical common usage::
|
|||||||
from pysle import isle
|
from pysle import isle
|
||||||
isleDict = isle.LexicalTool('C:\islev2.dict')
|
isleDict = isle.LexicalTool('C:\islev2.dict')
|
||||||
print isleDict.lookup('catatonic')[0] # Get the first pronunciation
|
print isleDict.lookup('catatonic')[0] # Get the first pronunciation
|
||||||
>> [['kh', '@,'], ['t_(', '&'], ['th', "A'"], ['n', 'I', 'kh']] [2]
|
>> [['k', 'ˌæ'], ['t˺', 'ə'], ['t', 'ˈɑ'], ['n', 'ɪ', 'k']] [2, 0]
|
||||||
|
|
||||||
and another::
|
and another::
|
||||||
|
|
||||||
@@ -53,7 +126,7 @@ and another::
|
|||||||
from psyle import pronunciationTools
|
from pysle import pronunciationTools
|
||||||
|
|
||||||
searchWord = 'another'
|
searchWord = 'another'
|
||||||
anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced
|
anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced (ASCII or IPA ok here)
|
||||||
|
|
||||||
returnList = pronunciationTools.findBestSyllabification(isleDict,
|
returnList = pronunciationTools.findBestSyllabification(isleDict,
|
||||||
searchWord,
|
searchWord,
|
||||||
@@ -61,7 +134,27 @@ and another::
|
|||||||
print syllableList
|
print syllableList
|
||||||
>> [["''"], ['n', '@'], ['th', 'r']]
|
>> [["''"], ['n', '@'], ['th', 'r']]
|
||||||
|
|
||||||
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
|
|
||||||
|
|
||||||
Please see \test for example usage
|
Please see \\examples for example usage
|
||||||
|
|
||||||
|
|
||||||
|
Citing pysle
|
||||||
|
===============
|
||||||
|
|
||||||
|
Pysle is general purpose code and doesn't need to be cited
|
||||||
|
(you should cite the
|
||||||
|
`ISLEX project <http://isle.illinois.edu/sst/data/g2ps/>`_
|
||||||
|
instead) but if you would like to, it can be cited like so:
|
||||||
|
|
||||||
|
Tim Mahrt. Pysle. https://github.com/timmahrt/pysle, 2016.
|
||||||
|
|
||||||
|
|
||||||
|
Acknowledgements
|
||||||
|
================
|
||||||
|
|
||||||
|
Development of Pysle was possible thanks to NSF grant **IIS 07-03624**
|
||||||
|
to Jennifer Cole and Mark Hasegawa-Johnson, NSF grant **BCS 12-51343**
|
||||||
|
to Jennifer Cole, José Hualde, and Caroline Smith, and
|
||||||
|
to the A*MIDEX project (n° **ANR-11-IDEX-0001-02**) to James Sneed German
|
||||||
|
funded by the Investissements d'Avenir French Government program, managed
|
||||||
|
by the French National Research Agency (ANR).
|
||||||
|
|||||||
+288
-50
@@ -1,84 +1,321 @@
|
|||||||
|
#encoding: utf-8
|
||||||
'''
|
'''
|
||||||
Created on Oct 11, 2012
|
Created on Oct 11, 2012
|
||||||
|
|
||||||
@author: timmahrt
|
@author: timmahrt
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
|
||||||
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>',]
|
|
||||||
|
charList = [u'#', u'.', u'aʊ', u'b', u'd', u'dʒ', u'ei', u'f', u'g',
|
||||||
|
u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'oʊ', u'p',
|
||||||
|
u'r', u's', u't', u'tʃ', u'u', u'v', u'w', u'z', u'æ',
|
||||||
|
u'ð', u'ŋ', u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
|
||||||
|
u'ɪ', u'ɵ', u'ɹ', u'ʃ', u'ʊ', u'ʒ', u'æ', u'ʌ', ]
|
||||||
|
|
||||||
|
diacriticList = [u'˺', u'ˌ', u'̩', u'̃', ]
|
||||||
|
|
||||||
|
vowelList = [u'aʊ', u'ei', u'i', u'oʊ', u'u', u'æ',
|
||||||
|
u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
|
||||||
|
u'ɪ', u'ʊ', u'ʌ', ]
|
||||||
|
|
||||||
|
|
||||||
|
def isVowel(char):
    '''
    Returns True if any entry of vowelList occurs as a substring of char

    The test is substring-based, so a phone string that carries extra
    markers around the vowel symbol is still reported as a vowel.
    '''
    # A generator lets any() stop at the first hit instead of building
    # the complete list of comparisons first
    return any(vowel in char for vowel in vowelList)
|
||||||
|
|
||||||
|
|
||||||
|
def sequenceMatch(matchChar, searchStr):
    '''
    Returns True if matchChar is contained in searchStr
    '''
    isContained = matchChar in searchStr
    return isContained
|
||||||
|
|
||||||
|
|
||||||
class WordNotInISLE(Exception):
|
class WordNotInISLE(Exception):
|
||||||
|
|
||||||
def __init__(self, word):
|
def __init__(self, word):
|
||||||
|
super(WordNotInISLE, self).__init__()
|
||||||
self.word = word
|
self.word = word
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "Word '%s' not in ISLE dictionary. Please add it to continue." % self.word
|
return ("Word '%s' not in ISLE dictionary. "
|
||||||
|
"Please add it to continue." % self.word)
|
||||||
|
|
||||||
|
|
||||||
class LexicalTool():
|
class LexicalTool():
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, islePath):
|
def __init__(self, islePath):
|
||||||
self.islePath = islePath
|
self.islePath = islePath
|
||||||
self.data = None
|
self.data = self._buildDict()
|
||||||
self.pronDict = None
|
|
||||||
|
|
||||||
|
def _buildDict(self):
|
||||||
|
'''
|
||||||
|
Builds the isle textfile into a dictionary for fast searching
|
||||||
|
'''
|
||||||
|
lexDict = {}
|
||||||
|
with io.open(self.islePath, "r", encoding='utf-8') as fd:
|
||||||
|
wordList = [line.rstrip('\n') for line in fd]
|
||||||
|
|
||||||
|
for row in wordList:
|
||||||
|
word, pronunciation = row.split(" ", 1)
|
||||||
|
word = word.split("(")[0]
|
||||||
|
|
||||||
|
lexDict.setdefault(word, [])
|
||||||
|
lexDict[word].append(pronunciation)
|
||||||
|
|
||||||
|
return lexDict
|
||||||
|
|
||||||
def lookup(self, word):
|
def lookup(self, word):
|
||||||
|
'''
|
||||||
|
Lookup a word and receive a list of syllables and stressInfo
|
||||||
|
'''
|
||||||
|
|
||||||
# All words must be lowercase with no extraneous whitespace
|
# All words must be lowercase with no extraneous whitespace
|
||||||
word = word.lower()
|
word = word.lower()
|
||||||
word = word.strip()
|
word = word.strip()
|
||||||
|
|
||||||
# Find indicies in the dictionary
|
pronList = self.data.get(word, None)
|
||||||
|
|
||||||
if self.data == None:
|
if pronList is None:
|
||||||
self.data = open(self.islePath, "r").read()
|
raise WordNotInISLE(word)
|
||||||
|
else:
|
||||||
|
pronList = [_parsePronunciation(pronunciationStr)
|
||||||
|
for pronunciationStr in pronList]
|
||||||
|
|
||||||
wordList = []
|
return pronList
|
||||||
searchIndex = 0
|
|
||||||
|
def search(self, matchStr, numSyllables=None, wordInitial='ok',
|
||||||
|
wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
|
||||||
|
multiword='ok'):
|
||||||
|
return search(self.data.items(), matchStr, numSyllables=numSyllables,
|
||||||
|
wordInitial=wordInitial, wordFinal=wordFinal,
|
||||||
|
spanSyllable=spanSyllable,
|
||||||
|
stressedSyllable=stressedSyllable,
|
||||||
|
multiword=multiword)
|
||||||
|
|
||||||
|
|
||||||
|
def _prepRESearchStr(matchStr, wordInitial='ok', wordFinal='ok',
|
||||||
|
spanSyllable='ok', stressedSyllable='ok'):
|
||||||
|
'''
|
||||||
|
Prepares a user's RE string for a search
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Protect sounds that are two characters
|
||||||
|
# After this we can assume that each character represents a sound
|
||||||
|
# (We'll revert back when we're done processing the RE)
|
||||||
|
replList = [(u'ei', u'9'), (u'tʃ', u'='), (u'oʊ', u'~'),
|
||||||
|
(u'dʒ', u'@'), (u'aʊ', u'%'), (u'ɑɪ', u'&'),
|
||||||
|
(u'ɔi', u'$')]
|
||||||
|
|
||||||
|
# Add to the replList
|
||||||
|
currentReplNum = 0
|
||||||
|
startI = 0
|
||||||
|
for left, right in (('(', ')'), ('[', ']')):
|
||||||
while True:
|
while True:
|
||||||
# (The +1 skips over the "\n" which marks the start of every word)
|
|
||||||
startIndex = self.data.find("\n"+word + "(", searchIndex) + 1
|
|
||||||
|
|
||||||
# find() returns -1 if it does not find anything, but
|
|
||||||
# note that we added 1 to the return value
|
|
||||||
try:
|
try:
|
||||||
assert(startIndex != 0)
|
i = matchStr.index(left, startI)
|
||||||
except AssertionError:
|
except ValueError:
|
||||||
if searchIndex == 0:
|
break
|
||||||
raise WordNotInISLE(word)
|
j = matchStr.index(right, i) + 1
|
||||||
else:
|
replList.append((matchStr[i:j], str(currentReplNum)))
|
||||||
break
|
currentReplNum += 1
|
||||||
|
startI = j
|
||||||
|
|
||||||
endIndex = self.data.find("\n", startIndex)
|
for charA, charB in replList:
|
||||||
|
matchStr = matchStr.replace(charA, charB)
|
||||||
|
|
||||||
searchIndex = endIndex
|
# Characters to check between all other characters
|
||||||
wordList.append((startIndex, endIndex))
|
# Don't check between all other characters if the character is already
|
||||||
|
# in the search string or
|
||||||
|
interleaveStr = None
|
||||||
|
stressOpt = (stressedSyllable == 'ok' or stressedSyllable == 'only')
|
||||||
|
spanOpt = (spanSyllable == 'ok' or spanSyllable == 'only')
|
||||||
|
if stressOpt and spanOpt:
|
||||||
|
interleaveStr = u"\.?ˈ?"
|
||||||
|
elif stressOpt:
|
||||||
|
interleaveStr = u"ˈ?"
|
||||||
|
elif spanOpt:
|
||||||
|
interleaveStr = u"\.?"
|
||||||
|
|
||||||
returnList = []
|
if interleaveStr is not None:
|
||||||
for startIndex, endIndex in wordList:
|
matchStr = interleaveStr.join(matchStr)
|
||||||
isleWord = self.data[startIndex:endIndex]
|
|
||||||
syllableTxt = isleWord.split("#")[1].strip()
|
|
||||||
syllableList = [x for x in syllableTxt.split(' . ')]
|
|
||||||
|
|
||||||
# Find stress
|
# Setting search boundaries
|
||||||
stressList = []
|
# We search on '[^\.#]' and not '.' so that the search doesn't span
|
||||||
for i, syllable in enumerate(syllableList):
|
# multiple syllables or words
|
||||||
# Primary stress
|
if wordInitial == 'only':
|
||||||
if "'" in syllable:
|
matchStr = u'#' + matchStr
|
||||||
stressList.insert(0, i)
|
elif wordInitial == 'no':
|
||||||
# Secondary stress
|
# Match the closest preceding syllable. If there is none, look
|
||||||
elif '"' in syllable:
|
# for word boundary plus at least one other character
|
||||||
stressList.append(i)
|
matchStr = u'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
|
||||||
|
else:
|
||||||
|
matchStr = u'[#\.][^\.#]*?' + matchStr
|
||||||
|
|
||||||
syllableList = [x.split(" ") for x in syllableList]
|
if wordFinal == 'only':
|
||||||
returnList.append((syllableList, stressList))
|
matchStr = matchStr + u'#'
|
||||||
|
elif wordFinal == 'no':
|
||||||
|
matchStr = matchStr + u"(?:[^\.#]*?\.|[^\.#]+?#)"
|
||||||
|
else:
|
||||||
|
matchStr = matchStr + u'[^\.#]*?[#\.]'
|
||||||
|
|
||||||
return returnList
|
# For sounds that are designated two characters, prevent
|
||||||
|
# detecting those sounds if the user wanted a sound
|
||||||
|
# designated by one of the contained characters
|
||||||
|
|
||||||
|
# Forward search ('a' and not 'ab')
|
||||||
|
insertList = []
|
||||||
|
for charA, charB in [(u'e', u'i'), (u't', u'ʃ'), (u'd', u'ʒ'),
|
||||||
|
(u'o', u'ʊ'), (u'a', u'ʊ|ɪ'), (u'ɔ', u'i'), ]:
|
||||||
|
startI = 0
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
i = matchStr.index(charA, startI)
|
||||||
|
except ValueError:
|
||||||
|
break
|
||||||
|
if matchStr[i + 1] != charB:
|
||||||
|
forwardStr = u'(?!%s)' % charB
|
||||||
|
# matchStr = matchStr[:i + 1] + forwardStr + matchStr[i + 1:]
|
||||||
|
startI = i + 1 + len(forwardStr)
|
||||||
|
insertList.append((i + 1, forwardStr))
|
||||||
|
|
||||||
|
# Backward search ('b' and not 'ab')
|
||||||
|
for charA, charB in [(u't', u'ʃ'), (u'd', u'ʒ'),
|
||||||
|
(u'a|o', u'ʊ'), (u'e|ɔ', u'i'), (u'ɑ' u'ɪ'), ]:
|
||||||
|
startI = 0
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
i = matchStr.index(charB, startI)
|
||||||
|
except ValueError:
|
||||||
|
break
|
||||||
|
if matchStr[i - 1] != charA:
|
||||||
|
backStr = u'(?<!%s)' % charA
|
||||||
|
# matchStr = matchStr[:i] + backStr + matchStr[i:]
|
||||||
|
startI = i + 1 + len(backStr)
|
||||||
|
insertList.append((i, backStr))
|
||||||
|
|
||||||
|
insertList.sort()
|
||||||
|
for i, insertStr in insertList[::-1]:
|
||||||
|
matchStr = matchStr[:i] + insertStr + matchStr[i:]
|
||||||
|
|
||||||
|
# Revert the special sounds back from 1 character to 2 characters
|
||||||
|
for charA, charB in replList:
|
||||||
|
matchStr = matchStr.replace(charB, charA)
|
||||||
|
|
||||||
|
# Replace special characters
|
||||||
|
replDict = {"D": u"(?:t(?!ʃ)|d(?!ʒ)|[sz])", # dentals
|
||||||
|
"F": u"[ʃʒfvszɵðh]", # fricatives
|
||||||
|
"S": u"(?:t(?!ʃ)|d(?!ʒ)|[pbkg])", # stops
|
||||||
|
"N": u"[nmŋ]", # nasals
|
||||||
|
"R": u"[rɝɚ]", # rhotics
|
||||||
|
"V": u"(?:aʊ|ei|oʊ|ɑɪ|ɔi|[iuæɑɔəɛɪʊʌ]):?", # vowels
|
||||||
|
"B": u"\.", # syllable boundary
|
||||||
|
}
|
||||||
|
|
||||||
|
for char, replStr in replDict.items():
|
||||||
|
matchStr = matchStr.replace(char, replStr)
|
||||||
|
|
||||||
|
return matchStr
|
||||||
|
|
||||||
|
|
||||||
|
def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
           wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
           multiword='ok'):
    '''
    Searches for matching words in the dictionary with regular expressions

    searchList is an iterable of (word, pronunciationList) pairs, where
    each pronunciation is a raw pronunciation string.

    If numSyllables is given, only pronunciations with exactly that many
    syllables (number of '.' markers plus one) are considered.

    wordInitial, wordFinal, spanSyllable, stressedSyllable, and multiword
    can take three different values: 'ok', 'only', or 'no'.

    Special search characters:
    'D' - any dental; 'F' - any fricative; 'S' - any stop
    'V' - any vowel; 'N' - any nasal; 'R' - any rhotic
    '#' - word boundary
    'B' - syllable boundary
    '.' - anything

    For advanced queries:
    Regular expression syntax applies, so if you wanted to search for any
    word ending with a vowel or rhotic, matchStr = '(?:VR)#', '[VR]#', etc.

    Returns a sorted list of (word, matchingPronunciationList) tuples.
    '''
    # Run search for words

    matchStr = _prepRESearchStr(matchStr, wordInitial, wordFinal,
                                spanSyllable, stressedSyllable)

    compiledRE = re.compile(matchStr)
    retList = []
    for word, pronList in searchList:
        newPronList = []
        for pron in pronList:
            searchPron = pron.replace(",", "").replace(" ", "")

            # Ignore diacritics for now:
            # (a diacritic is only kept if the search pattern mentions it)
            for diacritic in diacriticList:
                if diacritic not in matchStr:
                    searchPron = searchPron.replace(diacritic, "")

            if numSyllables is not None:
                if numSyllables != searchPron.count('.') + 1:
                    continue

            # Is this a compound word?
            # (entries with exactly two '#' markers are skipped for 'only';
            # entries with more than two are skipped for 'no')
            if multiword == 'only':
                if searchPron.count('#') == 2:
                    continue
            elif multiword == 'no':
                if searchPron.count('#') > 2:
                    continue

            # Use the full text of every match.  findall() would return
            # only the group contents (or tuples of them) when the user's
            # pattern contains capturing groups, which would make the
            # stress and syllable-span checks below inspect the wrong text.
            matchList = [match.group(0)
                         for match in compiledRE.finditer(searchPron)]
            if not matchList:
                continue

            if stressedSyllable == 'only':
                if all(u"ˈ" not in match for match in matchList):
                    continue
            if stressedSyllable == 'no':
                if all(u"ˈ" in match for match in matchList):
                    continue

            # For syllable spanning, we check if there is a syllable
            # marker inside (not at the border) of the match.
            if spanSyllable == 'only':
                if all("." not in txt[1:-1] for txt in matchList):
                    continue
            if spanSyllable == 'no':
                if all("." in txt[1:-1] for txt in matchList):
                    continue

            newPronList.append(pron)

        if newPronList:
            retList.append((word, newPronList))

    retList.sort()
    return retList
|
||||||
|
|
||||||
|
|
||||||
|
def _parsePronunciation(pronunciationStr):
|
||||||
|
'''
|
||||||
|
Parses the pronunciation string
|
||||||
|
|
||||||
|
Returns the list of syllables and a list of primary and
|
||||||
|
secondary stress locations
|
||||||
|
'''
|
||||||
|
syllableTxt = pronunciationStr.split("#")[1].strip()
|
||||||
|
syllableList = [x.split() for x in syllableTxt.split(' . ')]
|
||||||
|
|
||||||
|
# Find stress
|
||||||
|
stressedSyllableList = []
|
||||||
|
stressedPhoneList = []
|
||||||
|
for i, syllable in enumerate(syllableList):
|
||||||
|
for j, phone in enumerate(syllable):
|
||||||
|
if u"ˈ" in phone:
|
||||||
|
stressedSyllableList.insert(0, i)
|
||||||
|
stressedPhoneList.insert(0, j)
|
||||||
|
break
|
||||||
|
elif u'ˌ' in phone:
|
||||||
|
stressedSyllableList.append(i)
|
||||||
|
stressedPhoneList.append(j)
|
||||||
|
|
||||||
|
return syllableList, stressedSyllableList, stressedPhoneList
|
||||||
|
|
||||||
|
|
||||||
def getNumPhones(isleDict, label, maxFlag):
|
def getNumPhones(isleDict, label, maxFlag):
|
||||||
@@ -94,23 +331,27 @@ def getNumPhones(isleDict, label, maxFlag):
|
|||||||
phoneListOfLists = isleDict.lookup(word)
|
phoneListOfLists = isleDict.lookup(word)
|
||||||
|
|
||||||
syllableCountList = []
|
syllableCountList = []
|
||||||
for syllableList, stressIndex in phoneListOfLists:
|
for row in phoneListOfLists:
|
||||||
|
syllableList = row[0]
|
||||||
syllableCountList.append(len(syllableList))
|
syllableCountList.append(len(syllableList))
|
||||||
|
|
||||||
# In ISLE, there can be multiple pronunciations for each word
|
# In ISLE, there can be multiple pronunciations for each word
|
||||||
# as we have no reason to believe one pronunciation is more
|
# as we have no reason to believe one pronunciation is more
|
||||||
# likely than another, we take the average of all of them
|
# likely than another, we take the average of all of them
|
||||||
phoneCountList = []
|
phoneCountList = []
|
||||||
for syllableList, stressIndex in phoneListOfLists:
|
for row in phoneListOfLists:
|
||||||
phoneCountList.append(len([phon for phoneList in syllableList for phon in phoneList]))
|
syllableList = row[0]
|
||||||
|
phoneCountList.append(len([phon for phoneList in syllableList for
|
||||||
|
phon in phoneList]))
|
||||||
|
|
||||||
# The average number of phones for all possible pronunciations
|
# The average number of phones for all possible pronunciations
|
||||||
# of this word
|
# of this word
|
||||||
if maxFlag == True:
|
if maxFlag is True:
|
||||||
syllableCount += max(syllableCountList)
|
syllableCount += max(syllableCountList)
|
||||||
phoneCount += max(phoneCountList)
|
phoneCount += max(phoneCountList)
|
||||||
else:
|
else:
|
||||||
syllableCount += sum(syllableCountList) / float(len(syllableCountList))
|
syllableCount += (sum(syllableCountList) /
|
||||||
|
float(len(syllableCountList)))
|
||||||
phoneCount += sum(phoneCountList) / float(len(phoneCountList))
|
phoneCount += sum(phoneCountList) / float(len(phoneCountList))
|
||||||
|
|
||||||
return syllableCount, phoneCount
|
return syllableCount, phoneCount
|
||||||
@@ -131,6 +372,3 @@ def findOODWords(isleDict, wordList):
|
|||||||
oodList.sort()
|
oodList.sort()
|
||||||
|
|
||||||
return oodList
|
return oodList
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
+46
-25
@@ -1,16 +1,18 @@
|
|||||||
|
#encoding: utf-8
|
||||||
'''
|
'''
|
||||||
Created on Oct 22, 2014
|
Created on Oct 22, 2014
|
||||||
|
|
||||||
@author: tmahrt
|
@author: tmahrt
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
class OptionalFeatureError(ImportError):
|
class OptionalFeatureError(ImportError):
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "ERROR: You must have praatio installed to use pysle.praatTools"
|
return "ERROR: You must have praatio installed to use pysle.praatTools"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import praatio
|
from praatio import tgio
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise OptionalFeatureError()
|
raise OptionalFeatureError()
|
||||||
|
|
||||||
@@ -34,11 +36,12 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
|
|||||||
wordTier = tg.tierDict[wordTierName]
|
wordTier = tg.tierDict[wordTierName]
|
||||||
phoneTier = tg.tierDict[phoneTierName]
|
phoneTier = tg.tierDict[phoneTierName]
|
||||||
|
|
||||||
if skipLabelList == None:
|
if skipLabelList is None:
|
||||||
skipLabelList = []
|
skipLabelList = []
|
||||||
|
|
||||||
syllableEntryList = []
|
syllableEntryList = []
|
||||||
tonicEntryList = []
|
tonicSEntryList = []
|
||||||
|
tonicPEntryList = []
|
||||||
for start, stop, word in wordTier.entryList:
|
for start, stop, word in wordTier.entryList:
|
||||||
|
|
||||||
if word in skipLabelList:
|
if word in skipLabelList:
|
||||||
@@ -46,28 +49,43 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
|
|||||||
|
|
||||||
subPhoneTier = phoneTier.crop(start, stop, True, False)[0]
|
subPhoneTier = phoneTier.crop(start, stop, True, False)[0]
|
||||||
|
|
||||||
phoneList = [phone for startP, endP, phone in subPhoneTier.entryList if phone != '']
|
# entry = (start, stop, phone)
|
||||||
|
phoneList = [entry[2] for entry in subPhoneTier.entryList
|
||||||
|
if entry[2] != '']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
returnList = pronunciationtools.findBestSyllabification(isleDict,
|
returnList = pronunciationtools.findBestSyllabification(isleDict,
|
||||||
word,
|
word,
|
||||||
phoneList)
|
phoneList)
|
||||||
except isletool.WordNotInISLE:
|
except isletool.WordNotInISLE:
|
||||||
print "Word ('%s') not is isle -- skipping syllabification" % word
|
print("Word ('%s') not is isle -- skipping syllabification" % word)
|
||||||
continue
|
continue
|
||||||
except (pronunciationtools.NullPronunciationError):
|
except (pronunciationtools.NullPronunciationError):
|
||||||
print "Word ('%s') has no provided pronunciation" % word
|
print("Word ('%s') has no provided pronunciation" % word)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
stressedSyllable, syllableList, syllabification, stressIndexList = returnList
|
syllableList = returnList[1]
|
||||||
|
stressedSyllableIndexList = returnList[3]
|
||||||
|
stressedPhoneIndexList = returnList[4]
|
||||||
|
flattenedPhoneIndexList = returnList[5]
|
||||||
|
|
||||||
|
try:
|
||||||
|
stressI = stressedSyllableIndexList[0]
|
||||||
|
stressJ = stressedPhoneIndexList[0]
|
||||||
|
except IndexError:
|
||||||
|
stressI = None # Function word probably
|
||||||
|
stressJ = None #
|
||||||
|
|
||||||
|
if stressI is not None:
|
||||||
|
syllableList[stressI][stressJ] += u"ˈ"
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
# print syllableList
|
# print(syllableList)
|
||||||
for k, syllable in enumerate(syllableList):
|
for k, syllable in enumerate(syllableList):
|
||||||
|
|
||||||
# Create the syllable tier entry
|
# Create the syllable tier entry
|
||||||
j = len(syllable)
|
j = len(syllable)
|
||||||
stubEntryList = subPhoneTier.entryList[i:i+j]
|
stubEntryList = subPhoneTier.entryList[i:i + j]
|
||||||
i += j
|
i += j
|
||||||
|
|
||||||
# The whole syllable was deleted
|
# The whole syllable was deleted
|
||||||
@@ -76,29 +94,32 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
|
|||||||
|
|
||||||
syllableStart = stubEntryList[0][0]
|
syllableStart = stubEntryList[0][0]
|
||||||
syllableEnd = stubEntryList[-1][1]
|
syllableEnd = stubEntryList[-1][1]
|
||||||
label = "-".join([phone for start, end, phone in stubEntryList])
|
label = "-".join([entry[2] for entry in stubEntryList])
|
||||||
|
|
||||||
syllableEntryList.append( (syllableStart, syllableEnd, label) )
|
syllableEntryList.append((syllableStart, syllableEnd, label))
|
||||||
|
|
||||||
# Create the tonic tier entry
|
# Create the tonic syllable tier entry
|
||||||
try:
|
if k == stressI:
|
||||||
stressIndex = stressIndexList[0]
|
tonicSEntryList.append((syllableStart, syllableEnd, 'T'))
|
||||||
except IndexError:
|
|
||||||
stressIndex = None # Function word probably
|
|
||||||
|
|
||||||
tonicLabel = ''
|
# Create the tonic phone tier entry
|
||||||
if k == stressIndex:
|
if k == stressI:
|
||||||
tonicLabel = 'T'
|
syllablePhoneTier = phoneTier.crop(syllableStart, syllableEnd,
|
||||||
|
True, False)[0]
|
||||||
|
|
||||||
tonicEntryList.append( (syllableStart, syllableEnd, tonicLabel) )
|
phoneList = [entry for entry in syllablePhoneTier.entryList
|
||||||
|
if entry[2] != '']
|
||||||
|
phoneStart, phoneEnd = phoneList[stressJ][:2]
|
||||||
|
tonicPEntryList.append((phoneStart, phoneEnd, 'T'))
|
||||||
|
|
||||||
# Create a textgrid with the two syllable-level tiers
|
# Create a textgrid with the two syllable-level tiers
|
||||||
syllableTier = praatio.TextgridTier("syllable", syllableEntryList, praatio.INTERVAL_TIER)
|
syllableTier = tgio.IntervalTier("syllable", syllableEntryList)
|
||||||
tonicTier = praatio.TextgridTier('tonic', tonicEntryList, praatio.INTERVAL_TIER)
|
tonicSTier = tgio.IntervalTier('tonicSyllable', tonicSEntryList)
|
||||||
|
tonicPTier = tgio.IntervalTier('tonicVowel', tonicPEntryList)
|
||||||
|
|
||||||
syllableTG = praatio.Textgrid()
|
syllableTG = tgio.Textgrid()
|
||||||
syllableTG.addTier(syllableTier)
|
syllableTG.addTier(syllableTier)
|
||||||
syllableTG.addTier(tonicTier)
|
syllableTG.addTier(tonicSTier)
|
||||||
|
syllableTG.addTier(tonicPTier)
|
||||||
|
|
||||||
return syllableTG
|
return syllableTG
|
||||||
|
|
||||||
|
|||||||
+51
-48
@@ -1,3 +1,4 @@
|
|||||||
|
#encoding: utf-8
|
||||||
'''
|
'''
|
||||||
Created on Oct 15, 2014
|
Created on Oct 15, 2014
|
||||||
|
|
||||||
@@ -9,10 +10,10 @@ import itertools
|
|||||||
from pysle import isletool
|
from pysle import isletool
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class NullPronunciationError(Exception):
|
class NullPronunciationError(Exception):
|
||||||
|
|
||||||
def __init__(self, word):
|
def __init__(self, word):
|
||||||
|
super(NullPronunciationError, self).__init__()
|
||||||
self.word = word
|
self.word = word
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
@@ -49,7 +50,7 @@ def _lcs(xs, ys):
|
|||||||
ll_b = _lcs_lens(xb, ys)
|
ll_b = _lcs_lens(xb, ys)
|
||||||
ll_e = _lcs_lens(xe[::-1], ys[::-1])
|
ll_e = _lcs_lens(xe[::-1], ys[::-1])
|
||||||
_, k = max((ll_b[j] + ll_e[ny - j], j)
|
_, k = max((ll_b[j] + ll_e[ny - j], j)
|
||||||
for j in range(ny + 1))
|
for j in range(ny + 1))
|
||||||
yb, ye = ys[:k], ys[k:]
|
yb, ye = ys[:k], ys[k:]
|
||||||
return _lcs(xb, yb) + _lcs(xe, ye)
|
return _lcs(xb, yb) + _lcs(xe, ye)
|
||||||
|
|
||||||
@@ -58,14 +59,13 @@ def _prepPronunciation(phoneList):
|
|||||||
retList = []
|
retList = []
|
||||||
for phone in phoneList:
|
for phone in phoneList:
|
||||||
if 'r' in phone:
|
if 'r' in phone:
|
||||||
phone = ['r',]
|
phone = ['r', ]
|
||||||
try:
|
try:
|
||||||
phone = phone[0] # Only represent the str by its first letter
|
phone = phone[0] # Only represent the string by its first letter
|
||||||
|
phone = phone.lower()
|
||||||
except IndexError:
|
except IndexError:
|
||||||
raise NullPhoneError()
|
raise NullPhoneError()
|
||||||
|
|
||||||
phone = phone.lower()
|
|
||||||
|
|
||||||
if phone in isletool.vowelList:
|
if phone in isletool.vowelList:
|
||||||
phone = 'V'
|
phone = 'V'
|
||||||
retList.append(phone)
|
retList.append(phone)
|
||||||
@@ -85,14 +85,14 @@ def _adjustSyllabification(adjustedPhoneList, syllableList):
|
|||||||
retSyllableList = []
|
retSyllableList = []
|
||||||
for syllable in syllableList:
|
for syllable in syllableList:
|
||||||
j = len(syllable)
|
j = len(syllable)
|
||||||
tmpPhoneList = adjustedPhoneList[i:i+j]
|
tmpPhoneList = adjustedPhoneList[i:i + j]
|
||||||
numBlanks = -1
|
numBlanks = -1
|
||||||
phoneList = tmpPhoneList[:]
|
phoneList = tmpPhoneList[:]
|
||||||
while numBlanks != 0:
|
while numBlanks != 0:
|
||||||
|
|
||||||
numBlanks = tmpPhoneList.count("''")
|
numBlanks = tmpPhoneList.count("''")
|
||||||
if numBlanks > 0:
|
if numBlanks > 0:
|
||||||
tmpPhoneList = adjustedPhoneList[i+j:i+j+numBlanks]
|
tmpPhoneList = adjustedPhoneList[i + j:i + j + numBlanks]
|
||||||
phoneList.extend(tmpPhoneList)
|
phoneList.extend(tmpPhoneList)
|
||||||
j += numBlanks
|
j += numBlanks
|
||||||
|
|
||||||
@@ -116,27 +116,32 @@ def _findBestPronunciation(isleDict, wordText, aPron):
|
|||||||
|
|
||||||
isleWordList = isleDict.lookup(wordText)
|
isleWordList = isleDict.lookup(wordText)
|
||||||
|
|
||||||
aP = _prepPronunciation(aPron) # Mapping to simplified phone inventory
|
aP = _prepPronunciation(aPron) # Mapping to simplified phone inventory
|
||||||
|
|
||||||
origPronDict = dict((newPron,oldPron) for newPron, oldPron in zip(aP, aPron))
|
origPronDict = dict((newPron, oldPron)
|
||||||
|
for newPron, oldPron in zip(aP, aPron))
|
||||||
|
|
||||||
numDiffList = []
|
numDiffList = []
|
||||||
withStress = []
|
withStress = []
|
||||||
i = 0
|
i = 0
|
||||||
alignedSyllabificationList = []
|
alignedSyllabificationList = []
|
||||||
alignedActualPronunciationList = []
|
alignedActualPronunciationList = []
|
||||||
for syllableList, stressList in isleWordList:
|
for wordTuple in isleWordList:
|
||||||
|
syllableList = wordTuple[0] # syllableList, stressList
|
||||||
|
|
||||||
iP = [phone for phoneList in syllableList for phone in phoneList]
|
iP = [phone for phoneList in syllableList for phone in phoneList]
|
||||||
iP = _prepPronunciation(iP)
|
iP = _prepPronunciation(iP)
|
||||||
|
|
||||||
alignedIP, alignedAP = alignPronunciations(iP, aP)
|
alignedIP, alignedAP = alignPronunciations(iP, aP)
|
||||||
alignedAP = [origPronDict.get(phon, "''") for phon in alignedAP] # Remapping to actual phones
|
|
||||||
|
# Remapping to actual phones
|
||||||
|
alignedAP = [origPronDict.get(phon, "''") for phon in alignedAP]
|
||||||
alignedActualPronunciationList.append(alignedAP)
|
alignedActualPronunciationList.append(alignedAP)
|
||||||
|
|
||||||
# Adjusting the syllabification for differences between the dictionary
|
# Adjusting the syllabification for differences between the dictionary
|
||||||
# pronunciation and the actual pronunciation
|
# pronunciation and the actual pronunciation
|
||||||
alignedSyllabification = _adjustSyllabification(alignedIP, syllableList)
|
alignedSyllabification = _adjustSyllabification(alignedIP,
|
||||||
|
syllableList)
|
||||||
alignedSyllabificationList.append(alignedSyllabification)
|
alignedSyllabificationList.append(alignedSyllabification)
|
||||||
|
|
||||||
# Count the number of misalignments between the two
|
# Count the number of misalignments between the two
|
||||||
@@ -147,7 +152,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
|
|||||||
hasStress = False
|
hasStress = False
|
||||||
for syllable in syllableList:
|
for syllable in syllableList:
|
||||||
for phone in syllable:
|
for phone in syllable:
|
||||||
hasStress = "'" in phone or hasStress
|
hasStress = u"ˈ" in phone or hasStress
|
||||||
|
|
||||||
if hasStress:
|
if hasStress:
|
||||||
withStress.append(i)
|
withStress.append(i)
|
||||||
@@ -164,7 +169,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
|
|||||||
for i, numDiff in enumerate(numDiffList):
|
for i, numDiff in enumerate(numDiffList):
|
||||||
if numDiff != minDiff:
|
if numDiff != minDiff:
|
||||||
continue
|
continue
|
||||||
if bestIndex == None:
|
if bestIndex is None:
|
||||||
bestIndex = i
|
bestIndex = i
|
||||||
bestIsStressed = i in withStress
|
bestIsStressed = i in withStress
|
||||||
else:
|
else:
|
||||||
@@ -172,8 +177,8 @@ def _findBestPronunciation(isleDict, wordText, aPron):
|
|||||||
bestIndex = i
|
bestIndex = i
|
||||||
bestIsStressed = True
|
bestIsStressed = True
|
||||||
|
|
||||||
|
return (isleWordList, alignedActualPronunciationList,
|
||||||
return isleWordList, alignedActualPronunciationList, alignedSyllabificationList, bestIndex
|
alignedSyllabificationList, bestIndex)
|
||||||
|
|
||||||
|
|
||||||
def _syllabifyPhones(phoneList, syllableList, isleStressList):
|
def _syllabifyPhones(phoneList, syllableList, isleStressList):
|
||||||
@@ -193,9 +198,9 @@ def _syllabifyPhones(phoneList, syllableList, isleStressList):
|
|||||||
|
|
||||||
start = 0
|
start = 0
|
||||||
syllabifiedList = []
|
syllabifiedList = []
|
||||||
for i, end in enumerate(numPhoneList):
|
for end in numPhoneList:
|
||||||
|
|
||||||
syllable = phoneList[start:start+end]
|
syllable = phoneList[start:start + end]
|
||||||
syllabifiedList.append(syllable)
|
syllabifiedList.append(syllable)
|
||||||
|
|
||||||
start += end
|
start += end
|
||||||
@@ -212,21 +217,6 @@ def alignPronunciations(pronI, pronA):
|
|||||||
pronI = [char for char in pronI]
|
pronI = [char for char in pronI]
|
||||||
pronA = [char for char in pronA]
|
pronA = [char for char in pronA]
|
||||||
|
|
||||||
# -- allow for some flexibility in pronunciation
|
|
||||||
correctionsTuple = (('d', 't'), ('t', 'd'), ('s', 'z'), ('z', 's'),
|
|
||||||
('m', 'n'), ('n', 'm'),)
|
|
||||||
|
|
||||||
doMatch = lambda i, a: ((i == a) or
|
|
||||||
((i, a) in correctionsTuple))
|
|
||||||
|
|
||||||
def matchExists(targetPhone, pron):
|
|
||||||
match = False
|
|
||||||
for phone in pron:
|
|
||||||
match = match or doMatch(targetPhone, phone)
|
|
||||||
return match
|
|
||||||
|
|
||||||
# Remove vowels
|
|
||||||
|
|
||||||
# Remove any elements not in the other list (but maintain order)
|
# Remove any elements not in the other list (but maintain order)
|
||||||
pronITmp = pronI
|
pronITmp = pronI
|
||||||
pronATmp = pronA
|
pronATmp = pronA
|
||||||
@@ -254,17 +244,19 @@ def alignPronunciations(pronI, pronA):
|
|||||||
|
|
||||||
# Fill in any blanks such that the sequential items have the same
|
# Fill in any blanks such that the sequential items have the same
|
||||||
# index and the two strings are the same length
|
# index and the two strings are the same length
|
||||||
for x in xrange(len(sequenceIndexListA)):
|
for x in range(len(sequenceIndexListA)):
|
||||||
indexA = sequenceIndexListA[x]
|
indexA = sequenceIndexListA[x]
|
||||||
indexI = sequenceIndexListI[x]
|
indexI = sequenceIndexListI[x]
|
||||||
if indexA < indexI :
|
if indexA < indexI:
|
||||||
for x in xrange(indexI - indexA):
|
for x in range(indexI - indexA):
|
||||||
pronA.insert(indexA, "''")
|
pronA.insert(indexA, "''")
|
||||||
sequenceIndexListA = [val + indexI - indexA for val in sequenceIndexListA]
|
sequenceIndexListA = [val + indexI - indexA
|
||||||
|
for val in sequenceIndexListA]
|
||||||
elif indexA > indexI:
|
elif indexA > indexI:
|
||||||
for x in xrange(indexA - indexI):
|
for x in range(indexA - indexI):
|
||||||
pronI.insert(indexI, "''")
|
pronI.insert(indexI, "''")
|
||||||
sequenceIndexListI = [val + indexA - indexI for val in sequenceIndexListI]
|
sequenceIndexListI = [val + indexA - indexI
|
||||||
|
for val in sequenceIndexListI]
|
||||||
|
|
||||||
return pronI, pronA
|
return pronI, pronA
|
||||||
|
|
||||||
@@ -277,19 +269,32 @@ def findBestSyllabification(isleDict, wordText, actualPronunciationList):
|
|||||||
the syllabification for that pronunciation and map it onto the
|
the syllabification for that pronunciation and map it onto the
|
||||||
input pronunciation.
|
input pronunciation.
|
||||||
'''
|
'''
|
||||||
retList = _findBestPronunciation(isleDict, wordText, actualPronunciationList)
|
retList = _findBestPronunciation(isleDict, wordText,
|
||||||
|
actualPronunciationList)
|
||||||
isleWordList, alignedAPronList, alignedSyllableList, bestIndex = retList
|
isleWordList, alignedAPronList, alignedSyllableList, bestIndex = retList
|
||||||
|
|
||||||
alignedPhoneList = alignedAPronList[bestIndex]
|
alignedPhoneList = alignedAPronList[bestIndex]
|
||||||
alignedSyllables = alignedSyllableList[bestIndex]
|
alignedSyllables = alignedSyllableList[bestIndex]
|
||||||
syllabification = isleWordList[bestIndex][0]
|
syllabification = isleWordList[bestIndex][0]
|
||||||
stressedIndex = isleWordList[bestIndex][1]
|
stressedSyllableIndexList = isleWordList[bestIndex][1]
|
||||||
|
stressedPhoneIndexList = isleWordList[bestIndex][2]
|
||||||
|
|
||||||
stressedSyllable, syllableList = _syllabifyPhones(alignedPhoneList,
|
stressedSyllable, syllableList = _syllabifyPhones(alignedPhoneList,
|
||||||
alignedSyllables,
|
alignedSyllables,
|
||||||
stressedIndex)
|
stressedSyllableIndexList)
|
||||||
|
|
||||||
return stressedSyllable, syllableList, syllabification, stressedIndex
|
# Count the index of the stressed phones, if the stress list has
|
||||||
|
# become flattened (no syllable information)
|
||||||
|
flattenedStressIndexList = []
|
||||||
|
for i, j in zip(stressedSyllableIndexList, stressedPhoneIndexList):
|
||||||
|
k = j
|
||||||
|
for l in range(i):
|
||||||
|
k += len(syllableList[l])
|
||||||
|
flattenedStressIndexList.append(k)
|
||||||
|
|
||||||
|
return (stressedSyllable, syllableList, syllabification,
|
||||||
|
stressedSyllableIndexList, stressedPhoneIndexList,
|
||||||
|
flattenedStressIndexList)
|
||||||
|
|
||||||
|
|
||||||
def findClosestPronunciation(isleDict, wordText, aPron):
|
def findClosestPronunciation(isleDict, wordText, aPron):
|
||||||
@@ -298,9 +303,7 @@ def findClosestPronunciation(isleDict, wordText, aPron):
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
retList = _findBestPronunciation(isleDict, wordText, aPron)
|
retList = _findBestPronunciation(isleDict, wordText, aPron)
|
||||||
isleWordList, actualPronunciationList, bestIndex = retList
|
isleWordList = retList[0]
|
||||||
|
bestIndex = retList[3]
|
||||||
|
|
||||||
return isleWordList[bestIndex]
|
return isleWordList[bestIndex]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,19 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# encoding: utf-8
|
||||||
'''
|
'''
|
||||||
Created on Oct 15, 2014
|
Created on Oct 15, 2014
|
||||||
|
|
||||||
@author: tmahrt
|
@author: tmahrt
|
||||||
'''
|
'''
|
||||||
|
import codecs
|
||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
setup(name='pysle',
|
setup(name='pysle',
|
||||||
version='1.0.0',
|
version='1.4.0',
|
||||||
author='Tim Mahrt',
|
author='Tim Mahrt',
|
||||||
author_email='timmahrt@gmail.com',
|
author_email='timmahrt@gmail.com',
|
||||||
package_dir={'pysle':'pysle'},
|
package_dir={'pysle':'pysle'},
|
||||||
packages=['pysle'],
|
packages=['pysle'],
|
||||||
license='LICENSE',
|
license='LICENSE',
|
||||||
long_description=open('README.rst', 'r').read(),
|
long_description=codecs.open('README.rst', 'r', encoding="utf-8").read(),
|
||||||
# install_requires=[], # No requirements! # requires 'from setuptools import setup'
|
# install_requires=[], # No requirements! # requires 'from setuptools import setup'
|
||||||
)
|
)
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
#encoding: utf-8
|
||||||
'''
|
'''
|
||||||
Created on Oct 22, 2014
|
Created on Oct 22, 2014
|
||||||
|
|
||||||
@@ -12,21 +13,23 @@ from pysle import pronunciationtools
|
|||||||
# In this first example we look up the syllabification of a word and get it's
|
# In this first example we look up the syllabification of a word and get it's
|
||||||
# stress information.
|
# stress information.
|
||||||
|
|
||||||
searchWord = 'pumpkins'
|
searchWord = 'catatonic'
|
||||||
isleDict = isletool.LexicalTool('islev2.txt')
|
isleDict = isletool.LexicalTool('ISLEdict.txt')
|
||||||
lookupResults = isleDict.lookup(searchWord)
|
lookupResults = isleDict.lookup(searchWord)
|
||||||
|
|
||||||
firstEntry = lookupResults[0]
|
firstEntry = lookupResults[0]
|
||||||
firstSyllableList = firstEntry[0]
|
firstSyllableList = firstEntry[0]
|
||||||
|
firstSyllableList = ".".join([u" ".join(syllable) for syllable in firstSyllableList])
|
||||||
firstStressList = firstEntry[1]
|
firstStressList = firstEntry[1]
|
||||||
|
|
||||||
print searchWord
|
print(searchWord)
|
||||||
print firstSyllableList, firstStressList # 3rd syllable carries stress
|
print(firstSyllableList)
|
||||||
|
print(firstStressList) # 3rd syllable carries stress
|
||||||
|
|
||||||
|
|
||||||
# Here we determine the syllabification of a word, as it was said.
|
# Here we determine the syllabification of a word, as it was said.
|
||||||
# (Of course, this is just a guess)
|
# (Of course, this is just a guess)
|
||||||
print '-'*50
|
print('-'*50)
|
||||||
|
|
||||||
searchWord = 'another'
|
searchWord = 'another'
|
||||||
anotherPhoneList = ['n', '@', 'th', 'r']
|
anotherPhoneList = ['n', '@', 'th', 'r']
|
||||||
@@ -35,10 +38,14 @@ returnList = pronunciationtools.findBestSyllabification(isleDict,
|
|||||||
searchWord,
|
searchWord,
|
||||||
anotherPhoneList)
|
anotherPhoneList)
|
||||||
|
|
||||||
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
|
(stressedSyllable, syllableList, syllabification,
|
||||||
|
stressedSyllableIndexList, stressedPhoneIndexList,
|
||||||
print searchWord
|
flattenedStressIndexList) = returnList
|
||||||
print anotherPhoneList
|
print(searchWord)
|
||||||
print syllableList # We can see the first syllable was elided
|
print(anotherPhoneList)
|
||||||
|
print(stressedSyllableIndexList) # We can see the first syllable was elided
|
||||||
|
print(stressedPhoneIndexList)
|
||||||
|
print(flattenedStressIndexList)
|
||||||
|
print(syllableList)
|
||||||
|
print(syllabification)
|
||||||
|
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
#encoding: utf-8
|
||||||
|
'''
|
||||||
|
Created on July 08, 2016
|
||||||
|
|
||||||
|
@author: tmahrt
|
||||||
|
|
||||||
|
Basic examples of common usage.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import random
|
||||||
|
|
||||||
|
from pysle import isletool
|
||||||
|
|
||||||
|
tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\ISLEdict.txt"
|
||||||
|
isleDict = isletool.LexicalTool(tmpPath)
|
||||||
|
|
||||||
|
def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
|
||||||
|
wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
|
||||||
|
multiword='ok', numMatches=None, matchList=None):
|
||||||
|
|
||||||
|
if matchList is None:
|
||||||
|
matchList = isleDict.search(matchStr, numSyllables, wordInitial,
|
||||||
|
wordFinal, spanSyllable, stressedSyllable,
|
||||||
|
multiword)
|
||||||
|
else:
|
||||||
|
matchList = isletool.search(matchList, matchStr, numSyllables, wordInitial,
|
||||||
|
wordFinal, spanSyllable, stressedSyllable,
|
||||||
|
multiword)
|
||||||
|
|
||||||
|
if numMatches is not None and len(matchList) > numMatches:
|
||||||
|
random.shuffle(matchList)
|
||||||
|
|
||||||
|
for i, matchTuple in enumerate(matchList):
|
||||||
|
if numMatches is not None and i > numMatches:
|
||||||
|
break
|
||||||
|
word, pronList = matchTuple
|
||||||
|
print("%s: %s" % (word, ",".join(pronList)))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
return matchList
|
||||||
|
|
||||||
|
# 2-syllable words with a stressed syllable containing 'dV' but not word initially
|
||||||
|
printOutMatches("dV", stressedSyllable="only", spanSyllable="no",
|
||||||
|
wordInitial="no", numSyllables=2, numMatches=10)
|
||||||
|
|
||||||
|
# 3-syllable word with an 'ld' sequence that spans a syllable boundary
|
||||||
|
printOutMatches("lBd", wordInitial="no", multiword='no',
|
||||||
|
numSyllables=3, numMatches=10)
|
||||||
|
|
||||||
|
# words ending in 'inth'
|
||||||
|
matchList = printOutMatches(u"ɪnɵ", wordFinal="only", numMatches=10)
|
||||||
|
|
||||||
|
# that also start with 's'
|
||||||
|
matchList = printOutMatches("s", wordInitial="only", numMatches=10,
|
||||||
|
matchList=matchList, multiword="no")
|
||||||
@@ -12,21 +12,25 @@ This snippet shows you how to use this function.
|
|||||||
|
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
|
||||||
import praatio
|
from praatio import tgio
|
||||||
from pysle import isletool
|
from pysle import isletool
|
||||||
from pysle import praattools
|
from pysle import praattools
|
||||||
|
|
||||||
path = join('.', 'files')
|
path = join('.', 'files')
|
||||||
path = "/Users/tmahrt/Dropbox/workspace/pysle/test/files"
|
path = "/Users/tmahrt/Dropbox/workspace/pysle/test/files"
|
||||||
|
|
||||||
tg = praatio.openTextGrid(join(path, "pumpkins.TextGrid"))
|
tg = tgio.openTextGrid(join(path, "pumpkins.TextGrid"))
|
||||||
isleDict = isletool.LexicalTool('/Users/tmahrt/Dropbox/workspace/pysle/test/islev2.txt') # Needs the full path to the file
|
|
||||||
|
# Needs the full path to the file
|
||||||
|
islevPath = '/Users/tmahrt/Dropbox/workspace/pysle/test/islev2.txt'
|
||||||
|
isleDict = isletool.LexicalTool(islevPath)
|
||||||
|
|
||||||
# Get the syllabification tiers and add it to the textgrid
|
# Get the syllabification tiers and add it to the textgrid
|
||||||
syllableTG = praattools.syllabifyTextgrid(isleDict, tg, "word", "phone",
|
syllableTG = praattools.syllabifyTextgrid(isleDict, tg, "word", "phone",
|
||||||
skipLabelList=["",])
|
skipLabelList=["",])
|
||||||
tg.addTier(syllableTG.tierDict["syllable"])
|
tg.addTier(syllableTG.tierDict["syllable"])
|
||||||
tg.addTier(syllableTG.tierDict["tonic"])
|
tg.addTier(syllableTG.tierDict["tonicSyllable"])
|
||||||
|
tg.addTier(syllableTG.tierDict["tonicVowel"])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user