FEATURE: Updated to new isledict format. Now using unicode IPA

It made the code a little more complex and now the system
is less typing friendly but is more intuitive (no more guessing
how to pronounce a character).

Update includes changes to documentation.
This commit is contained in:
Tim Mahrt
2016-07-16 00:49:45 +02:00
parent 4cc4bf85ec
commit d88ff7d8d9
6 changed files with 152 additions and 72 deletions
+12 -7
View File
@@ -31,7 +31,7 @@ What can you do with this library?
- map an actual pronunciation to a dictionary pronunciation (can be used - map an actual pronunciation to a dictionary pronunciation (can be used
to automatically find speech errors):: to automatically find speech errors)::
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',]) pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['k', 'æ',])
- automatically syllabify a praat textgrid containing words and phones - automatically syllabify a praat textgrid containing words and phones
(e.g. force-aligned text) -- requires my (e.g. force-aligned text) -- requires my
@@ -53,6 +53,9 @@ Ver 1.4 (July 9, 2016)
- added search functionality - added search functionality
- ported code to use the new unicode IPA-based isledict
(the old one was ascii)
Ver 1.3 (March 15, 2016) Ver 1.3 (March 15, 2016)
- added indices for stressed vowels - added indices for stressed vowels
@@ -74,12 +77,14 @@ Requirements
================ ================
- Before you use this library (before or after installing it) you will need - Before you use this library (before or after installing it) you will need
to download the ISLEX dictionary. It can be downloaded here: to download the ISLEX dictionary. It can be downloaded here under the
section 'English' linked under the text 'English Pronlex'
(with a file name of ISLEdict.txt):
`ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_ `ISLEX project page <http://isle.illinois.edu/sst/data/g2ps/>`_
`Direct link to the ISLEX file used in this project `Direct link to the ISLEX file used in this project
<http://www.isle.illinois.edu/sst/data/dict/islex/islev2.txt>`_ (islev2.txt) <http://isle.illinois.edu/sst/data/g2ps/English/ISLEdict.txt>`_ (ISLEdict.txt)
- ``Python 2.7.*`` or above - ``Python 2.7.*`` or above
@@ -113,7 +118,7 @@ Here is a typical common usage::
from pysle import isle from pysle import isle
isleDict = isle.LexicalTool('C:\islev2.dict') isleDict = isle.LexicalTool('C:\islev2.dict')
print isleDict.lookup('catatonic')[0] # Get the first pronunciation print isleDict.lookup('catatonic')[0] # Get the first pronunciation
>> [['kh', '@,'], ['t_(', '&'], ['th', "A'"], ['n', 'I', 'kh']] [2] >> [['k', 'ˌæ'], ['t˺', 'ə'], ['t', 'ˈɑ'], ['n', 'ɪ', 'k']] [2, 0]
and another:: and another::
@@ -121,7 +126,7 @@ and another::
from pysle import pronunciationTools from pysle import pronunciationTools
searchWord = 'another' searchWord = 'another'
anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced (ASCII or IPA ok here)
returnList = pronunciationTools.findBestSyllabification(isleDict, returnList = pronunciationTools.findBestSyllabification(isleDict,
searchWord, searchWord,
@@ -138,7 +143,7 @@ Citing pysle
Pysle is general purpose coding and doesn't need to be cited Pysle is general purpose coding and doesn't need to be cited
(you should cite the (you should cite the
`ISLEX project <http://www.isle.illinois.edu/sst/data/dict/islex/index.shtml>`_ `ISLEX project <http://isle.illinois.edu/sst/data/g2ps/>`_
instead) but if you would like to, it can be cited like so: instead) but if you would like to, it can be cited like so:
Tim Mahrt. Pysle. https://github.com/timmahrt/pysle, 2016. Tim Mahrt. Pysle. https://github.com/timmahrt/pysle, 2016.
+122 -55
View File
@@ -1,3 +1,4 @@
#encoding: utf-8
''' '''
Created on Oct 11, 2012 Created on Oct 11, 2012
@@ -7,14 +8,18 @@ Created on Oct 11, 2012
import io import io
import re import re
charList = ['#', '&', '&r', '3r', '9r', '>', '>i', '@', 'A', 'D', 'E',
'I', 'N', 'S', 'T', 'U', 'Z', '^', 'a', 'aI', 'aU', 'b',
'd', 'dZ', 'd_(', 'e', 'ei', 'f', 'g', 'h', 'i', 'i:',
'j', 'k', 'kh', 'l', 'l=', 'm', 'n', 'n=', 'oU', 'p',
'ph', 'r', 's', 'sh', 't', 'tS', 't_(', 'th', 'u',
'v', 'w', 'y', 'z']
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ] charList = [u'#', u'.', u'', u'b', u'd', u'', u'ei', u'f', u'g',
u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'', u'p',
u'r', u's', u't', u'', u'u', u'v', u'w', u'z', u'æ',
u'ð', u'ŋ', u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
u'ɪ', u'ɵ', u'ɹ', u'ʃ', u'ʊ', u'ʒ', u'æ', u'ʌ', ]
diacriticList = [u'˺', u'ˌ', u'̩', u'̃', ]
vowelList = [u'', u'ei', u'i', u'', u'u', u'æ',
u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
u'ɪ', u'ʊ', u'ʌ', ]
def isVowel(char): def isVowel(char):
@@ -88,6 +93,108 @@ class LexicalTool():
multiword=multiword) multiword=multiword)
def _prepRESearchStr(matchStr, wordInitial='ok', wordFinal='ok',
                     spanSyllable='ok', stressedSyllable='ok'):
    '''
    Prepares a user's RE string for a search

    The user's string is expanded into a full regular expression in
    five steps:
    1. two-character sounds (ei, tʃ, oʊ, dʒ, aʊ, ɑɪ, ɔi) are collapsed
       into one-character placeholders (the digits 1-7) so that every
       remaining character stands for one sound.  As a consequence the
       digits 1-7 cannot be used literally in matchStr.
    2. optional syllable-boundary and/or stress markers are interleaved
       between all characters
    3. word/syllable boundary conditions are added to both ends
    4. lookarounds are added so that a one-character sound does not match
       inside a two-character sound (searching for 'e' must not hit 'ei')
    5. placeholders are reverted and the special uppercase classes
       (D, F, S, N, R, V, B) are expanded

    matchStr - the user's search string
    wordInitial - 'only' anchors the match to the start of a word; 'no'
                  requires preceding material in the word or a preceding
                  syllable boundary; any other value matches anywhere
                  within a syllable
    wordFinal - same as wordInitial but for the end of the word
    spanSyllable - 'ok' or 'only' allow a syllable boundary between
                   any two characters of matchStr
    stressedSyllable - 'ok' or 'only' allow a primary stress marker
                       between any two characters of matchStr

    Returns the regular expression as a unicode string (not compiled)
    '''

    # Protect sounds that are two characters
    # After this we can assume that each character represents a sound
    # (We'll revert back when we're done processing the RE)
    replList = ((u'ei', u'1'), (u'tʃ', u'2'), (u'oʊ', u'3'),
                (u'dʒ', u'4'), (u'aʊ', u'5'), (u'ɑɪ', u'6'),
                (u'ɔi', u'7'))
    for sound, placeholder in replList:
        matchStr = matchStr.replace(sound, placeholder)

    # Characters to check between all other characters
    interleaveStr = None
    stressOpt = stressedSyllable in ('ok', 'only')
    spanOpt = spanSyllable in ('ok', 'only')
    if stressOpt and spanOpt:
        interleaveStr = u"\\.?ˈ?"
    elif stressOpt:
        interleaveStr = u"ˈ?"
    elif spanOpt:
        interleaveStr = u"\\.?"

    if interleaveStr is not None:
        matchStr = interleaveStr.join(matchStr)

    # Setting search boundaries
    # We search on '[^\.#]' and not '.' so that the search doesn't span
    # multiple syllables or words
    if wordInitial == 'only':
        matchStr = u'#' + matchStr
    elif wordInitial == 'no':
        # Match the closest preceding syllable.  If there is none, look
        # for word boundary plus at least one other character
        matchStr = u'(?:\\.[^\\.#]*?|#[^\\.#]+?)' + matchStr
    else:
        matchStr = u'[#\\.][^\\.#]*?' + matchStr

    if wordFinal == 'only':
        matchStr = matchStr + u'#'
    elif wordFinal == 'no':
        matchStr = matchStr + u"(?:[^\\.#]*?\\.|[^\\.#]+?#)"
    else:
        matchStr = matchStr + u'[^\\.#]*?[#\\.]'

    # For sounds that are designated two characters, prevent
    # detecting those sounds if the user wanted a sound
    # designated by one of the contained characters
    pairList = [(u'e', u'i'), (u't', u'ʃ'), (u'o', u'ʊ'),
                (u'd', u'ʒ'), (u'a', u'ʊ'), (u'ɑ', u'ɪ'),
                (u'ɔ', u'i'), ]

    # The lookarounds are added in a single pass over the string so that
    # the text of a lookaround that was just inserted is never itself
    # scanned (rescanning would wrap a lookbehind inside a lookahead,
    # e.g. 'e(?!(?<!e)i)', which makes the lookahead always succeed)
    pieceList = []
    lastI = len(matchStr) - 1
    for i, char in enumerate(matchStr):
        prevChar = matchStr[i - 1] if i > 0 else u''
        nextChar = matchStr[i + 1] if i < lastI else u''

        # Backward check ('b' and not 'ab')
        for charA, charB in pairList:
            if char == charB and prevChar != charA:
                pieceList.append(u'(?<!%s)' % charA)

        pieceList.append(char)

        # Forward check ('a' and not 'ab')
        for charA, charB in pairList:
            if char == charA and nextChar != charB:
                pieceList.append(u'(?!%s)' % charB)
    matchStr = u''.join(pieceList)

    # Revert the special sounds back from 1 character to 2 characters
    for sound, placeholder in replList:
        matchStr = matchStr.replace(placeholder, sound)

    # Replace special characters
    # (done last so the expansions are not touched by the steps above;
    # no expansion contains another key, so the order is irrelevant)
    replDict = {"D": u"[tdsz]",  # dentals
                "F": u"[ʃʒfvszɵðh]",  # fricatives
                "S": u"[pbtdkg]",  # stops
                "N": u"[nmŋ]",  # nasals
                "R": u"[rɝɚ]",  # rhotics
                "V": u"(?:aʊ|ei|oʊ|ɑɪ|ɔi|[iuæɑɔəɛɪʊʌ]):?",  # vowels
                "B": u"\\.",  # syllable boundary
                }

    for char, replStr in replDict.items():
        matchStr = matchStr.replace(char, replStr)

    return matchStr
def search(searchList, matchStr, numSyllables=None, wordInitial='ok', def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
wordFinal='ok', spanSyllable='ok', stressedSyllable='ok', wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
multiword='ok'): multiword='ok'):
@@ -107,51 +214,11 @@ def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
Regular expression syntax applies, so if you wanted to search for any Regular expression syntax applies, so if you wanted to search for any
word ending with a vowel or rhotic, matchStr = '(?:VR)#' word ending with a vowel or rhotic, matchStr = '(?:VR)#'
''' '''
# Characters to check between all other characters
# Don't check between all other characters if the character is already
# in the search string or
interleaveStr = None
stressOpt = (stressedSyllable == 'ok' or stressedSyllable == 'only')
spanOpt = (spanSyllable == 'ok' or spanSyllable == 'only')
if stressOpt and spanOpt:
interleaveStr = "\.?'?"
elif stressOpt:
interleaveStr = "'?"
elif spanOpt:
interleaveStr = "\.?"
if interleaveStr is not None:
matchStr = interleaveStr.join(matchStr)
# Setting search boundaries
# We search on '[^\.#]' and not '.' so that the search doesn't span
# multiple syllables or words
if wordInitial == 'only':
matchStr = '#' + matchStr
elif wordInitial == 'no':
# Match the closest preceeding syllable. If there is none, look
# for word boundary plus at least one other character
matchStr = '(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
else:
matchStr = '[#\.][^\.#]*?' + matchStr
if wordFinal == 'only':
matchStr = matchStr + '#'
elif wordFinal == 'no':
matchStr = matchStr + "(?:[^\.#]*?\.|[^\.#]+?#)"
else:
matchStr = matchStr + '[^\.#]*?[#\.]'
# Replace special characters
replDict = {"V": "(?:aI|aU|ei|oU|[AEIaeiu]):?",
"R": "[&39]?r",
"B": "\."}
for char, replStr in replDict.items():
matchStr = matchStr.replace(char, replStr)
# Run search for words # Run search for words
matchStr = _prepRESearchStr(matchStr, wordInitial, wordFinal,
spanSyllable, stressedSyllable)
compiledRE = re.compile(matchStr) compiledRE = re.compile(matchStr)
retList = [] retList = []
for word, pronList in searchList: for word, pronList in searchList:
@@ -173,10 +240,10 @@ def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
matchList = compiledRE.findall(searchPron) matchList = compiledRE.findall(searchPron)
if len(matchList) > 0: if len(matchList) > 0:
if stressedSyllable == 'only': if stressedSyllable == 'only':
if all(["'" not in match for match in matchList]): if all([u"ˈ" not in match for match in matchList]):
continue continue
if stressedSyllable == 'no': if stressedSyllable == 'no':
if all(["'" in match for match in matchList]): if all([u"ˈ" in match for match in matchList]):
continue continue
# For syllable spanning, we check if there is a syllable # For syllable spanning, we check if there is a syllable
@@ -211,11 +278,11 @@ def _parsePronunciation(pronunciationStr):
stressedPhoneList = [] stressedPhoneList = []
for i, syllable in enumerate(syllableList): for i, syllable in enumerate(syllableList):
for j, phone in enumerate(syllable): for j, phone in enumerate(syllable):
if "'" in phone: if u"ˈ" in phone:
stressedSyllableList.insert(0, i) stressedSyllableList.insert(0, i)
stressedPhoneList.insert(0, j) stressedPhoneList.insert(0, j)
break break
elif '"' in phone: elif u'ˌ' in phone:
stressedSyllableList.insert(i) stressedSyllableList.insert(i)
stressedPhoneList.insert(j) stressedPhoneList.insert(j)
+2 -1
View File
@@ -1,3 +1,4 @@
#encoding: utf-8
''' '''
Created on Oct 22, 2014 Created on Oct 22, 2014
@@ -76,7 +77,7 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
stressJ = None # stressJ = None #
if stressI is not None: if stressI is not None:
syllableList[stressI][stressJ] += "'" syllableList[stressI][stressJ] += u"ˈ"
i = 0 i = 0
# print(syllableList) # print(syllableList)
+2 -1
View File
@@ -1,3 +1,4 @@
#encoding: utf-8
''' '''
Created on Oct 15, 2014 Created on Oct 15, 2014
@@ -151,7 +152,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
hasStress = False hasStress = False
for syllable in syllableList: for syllable in syllableList:
for phone in syllable: for phone in syllable:
hasStress = "'" in phone or hasStress hasStress = u"ˈ" in phone or hasStress
if hasStress: if hasStress:
withStress.append(i) withStress.append(i)
+8 -3
View File
@@ -1,3 +1,4 @@
#encoding: utf-8
''' '''
Created on Oct 22, 2014 Created on Oct 22, 2014
@@ -12,16 +13,18 @@ from pysle import pronunciationtools
# In this first example we look up the syllabification of a word and get it's # In this first example we look up the syllabification of a word and get it's
# stress information. # stress information.
searchWord = 'pumpkins' searchWord = 'catatonic'
isleDict = isletool.LexicalTool('islev2.txt') isleDict = isletool.LexicalTool('ISLEdict.txt')
lookupResults = isleDict.lookup(searchWord) lookupResults = isleDict.lookup(searchWord)
firstEntry = lookupResults[0] firstEntry = lookupResults[0]
firstSyllableList = firstEntry[0] firstSyllableList = firstEntry[0]
firstSyllableList = ".".join([u" ".join(syllable) for syllable in firstSyllableList])
firstStressList = firstEntry[1] firstStressList = firstEntry[1]
print(searchWord) print(searchWord)
print(firstSyllableList, firstStressList) # 3rd syllable carries stress print(firstSyllableList)
print(firstStressList) # 3rd syllable carries stress
# Here we determine the syllabification of a word, as it was said. # Here we determine the syllabification of a word, as it was said.
@@ -43,4 +46,6 @@ print(anotherPhoneList)
print(stressedSyllableIndexList) # We can see the first syllable was elided print(stressedSyllableIndexList) # We can see the first syllable was elided
print(stressedPhoneIndexList) print(stressedPhoneIndexList)
print(flattenedStressIndexList) print(flattenedStressIndexList)
print(syllableList)
print(syllabification)
+4 -3
View File
@@ -1,3 +1,4 @@
#encoding: utf-8
''' '''
Created on July 08, 2016 Created on July 08, 2016
@@ -10,7 +11,7 @@ import random
from pysle import isletool from pysle import isletool
tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt" tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\ISLEdict.txt"
isleDict = isletool.LexicalTool(tmpPath) isleDict = isletool.LexicalTool(tmpPath)
def printOutMatches(matchStr, numSyllables=None, wordInitial='ok', def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
@@ -33,7 +34,7 @@ def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
if numMatches is not None and i > numMatches: if numMatches is not None and i > numMatches:
break break
word, pronList = matchTuple word, pronList = matchTuple
print("%s: %s" % (word, repr(pronList))) print("%s: %s" % (word, ",".join(pronList)))
print("") print("")
return matchList return matchList
@@ -47,7 +48,7 @@ printOutMatches("lBd", wordInitial="no", multiword='no',
numSyllables=3, numMatches=10) numSyllables=3, numMatches=10)
# words ending in 'inth' # words ending in 'inth'
matchList = printOutMatches("InT", wordFinal="only", numMatches=10) matchList = printOutMatches(u"ɪ", wordFinal="only", numMatches=10)
# that also start with 's' # that also start with 's'
matchList = printOutMatches("s", wordInitial="only", numMatches=10, matchList = printOutMatches("s", wordInitial="only", numMatches=10,