mirror of
https://github.com/wassname/pysle.git
synced 2026-06-27 16:10:05 +08:00
FEATURE: Updated to new isledict format. Now using unicode IPA
It made the code a little more complex and now the system is less typing friendly but is more intuitive (no more guessing how to pronounce a character). Update includes changes to documentation.
This commit is contained in:
+12
-7
@@ -31,7 +31,7 @@ What can you do with this library?
|
||||
- map an actual pronunciation to a dictionary pronunciation (can be used
|
||||
to automatically find speech errors)::
|
||||
|
||||
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',])
|
||||
pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['k', 'æ',])
|
||||
|
||||
- automatically syllabify a praat textgrid containing words and phones
|
||||
(e.g. force-aligned text) -- requires my
|
||||
@@ -53,6 +53,9 @@ Ver 1.4 (July 9, 2016)
|
||||
|
||||
- added search functionality
|
||||
|
||||
- ported code to use the new unicode IPA-based isledict
|
||||
(the old one was ascii)
|
||||
|
||||
Ver 1.3 (March 15, 2016)
|
||||
|
||||
- added indices for stressed vowels
|
||||
@@ -74,12 +77,14 @@ Requirements
|
||||
================
|
||||
|
||||
- Before you use this library (before or after installing it) you will need
|
||||
to download the ILSEX dictionary. It can be downloaded here:
|
||||
to download the ISLEX dictionary. It can be downloaded here under the
|
||||
section 'English' linked under the text 'English Pronlex'
|
||||
(with a file name of ISLEdict.txt):
|
||||
|
||||
`ISLEX project page <http://www.isle.illinois.edu/sst/data/dict/>`_
|
||||
`ISLEX project page <http://isle.illinois.edu/sst/data/g2ps/>`_
|
||||
|
||||
`Direct link to the ISLEX file used in this project
|
||||
<http://www.isle.illinois.edu/sst/data/dict/islex/islev2.txt>`_ (islev2.txt)
|
||||
<http://isle.illinois.edu/sst/data/g2ps/English/ISLEdict.txt>`_ (ISLEdict.txt)
|
||||
|
||||
- ``Python 2.7.*`` or above
|
||||
|
||||
@@ -113,7 +118,7 @@ Here is a typical common usage::
|
||||
from pysle import isle
|
||||
isleDict = isle.LexicalTool('C:\islev2.dict')
|
||||
print isleDict.lookup('catatonic')[0] # Get the first pronunciation
|
||||
>> [['kh', '@,'], ['t_(', '&'], ['th', "A'"], ['n', 'I', 'kh']] [2]
|
||||
>> [['k', 'ˌæ'], ['t˺', 'ə'], ['t', 'ˈɑ'], ['n', 'ɪ', 'k']] [2, 0]
|
||||
|
||||
and another::
|
||||
|
||||
@@ -121,7 +126,7 @@ and another::
|
||||
from pysle import pronunciationTools
|
||||
|
||||
searchWord = 'another'
|
||||
anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced
|
||||
anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced (ASCII or IPA ok here)
|
||||
|
||||
returnList = pronunciationTools.findBestSyllabification(isleDict,
|
||||
searchWord,
|
||||
@@ -138,7 +143,7 @@ Citing pysle
|
||||
|
||||
Pysle is general-purpose code and doesn't need to be cited
|
||||
(you should cite the
|
||||
`ISLEX project <http://www.isle.illinois.edu/sst/data/dict/islex/index.shtml>`_
|
||||
`ISLEX project <http://isle.illinois.edu/sst/data/g2ps/>`_
|
||||
instead) but if you would like to, it can be cited like so:
|
||||
|
||||
Tim Mahrt. Pysle. https://github.com/timmahrt/pysle, 2016.
|
||||
|
||||
+122
-55
@@ -1,3 +1,4 @@
|
||||
#encoding: utf-8
|
||||
'''
|
||||
Created on Oct 11, 2012
|
||||
|
||||
@@ -7,14 +8,18 @@ Created on Oct 11, 2012
|
||||
import io
|
||||
import re
|
||||
|
||||
charList = ['#', '&', '&r', '3r', '9r', '>', '>i', '@', 'A', 'D', 'E',
|
||||
'I', 'N', 'S', 'T', 'U', 'Z', '^', 'a', 'aI', 'aU', 'b',
|
||||
'd', 'dZ', 'd_(', 'e', 'ei', 'f', 'g', 'h', 'i', 'i:',
|
||||
'j', 'k', 'kh', 'l', 'l=', 'm', 'n', 'n=', 'oU', 'p',
|
||||
'ph', 'r', 's', 'sh', 't', 'tS', 't_(', 'th', 'u',
|
||||
'v', 'w', 'y', 'z']
|
||||
|
||||
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ]
|
||||
# Symbols used in the unicode IPA-based pronunciation strings.
# Two-character entries (e.g. u'tʃ', u'ei') are single sounds written with
# two characters.  u'#' and u'.' are presumably the word and syllable
# boundary markers (the search code treats them that way) -- TODO confirm.
# NOTE: u'æ' used to be listed twice; each symbol now appears exactly once.
charList = [u'#', u'.', u'aʊ', u'b', u'd', u'dʒ', u'ei', u'f', u'g',
            u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'oʊ', u'p',
            u'r', u's', u't', u'tʃ', u'u', u'v', u'w', u'z', u'æ',
            u'ð', u'ŋ', u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
            u'ɪ', u'ɵ', u'ɹ', u'ʃ', u'ʊ', u'ʒ', u'ʌ', ]

# Marks that modify a neighbouring symbol rather than standing for a sound
# on their own (assumption based on the name -- verify against the callers).
diacriticList = [u'˺', u'ˌ', u'̩', u'̃', ]

# The subset of charList that are vowels (monophthongs, diphthongs and the
# rhotic vowels u'ɚ' / u'ɝ').
vowelList = [u'aʊ', u'ei', u'i', u'oʊ', u'u', u'æ',
             u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
             u'ɪ', u'ʊ', u'ʌ', ]
|
||||
|
||||
|
||||
def isVowel(char):
|
||||
@@ -88,6 +93,108 @@ class LexicalTool():
|
||||
multiword=multiword)
|
||||
|
||||
|
||||
def _prepRESearchStr(matchStr, wordInitial='ok', wordFinal='ok',
|
||||
spanSyllable='ok', stressedSyllable='ok'):
|
||||
'''
|
||||
Prepares a user's RE string for a search
|
||||
'''
|
||||
|
||||
# Protect sounds that are two characters
|
||||
# After this we can assume that each character represents a sound
|
||||
# (We'll revert back when we're done processing the RE)
|
||||
replList = ((u'ei', u'1'), (u'tʃ', u'2'), (u'oʊ', u'3'),
|
||||
(u'dʒ', u'4'), (u'aʊ', u'5'), (u'ɑɪ', u'6'),
|
||||
(u'ɔi', u'7'))
|
||||
|
||||
for charA, charB in replList:
|
||||
matchStr = matchStr.replace(charA, charB)
|
||||
|
||||
# Characters to check between all other characters
|
||||
# Don't check between all other characters if the character is already
|
||||
# in the search string or
|
||||
interleaveStr = None
|
||||
stressOpt = (stressedSyllable == 'ok' or stressedSyllable == 'only')
|
||||
spanOpt = (spanSyllable == 'ok' or spanSyllable == 'only')
|
||||
if stressOpt and spanOpt:
|
||||
interleaveStr = u"\.?ˈ?"
|
||||
elif stressOpt:
|
||||
interleaveStr = u"ˈ?"
|
||||
elif spanOpt:
|
||||
interleaveStr = u"\.?"
|
||||
|
||||
if interleaveStr is not None:
|
||||
matchStr = interleaveStr.join(matchStr)
|
||||
|
||||
# Setting search boundaries
|
||||
# We search on '[^\.#]' and not '.' so that the search doesn't span
|
||||
# multiple syllables or words
|
||||
if wordInitial == 'only':
|
||||
matchStr = u'#' + matchStr
|
||||
elif wordInitial == 'no':
|
||||
# Match the closest preceeding syllable. If there is none, look
|
||||
# for word boundary plus at least one other character
|
||||
matchStr = u'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
|
||||
else:
|
||||
matchStr = u'[#\.][^\.#]*?' + matchStr
|
||||
|
||||
if wordFinal == 'only':
|
||||
matchStr = matchStr + u'#'
|
||||
elif wordFinal == 'no':
|
||||
matchStr = matchStr + u"(?:[^\.#]*?\.|[^\.#]+?#)"
|
||||
else:
|
||||
matchStr = matchStr + u'[^\.#]*?[#\.]'
|
||||
|
||||
# For sounds that are designated two characters, prevent
|
||||
# detecting those sounds if the user wanted a sound
|
||||
# designated by one of the contained characters
|
||||
for charA, charB in [(u'e', u'i'), (u't', u'ʃ'), (u'o', u'ʊ'),
|
||||
(u'd', u'ʒ'), (u'a', u'ʊ'), (u'ɑ' u'ɪ'),
|
||||
(u'ɔ', u'i'), ]:
|
||||
|
||||
# Forward search ('a' and not 'ab')
|
||||
startI = 0
|
||||
while True:
|
||||
try:
|
||||
i = matchStr.index(charA, startI)
|
||||
except ValueError:
|
||||
break
|
||||
if matchStr[i + 1] != charB:
|
||||
forwardStr = u'(?!%s)' % charB
|
||||
matchStr = matchStr[:i + 1] + forwardStr + matchStr[i + 1:]
|
||||
startI = i + 1 + len(forwardStr)
|
||||
|
||||
# Backward search ('b' and not 'ab')
|
||||
startI = 0
|
||||
while True:
|
||||
try:
|
||||
i = matchStr.index(charB, startI)
|
||||
except ValueError:
|
||||
break
|
||||
if matchStr[i - 1] != charA:
|
||||
backStr = u'(?<!%s)' % charA
|
||||
matchStr = matchStr[:i] + backStr + matchStr[i:]
|
||||
startI = i + 1 + len(backStr)
|
||||
|
||||
# Revert the special sounds back from 1 character to 2 characters
|
||||
for charA, charB in replList:
|
||||
matchStr = matchStr.replace(charB, charA)
|
||||
|
||||
# Replace special characters
|
||||
replDict = {"D": u"[tdsz]", # dentals
|
||||
"F": u"[ʃʒfvszɵðh]", # fricatives
|
||||
"S": u"[pbtdkg]", # stops
|
||||
"N": u"[nmŋ]", # nasals
|
||||
"R": u"[rɝɚ]", # rhotics
|
||||
"V": u"(?:aʊ|ei|oʊ|ɑɪ|ɔi|[iuæɑɔəɛɪʊʌ]):?", # vowels
|
||||
"B": u"\.", # syllable boundary
|
||||
}
|
||||
|
||||
for char, replStr in replDict.items():
|
||||
matchStr = matchStr.replace(char, replStr)
|
||||
|
||||
return matchStr
|
||||
|
||||
|
||||
def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
|
||||
wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
|
||||
multiword='ok'):
|
||||
@@ -107,51 +214,11 @@ def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
|
||||
Regular expression syntax applies, so if you wanted to search for any
|
||||
word ending with a vowel or rhotic, matchStr = '(?:VR)#'
|
||||
'''
|
||||
|
||||
# Characters to check between all other characters
|
||||
# Don't check between all other characters if the character is already
|
||||
# in the search string or
|
||||
interleaveStr = None
|
||||
stressOpt = (stressedSyllable == 'ok' or stressedSyllable == 'only')
|
||||
spanOpt = (spanSyllable == 'ok' or spanSyllable == 'only')
|
||||
if stressOpt and spanOpt:
|
||||
interleaveStr = "\.?'?"
|
||||
elif stressOpt:
|
||||
interleaveStr = "'?"
|
||||
elif spanOpt:
|
||||
interleaveStr = "\.?"
|
||||
|
||||
if interleaveStr is not None:
|
||||
matchStr = interleaveStr.join(matchStr)
|
||||
|
||||
# Setting search boundaries
|
||||
# We search on '[^\.#]' and not '.' so that the search doesn't span
|
||||
# multiple syllables or words
|
||||
if wordInitial == 'only':
|
||||
matchStr = '#' + matchStr
|
||||
elif wordInitial == 'no':
|
||||
# Match the closest preceeding syllable. If there is none, look
|
||||
# for word boundary plus at least one other character
|
||||
matchStr = '(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
|
||||
else:
|
||||
matchStr = '[#\.][^\.#]*?' + matchStr
|
||||
|
||||
if wordFinal == 'only':
|
||||
matchStr = matchStr + '#'
|
||||
elif wordFinal == 'no':
|
||||
matchStr = matchStr + "(?:[^\.#]*?\.|[^\.#]+?#)"
|
||||
else:
|
||||
matchStr = matchStr + '[^\.#]*?[#\.]'
|
||||
|
||||
# Replace special characters
|
||||
replDict = {"V": "(?:aI|aU|ei|oU|[AEIaeiu]):?",
|
||||
"R": "[&39]?r",
|
||||
"B": "\."}
|
||||
|
||||
for char, replStr in replDict.items():
|
||||
matchStr = matchStr.replace(char, replStr)
|
||||
|
||||
# Run search for words
|
||||
|
||||
matchStr = _prepRESearchStr(matchStr, wordInitial, wordFinal,
|
||||
spanSyllable, stressedSyllable)
|
||||
|
||||
compiledRE = re.compile(matchStr)
|
||||
retList = []
|
||||
for word, pronList in searchList:
|
||||
@@ -173,10 +240,10 @@ def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
|
||||
matchList = compiledRE.findall(searchPron)
|
||||
if len(matchList) > 0:
|
||||
if stressedSyllable == 'only':
|
||||
if all(["'" not in match for match in matchList]):
|
||||
if all([u"ˈ" not in match for match in matchList]):
|
||||
continue
|
||||
if stressedSyllable == 'no':
|
||||
if all(["'" in match for match in matchList]):
|
||||
if all([u"ˈ" in match for match in matchList]):
|
||||
continue
|
||||
|
||||
# For syllable spanning, we check if there is a syllable
|
||||
@@ -211,11 +278,11 @@ def _parsePronunciation(pronunciationStr):
|
||||
stressedPhoneList = []
|
||||
for i, syllable in enumerate(syllableList):
|
||||
for j, phone in enumerate(syllable):
|
||||
if "'" in phone:
|
||||
if u"ˈ" in phone:
|
||||
stressedSyllableList.insert(0, i)
|
||||
stressedPhoneList.insert(0, j)
|
||||
break
|
||||
elif '"' in phone:
|
||||
elif u'ˌ' in phone:
|
||||
stressedSyllableList.insert(i)
|
||||
stressedPhoneList.insert(j)
|
||||
|
||||
|
||||
+2
-1
@@ -1,3 +1,4 @@
|
||||
#encoding: utf-8
|
||||
'''
|
||||
Created on Oct 22, 2014
|
||||
|
||||
@@ -76,7 +77,7 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
|
||||
stressJ = None #
|
||||
|
||||
if stressI is not None:
|
||||
syllableList[stressI][stressJ] += "'"
|
||||
syllableList[stressI][stressJ] += u"ˈ"
|
||||
|
||||
i = 0
|
||||
# print(syllableList)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#encoding: utf-8
|
||||
'''
|
||||
Created on Oct 15, 2014
|
||||
|
||||
@@ -151,7 +152,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
|
||||
hasStress = False
|
||||
for syllable in syllableList:
|
||||
for phone in syllable:
|
||||
hasStress = "'" in phone or hasStress
|
||||
hasStress = u"ˈ" in phone or hasStress
|
||||
|
||||
if hasStress:
|
||||
withStress.append(i)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#encoding: utf-8
|
||||
'''
|
||||
Created on Oct 22, 2014
|
||||
|
||||
@@ -12,16 +13,18 @@ from pysle import pronunciationtools
|
||||
# In this first example we look up the syllabification of a word and get its
|
||||
# stress information.
|
||||
|
||||
searchWord = 'pumpkins'
|
||||
isleDict = isletool.LexicalTool('islev2.txt')
|
||||
searchWord = 'catatonic'
|
||||
isleDict = isletool.LexicalTool('ISLEdict.txt')
|
||||
lookupResults = isleDict.lookup(searchWord)
|
||||
|
||||
firstEntry = lookupResults[0]
|
||||
firstSyllableList = firstEntry[0]
|
||||
firstSyllableList = ".".join([u" ".join(syllable) for syllable in firstSyllableList])
|
||||
firstStressList = firstEntry[1]
|
||||
|
||||
print(searchWord)
|
||||
print(firstSyllableList, firstStressList) # 3rd syllable carries stress
|
||||
print(firstSyllableList)
|
||||
print(firstStressList) # 3rd syllable carries stress
|
||||
|
||||
|
||||
# Here we determine the syllabification of a word, as it was said.
|
||||
@@ -43,4 +46,6 @@ print(anotherPhoneList)
|
||||
print(stressedSyllableIndexList) # We can see the first syllable was elided
|
||||
print(stressedPhoneIndexList)
|
||||
print(flattenedStressIndexList)
|
||||
print(syllableList)
|
||||
print(syllabification)
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#encoding: utf-8
|
||||
'''
|
||||
Created on July 08, 2016
|
||||
|
||||
@@ -10,7 +11,7 @@ import random
|
||||
|
||||
from pysle import isletool
|
||||
|
||||
tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt"
|
||||
tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\ISLEdict.txt"
|
||||
isleDict = isletool.LexicalTool(tmpPath)
|
||||
|
||||
def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
|
||||
@@ -33,7 +34,7 @@ def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
|
||||
if numMatches is not None and i > numMatches:
|
||||
break
|
||||
word, pronList = matchTuple
|
||||
print("%s: %s" % (word, repr(pronList)))
|
||||
print("%s: %s" % (word, ",".join(pronList)))
|
||||
print("")
|
||||
|
||||
return matchList
|
||||
@@ -47,7 +48,7 @@ printOutMatches("lBd", wordInitial="no", multiword='no',
|
||||
numSyllables=3, numMatches=10)
|
||||
|
||||
# words ending in 'inth'
|
||||
matchList = printOutMatches("InT", wordFinal="only", numMatches=10)
|
||||
matchList = printOutMatches(u"ɪnɵ", wordFinal="only", numMatches=10)
|
||||
|
||||
# that also start with 's'
|
||||
matchList = printOutMatches("s", wordInitial="only", numMatches=10,
|
||||
|
||||
Reference in New Issue
Block a user