diff --git a/README.rst b/README.rst index bd42a96..9fcfc00 100644 --- a/README.rst +++ b/README.rst @@ -31,7 +31,7 @@ What can you do with this library? - map an actual pronunciation to a dictionary pronunciation (can be used to automatically find speech errors):: - pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',]) + pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['k', 'æ',]) - automatically syllabify a praat textgrid containing words and phones (e.g. force-aligned text) -- requires my @@ -53,6 +53,9 @@ Ver 1.4 (July 9, 2016) - added search functionality +- ported code to use the new Unicode IPA-based isledict + (the old one was ASCII) + Ver 1.3 (March 15, 2016) - added indicies for stressed vowels @@ -74,12 +77,14 @@ Requirements ================ - Before you use this library (before or after installing it) you will need - to download the ILSEX dictionary. It can be downloaded here: + to download the ISLEX dictionary. It can be downloaded here under the + section 'English' linked under the text 'English Pronlex' + (with a file name of ISLEdict.txt): - `ISLEX project page `_ + `ISLEX project page `_ `Direct link to the ISLEX file used in this project - `_ (islev2.txt) + `_ (ISLEdict.txt) - ``Python 2.7.*`` or above @@ -113,7 +118,7 @@ Here is a typical common usage:: from pysle import isle isleDict = isle.LexicalTool('C:\islev2.dict') print isleDict.lookup('catatonic')[0] # Get the first pronunciation - >> [['kh', '@,'], ['t_(', '&'], ['th', "A'"], ['n', 'I', 'kh']] [2] + >> [['k', 'ˌæ'], ['t˺', 'ə'], ['t', 'ˈɑ'], ['n', 'ɪ', 'k']] [2, 0] and another:: @@ -121,7 +126,7 @@ and another:: from psyle import pronunciationTools searchWord = 'another' - anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced + anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced (ASCII or IPA ok here) returnList = pronunciationTools.findBestSyllabification(isleDict, searchWord, @@ -138,7 +143,7 @@ Citing pysle Pysle is 
general purpose coding and doesn't need to be cited (you should cite the -`ISLEX project `_ +`ISLEX project `_ instead) but if you would like to, it can be cited like so: Tim Mahrt. Pysle. https://github.com/timmahrt/pysle, 2016. diff --git a/pysle/isletool.py b/pysle/isletool.py index b57b64d..5722a87 100644 --- a/pysle/isletool.py +++ b/pysle/isletool.py @@ -1,3 +1,4 @@ +#encoding: utf-8 ''' Created on Oct 11, 2012 @@ -7,14 +8,18 @@ Created on Oct 11, 2012 import io import re -charList = ['#', '&', '&r', '3r', '9r', '>', '>i', '@', 'A', 'D', 'E', - 'I', 'N', 'S', 'T', 'U', 'Z', '^', 'a', 'aI', 'aU', 'b', - 'd', 'dZ', 'd_(', 'e', 'ei', 'f', 'g', 'h', 'i', 'i:', - 'j', 'k', 'kh', 'l', 'l=', 'm', 'n', 'n=', 'oU', 'p', - 'ph', 'r', 's', 'sh', 't', 'tS', 't_(', 'th', 'u', - 'v', 'w', 'y', 'z'] -vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ] +charList = [u'#', u'.', u'aʊ', u'b', u'd', u'dʒ', u'ei', u'f', u'g', + u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'oʊ', u'p', + u'r', u's', u't', u'tʃ', u'u', u'v', u'w', u'z', u'æ', + u'ð', u'ŋ', u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ', + u'ɪ', u'ɵ', u'ɹ', u'ʃ', u'ʊ', u'ʒ', u'æ', u'ʌ', ] + +diacriticList = [u'˺', u'ˌ', u'̩', u'̃', ] + +vowelList = [u'aʊ', u'ei', u'i', u'oʊ', u'u', u'æ', + u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ', + u'ɪ', u'ʊ', u'ʌ', ] def isVowel(char): @@ -88,6 +93,108 @@ class LexicalTool(): multiword=multiword) +def _prepRESearchStr(matchStr, wordInitial='ok', wordFinal='ok', + spanSyllable='ok', stressedSyllable='ok'): + ''' + Prepares a user's RE string for a search + ''' + + # Protect sounds that are two characters + # After this we can assume that each character represents a sound + # (We'll revert back when we're done processing the RE) + replList = ((u'ei', u'1'), (u'tʃ', u'2'), (u'oʊ', u'3'), + (u'dʒ', u'4'), (u'aʊ', u'5'), (u'ɑɪ', u'6'), + (u'ɔi', u'7')) + + for charA, charB in replList: + matchStr = matchStr.replace(charA, charB) + + # Characters to check between all 
other characters + # Don't check between all other characters if the character is already + # in the search string or + interleaveStr = None + stressOpt = (stressedSyllable == 'ok' or stressedSyllable == 'only') + spanOpt = (spanSyllable == 'ok' or spanSyllable == 'only') + if stressOpt and spanOpt: + interleaveStr = u"\.?ˈ?" + elif stressOpt: + interleaveStr = u"ˈ?" + elif spanOpt: + interleaveStr = u"\.?" + + if interleaveStr is not None: + matchStr = interleaveStr.join(matchStr) + + # Setting search boundaries + # We search on '[^\.#]' and not '.' so that the search doesn't span + # multiple syllables or words + if wordInitial == 'only': + matchStr = u'#' + matchStr + elif wordInitial == 'no': + # Match the closest preceeding syllable. If there is none, look + # for word boundary plus at least one other character + matchStr = u'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr + else: + matchStr = u'[#\.][^\.#]*?' + matchStr + + if wordFinal == 'only': + matchStr = matchStr + u'#' + elif wordFinal == 'no': + matchStr = matchStr + u"(?:[^\.#]*?\.|[^\.#]+?#)" + else: + matchStr = matchStr + u'[^\.#]*?[#\.]' + + # For sounds that are designated two characters, prevent + # detecting those sounds if the user wanted a sound + # designated by one of the contained characters + for charA, charB in [(u'e', u'i'), (u't', u'ʃ'), (u'o', u'ʊ'), + (u'd', u'ʒ'), (u'a', u'ʊ'), (u'ɑ' u'ɪ'), + (u'ɔ', u'i'), ]: + + # Forward search ('a' and not 'ab') + startI = 0 + while True: + try: + i = matchStr.index(charA, startI) + except ValueError: + break + if matchStr[i + 1] != charB: + forwardStr = u'(?!%s)' % charB + matchStr = matchStr[:i + 1] + forwardStr + matchStr[i + 1:] + startI = i + 1 + len(forwardStr) + + # Backward search ('b' and not 'ab') + startI = 0 + while True: + try: + i = matchStr.index(charB, startI) + except ValueError: + break + if matchStr[i - 1] != charA: + backStr = u'(? 
0: if stressedSyllable == 'only': - if all(["'" not in match for match in matchList]): + if all([u"ˈ" not in match for match in matchList]): continue if stressedSyllable == 'no': - if all(["'" in match for match in matchList]): + if all([u"ˈ" in match for match in matchList]): continue # For syllable spanning, we check if there is a syllable @@ -211,11 +278,11 @@ def _parsePronunciation(pronunciationStr): stressedPhoneList = [] for i, syllable in enumerate(syllableList): for j, phone in enumerate(syllable): - if "'" in phone: + if u"ˈ" in phone: stressedSyllableList.insert(0, i) stressedPhoneList.insert(0, j) break - elif '"' in phone: + elif u'ˌ' in phone: stressedSyllableList.insert(i) stressedPhoneList.insert(j) diff --git a/pysle/praattools.py b/pysle/praattools.py index d15f0c8..07526a8 100644 --- a/pysle/praattools.py +++ b/pysle/praattools.py @@ -1,3 +1,4 @@ +#encoding: utf-8 ''' Created on Oct 22, 2014 @@ -76,7 +77,7 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName, stressJ = None # if stressI is not None: - syllableList[stressI][stressJ] += "'" + syllableList[stressI][stressJ] += u"ˈ" i = 0 # print(syllableList) diff --git a/pysle/pronunciationtools.py b/pysle/pronunciationtools.py index b34e447..f4d6bc8 100644 --- a/pysle/pronunciationtools.py +++ b/pysle/pronunciationtools.py @@ -1,3 +1,4 @@ +#encoding: utf-8 ''' Created on Oct 15, 2014 @@ -151,7 +152,7 @@ def _findBestPronunciation(isleDict, wordText, aPron): hasStress = False for syllable in syllableList: for phone in syllable: - hasStress = "'" in phone or hasStress + hasStress = u"ˈ" in phone or hasStress if hasStress: withStress.append(i) diff --git a/test/basic_examples.py b/test/basic_examples.py index 76f5c73..34eba8e 100644 --- a/test/basic_examples.py +++ b/test/basic_examples.py @@ -1,3 +1,4 @@ +#encoding: utf-8 ''' Created on Oct 22, 2014 @@ -12,16 +13,18 @@ from pysle import pronunciationtools # In this first example we look up the syllabification of a word and get it's # 
stress information. -searchWord = 'pumpkins' -isleDict = isletool.LexicalTool('islev2.txt') +searchWord = 'catatonic' +isleDict = isletool.LexicalTool('ISLEdict.txt') lookupResults = isleDict.lookup(searchWord) firstEntry = lookupResults[0] firstSyllableList = firstEntry[0] +firstSyllableList = ".".join([u" ".join(syllable) for syllable in firstSyllableList]) firstStressList = firstEntry[1] print(searchWord) -print(firstSyllableList, firstStressList) # 3rd syllable carries stress +print(firstSyllableList) +print(firstStressList) # 3rd syllable carries stress # Here we determine the syllabification of a word, as it was said. @@ -43,4 +46,6 @@ print(anotherPhoneList) print(stressedSyllableIndexList) # We can see the first syllable was elided print(stressedPhoneIndexList) print(flattenedStressIndexList) +print(syllableList) +print(syllabification) diff --git a/test/dictionary_search.py b/test/dictionary_search.py index 1a9dbb6..5040d60 100644 --- a/test/dictionary_search.py +++ b/test/dictionary_search.py @@ -1,3 +1,4 @@ +#encoding: utf-8 ''' Created on July 08, 2016 @@ -10,7 +11,7 @@ import random from pysle import isletool -tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt" +tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\ISLEdict.txt" isleDict = isletool.LexicalTool(tmpPath) def printOutMatches(matchStr, numSyllables=None, wordInitial='ok', @@ -33,7 +34,7 @@ def printOutMatches(matchStr, numSyllables=None, wordInitial='ok', if numMatches is not None and i > numMatches: break word, pronList = matchTuple - print("%s: %s" % (word, repr(pronList))) + print("%s: %s" % (word, ",".join(pronList))) print("") return matchList @@ -45,10 +46,10 @@ printOutMatches("dV", stressedSyllable="only", spanSyllable="no", # 3-syllable word with an 'ld' sequence that spans a syllable boundary printOutMatches("lBd", wordInitial="no", multiword='no', numSyllables=3, numMatches=10) - + # words ending in 'inth' -matchList = printOutMatches("InT", wordFinal="only", 
numMatches=10) - +matchList = printOutMatches(u"ɪnɵ", wordFinal="only", numMatches=10) + # that also start with 's' matchList = printOutMatches("s", wordInitial="only", numMatches=10, matchList=matchList, multiword="no")