diff --git a/README.rst b/README.rst
index bd42a96..9fcfc00 100644
--- a/README.rst
+++ b/README.rst
@@ -31,7 +31,7 @@ What can you do with this library?
- map an actual pronunciation to a dictionary pronunciation (can be used
to automatically find speech errors)::
- pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['kh', 'ae',])
+ pysle.pronunciationtools.findClosestPronunciation(isleDict, 'cat', ['k', 'æ',])
- automatically syllabify a praat textgrid containing words and phones
(e.g. force-aligned text) -- requires my
@@ -53,6 +53,9 @@ Ver 1.4 (July 9, 2016)
- added search functionality
+- ported code to use the new unicode IPA-based isledict
+ (the old one was ascii)
+
Ver 1.3 (March 15, 2016)
- added indicies for stressed vowels
@@ -74,12 +77,14 @@ Requirements
================
- Before you use this library (before or after installing it) you will need
- to download the ILSEX dictionary. It can be downloaded here:
+ to download the ISLEX dictionary. It can be downloaded here under the
+ section 'English' linked under the text 'English Pronlex'
+ (with a file name of ISLEdict.txt):
- `ISLEX project page `_
+ `ISLEX project page `_
`Direct link to the ISLEX file used in this project
- `_ (islev2.txt)
+ `_ (ISLEdict.txt)
- ``Python 2.7.*`` or above
@@ -113,7 +118,7 @@ Here is a typical common usage::
from pysle import isle
isleDict = isle.LexicalTool('C:\islev2.dict')
print isleDict.lookup('catatonic')[0] # Get the first pronunciation
- >> [['kh', '@,'], ['t_(', '&'], ['th', "A'"], ['n', 'I', 'kh']] [2]
+ >> [['k', 'ˌæ'], ['t˺', 'ə'], ['t', 'ˈɑ'], ['n', 'ɪ', 'k']] [2, 0]
and another::
@@ -121,7 +126,7 @@ and another::
from psyle import pronunciationTools
searchWord = 'another'
- anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced
+ anotherPhoneList = ['n', '@', 'th', 'r'] # Actually produced (ASCII or IPA ok here)
returnList = pronunciationTools.findBestSyllabification(isleDict,
searchWord,
@@ -138,7 +143,7 @@ Citing pysle
Pysle is general purpose coding and doesn't need to be cited
(you should cite the
-`ISLEX project `_
+`ISLEX project `_
instead) but if you would like to, it can be cited like so:
Tim Mahrt. Pysle. https://github.com/timmahrt/pysle, 2016.
diff --git a/pysle/isletool.py b/pysle/isletool.py
index b57b64d..5722a87 100644
--- a/pysle/isletool.py
+++ b/pysle/isletool.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
'''
Created on Oct 11, 2012
@@ -7,14 +8,18 @@ Created on Oct 11, 2012
import io
import re
-charList = ['#', '&', '&r', '3r', '9r', '>', '>i', '@', 'A', 'D', 'E',
- 'I', 'N', 'S', 'T', 'U', 'Z', '^', 'a', 'aI', 'aU', 'b',
- 'd', 'dZ', 'd_(', 'e', 'ei', 'f', 'g', 'h', 'i', 'i:',
- 'j', 'k', 'kh', 'l', 'l=', 'm', 'n', 'n=', 'oU', 'p',
- 'ph', 'r', 's', 'sh', 't', 'tS', 't_(', 'th', 'u',
- 'v', 'w', 'y', 'z']
-vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ]
+charList = [u'#', u'.', u'aʊ', u'b', u'd', u'dʒ', u'ei', u'f', u'g',
+ u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'oʊ', u'p',
+ u'r', u's', u't', u'tʃ', u'u', u'v', u'w', u'z', u'æ',
+ u'ð', u'ŋ', u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
+ u'ɪ', u'ɵ', u'ɹ', u'ʃ', u'ʊ', u'ʒ', u'æ', u'ʌ', ]
+
+diacriticList = [u'˺', u'ˌ', u'̩', u'̃', ]
+
+vowelList = [u'aʊ', u'ei', u'i', u'oʊ', u'u', u'æ',
+ u'ɑ', u'ɑɪ', u'ɔ', u'ɔi', u'ə', u'ɚ', u'ɛ', u'ɝ',
+ u'ɪ', u'ʊ', u'ʌ', ]
def isVowel(char):
@@ -88,6 +93,108 @@ class LexicalTool():
multiword=multiword)
+def _prepRESearchStr(matchStr, wordInitial='ok', wordFinal='ok',
+ spanSyllable='ok', stressedSyllable='ok'):
+ '''
+ Prepares a user's RE string for a search
+ '''
+
+ # Protect sounds that are two characters
+ # After this we can assume that each character represents a sound
+ # (We'll revert back when we're done processing the RE)
+ replList = ((u'ei', u'1'), (u'tʃ', u'2'), (u'oʊ', u'3'),
+ (u'dʒ', u'4'), (u'aʊ', u'5'), (u'ɑɪ', u'6'),
+ (u'ɔi', u'7'))
+
+ for charA, charB in replList:
+ matchStr = matchStr.replace(charA, charB)
+
+ # Characters to check between all other characters
+ # Don't check between all other characters if the character is already
+ # in the search string
+ interleaveStr = None
+ stressOpt = (stressedSyllable == 'ok' or stressedSyllable == 'only')
+ spanOpt = (spanSyllable == 'ok' or spanSyllable == 'only')
+ if stressOpt and spanOpt:
+ interleaveStr = u"\.?ˈ?"
+ elif stressOpt:
+ interleaveStr = u"ˈ?"
+ elif spanOpt:
+ interleaveStr = u"\.?"
+
+ if interleaveStr is not None:
+ matchStr = interleaveStr.join(matchStr)
+
+ # Setting search boundaries
+ # We search on '[^\.#]' and not '.' so that the search doesn't span
+ # multiple syllables or words
+ if wordInitial == 'only':
+ matchStr = u'#' + matchStr
+ elif wordInitial == 'no':
+ # Match the closest preceding syllable. If there is none, look
+ # for word boundary plus at least one other character
+ matchStr = u'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
+ else:
+ matchStr = u'[#\.][^\.#]*?' + matchStr
+
+ if wordFinal == 'only':
+ matchStr = matchStr + u'#'
+ elif wordFinal == 'no':
+ matchStr = matchStr + u"(?:[^\.#]*?\.|[^\.#]+?#)"
+ else:
+ matchStr = matchStr + u'[^\.#]*?[#\.]'
+
+ # For sounds that are designated two characters, prevent
+ # detecting those sounds if the user wanted a sound
+ # designated by one of the contained characters
+ for charA, charB in [(u'e', u'i'), (u't', u'ʃ'), (u'o', u'ʊ'),
+ (u'd', u'ʒ'), (u'a', u'ʊ'), (u'ɑ' u'ɪ'),
+ (u'ɔ', u'i'), ]:
+
+ # Forward search ('a' and not 'ab')
+ startI = 0
+ while True:
+ try:
+ i = matchStr.index(charA, startI)
+ except ValueError:
+ break
+ if matchStr[i + 1] != charB:
+ forwardStr = u'(?!%s)' % charB
+ matchStr = matchStr[:i + 1] + forwardStr + matchStr[i + 1:]
+ startI = i + 1 + len(forwardStr)
+
+ # Backward search ('b' and not 'ab')
+ startI = 0
+ while True:
+ try:
+ i = matchStr.index(charB, startI)
+ except ValueError:
+ break
+ if matchStr[i - 1] != charA:
+ backStr = u'(? 0:
if stressedSyllable == 'only':
- if all(["'" not in match for match in matchList]):
+ if all([u"ˈ" not in match for match in matchList]):
continue
if stressedSyllable == 'no':
- if all(["'" in match for match in matchList]):
+ if all([u"ˈ" in match for match in matchList]):
continue
# For syllable spanning, we check if there is a syllable
@@ -211,11 +278,11 @@ def _parsePronunciation(pronunciationStr):
stressedPhoneList = []
for i, syllable in enumerate(syllableList):
for j, phone in enumerate(syllable):
- if "'" in phone:
+ if u"ˈ" in phone:
stressedSyllableList.insert(0, i)
stressedPhoneList.insert(0, j)
break
- elif '"' in phone:
+ elif u'ˌ' in phone:
stressedSyllableList.insert(i)
stressedPhoneList.insert(j)
diff --git a/pysle/praattools.py b/pysle/praattools.py
index d15f0c8..07526a8 100644
--- a/pysle/praattools.py
+++ b/pysle/praattools.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
'''
Created on Oct 22, 2014
@@ -76,7 +77,7 @@ def syllabifyTextgrid(isleDict, tg, wordTierName, phoneTierName,
stressJ = None #
if stressI is not None:
- syllableList[stressI][stressJ] += "'"
+ syllableList[stressI][stressJ] += u"ˈ"
i = 0
# print(syllableList)
diff --git a/pysle/pronunciationtools.py b/pysle/pronunciationtools.py
index b34e447..f4d6bc8 100644
--- a/pysle/pronunciationtools.py
+++ b/pysle/pronunciationtools.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
'''
Created on Oct 15, 2014
@@ -151,7 +152,7 @@ def _findBestPronunciation(isleDict, wordText, aPron):
hasStress = False
for syllable in syllableList:
for phone in syllable:
- hasStress = "'" in phone or hasStress
+ hasStress = u"ˈ" in phone or hasStress
if hasStress:
withStress.append(i)
diff --git a/test/basic_examples.py b/test/basic_examples.py
index 76f5c73..34eba8e 100644
--- a/test/basic_examples.py
+++ b/test/basic_examples.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
'''
Created on Oct 22, 2014
@@ -12,16 +13,18 @@ from pysle import pronunciationtools
# In this first example we look up the syllabification of a word and get it's
# stress information.
-searchWord = 'pumpkins'
-isleDict = isletool.LexicalTool('islev2.txt')
+searchWord = 'catatonic'
+isleDict = isletool.LexicalTool('ISLEdict.txt')
lookupResults = isleDict.lookup(searchWord)
firstEntry = lookupResults[0]
firstSyllableList = firstEntry[0]
+firstSyllableList = ".".join([u" ".join(syllable) for syllable in firstSyllableList])
firstStressList = firstEntry[1]
print(searchWord)
-print(firstSyllableList, firstStressList) # 3rd syllable carries stress
+print(firstSyllableList)
+print(firstStressList) # 3rd syllable carries stress
# Here we determine the syllabification of a word, as it was said.
@@ -43,4 +46,6 @@ print(anotherPhoneList)
print(stressedSyllableIndexList) # We can see the first syllable was elided
print(stressedPhoneIndexList)
print(flattenedStressIndexList)
+print(syllableList)
+print(syllabification)
diff --git a/test/dictionary_search.py b/test/dictionary_search.py
index 1a9dbb6..5040d60 100644
--- a/test/dictionary_search.py
+++ b/test/dictionary_search.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
'''
Created on July 08, 2016
@@ -10,7 +11,7 @@ import random
from pysle import isletool
-tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt"
+tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\ISLEdict.txt"
isleDict = isletool.LexicalTool(tmpPath)
def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
@@ -33,7 +34,7 @@ def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
if numMatches is not None and i > numMatches:
break
word, pronList = matchTuple
- print("%s: %s" % (word, repr(pronList)))
+ print("%s: %s" % (word, ",".join(pronList)))
print("")
return matchList
@@ -45,10 +46,10 @@ printOutMatches("dV", stressedSyllable="only", spanSyllable="no",
# 3-syllable word with an 'ld' sequence that spans a syllable boundary
printOutMatches("lBd", wordInitial="no", multiword='no',
numSyllables=3, numMatches=10)
-
+
# words ending in 'inth'
-matchList = printOutMatches("InT", wordFinal="only", numMatches=10)
-
+matchList = printOutMatches(u"ɪnɵ", wordFinal="only", numMatches=10)
+
# that also start with 's'
matchList = printOutMatches("s", wordInitial="only", numMatches=10,
matchList=matchList, multiword="no")