'''
Dictionary search helpers for pysle, plus a small usage demo.

Reconstructed from a whitespace-collapsed patch that touched
pysle/isletool.py and added test/dictionary_search.py.  Only the
definitions that were fully visible in that patch are present here.
'''

import io
import random
import re

charList = ['#', '&', '&r', '3r', '9r', '>', '>i', '@', 'A', 'D', 'E',
            'I', 'N', 'S', 'T', 'U', 'Z', '^', 'a', 'aI', 'aU', 'b',
            'd', 'dZ', 'd_(', 'e', 'ei', 'f', 'g', 'h', 'i', 'i:',
            'j', 'k', 'kh', 'l', 'l=', 'm', 'n', 'n=', 'oU', 'p',
            'ph', 'r', 's', 'sh', 't', 'tS', 't_(', 'th', 'u',
            'v', 'w', 'y', 'z']

vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ]


def isVowel(char):
    '''
    Returns True if any entry of vowelList occurs as a substring of char
    '''
    return any(vowel in char for vowel in vowelList)


def sequenceMatch(matchChar, searchStr):
    '''
    Returns True if matchChar occurs as a substring of searchStr
    '''
    return matchChar in searchStr


class LexicalTool():
    '''
    NOTE(review): only the search() method was visible in the source this
    was reconstructed from; the rest of the class (including whatever
    populates self.data) lies outside that view.
    '''

    def search(self, matchStr, numSyllables=None, wordInitial='ok',
               wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
               multiword='ok'):
        '''
        Runs the module-level search() over the items of self.data

        See search() for the meaning of the arguments.
        '''
        return search(self.data.items(), matchStr, numSyllables=numSyllables,
                      wordInitial=wordInitial, wordFinal=wordFinal,
                      spanSyllable=spanSyllable,
                      stressedSyllable=stressedSyllable,
                      multiword=multiword)


def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
           wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
           multiword='ok'):
    '''
    Searches for matching words in the dictionary with regular expressions

    searchList is an iterable of (word, pronunciationList) pairs.  Commas
    and spaces are stripped from each pronunciation before matching.

    wordInitial, wordFinal, spanSyllable, stressedSyllable, and multiword
    can take three different values: 'ok', 'only', or 'no'.  Any other
    value falls through to the same branches as 'ok', except that for
    spanSyllable and stressedSyllable it also switches off the
    interleaving of optional markers described below.

    numSyllables, if given, keeps only pronunciations whose number of '.'
    characters plus one equals numSyllables.

    Special search characters:
    'V' - any vowel
    'R' - any rhotic
    'B' - syllable boundary ('.' in the pronunciation)

    '#' is the word boundary character in the pronunciation and '.' is
    the syllable boundary.  Word boundaries are added to the pattern
    through wordInitial and wordFinal.

    NOTE(review): when stressedSyllable or spanSyllable is 'ok' or 'only',
    an optional stress and/or syllable marker is inserted between every
    pair of characters in matchStr.  That includes characters that are
    part of regular expression syntax, so multi-character constructs such
    as '(?:...)' or '[...]' are split apart.  Plain phone sequences with
    the special characters above are the safe input.

    Returns a sorted list of (word, matchingPronunciationList) tuples;
    words with no matching pronunciation are left out.
    '''

    # Optional markers to allow between the characters of the search
    # string.  Nothing is interleaved when neither stress marks nor
    # syllable boundaries may appear inside the match.
    stressOpt = stressedSyllable in ('ok', 'only')
    spanOpt = spanSyllable in ('ok', 'only')
    interleaveStr = None
    if stressOpt and spanOpt:
        interleaveStr = r"\.?'?"
    elif stressOpt:
        interleaveStr = r"'?"
    elif spanOpt:
        interleaveStr = r"\.?"

    if interleaveStr is not None:
        matchStr = interleaveStr.join(matchStr)

    # Setting search boundaries
    # We search on '[^\.#]' and not '.' so that the search doesn't span
    # multiple syllables or words
    if wordInitial == 'only':
        matchStr = '#' + matchStr
    elif wordInitial == 'no':
        # Match the closest preceding syllable.  If there is none, look
        # for word boundary plus at least one other character
        matchStr = r'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
    else:
        matchStr = r'[#\.][^\.#]*?' + matchStr

    if wordFinal == 'only':
        matchStr = matchStr + '#'
    elif wordFinal == 'no':
        matchStr = matchStr + r"(?:[^\.#]*?\.|[^\.#]+?#)"
    else:
        matchStr = matchStr + r'[^\.#]*?[#\.]'

    # Replace special characters.  None of the replacement strings
    # contains 'V', 'R' or 'B', so the order of replacement is irrelevant.
    # NOTE(review): the 'V' pattern lists fewer symbols than vowelList
    # ('@', 'o', '^', '&' and '>' are absent) -- confirm this is intended.
    replDict = {"V": r"(?:aI|aU|ei|oU|[AEIaeiu]):?",
                "R": r"[&39]?r",
                "B": r"\."}

    for char, replStr in replDict.items():
        matchStr = matchStr.replace(char, replStr)

    # Run search for words
    compiledRE = re.compile(matchStr)
    retList = []
    for word, pronList in searchList:
        newPronList = []
        for pron in pronList:
            searchPron = pron.replace(",", "").replace(" ", "")

            if numSyllables is not None:
                if numSyllables != searchPron.count('.') + 1:
                    continue

            # Is this a compound word?  A single word carries exactly two
            # word boundary markers.
            if multiword == 'only':
                if searchPron.count('#') == 2:
                    continue
            elif multiword == 'no':
                if searchPron.count('#') > 2:
                    continue

            # group(0) is always the whole matched text; findall() would
            # return group contents if the pattern had capturing groups.
            matchList = [match.group(0)
                         for match in compiledRE.finditer(searchPron)]
            if not matchList:
                continue

            if stressedSyllable == 'only':
                if all("'" not in match for match in matchList):
                    continue
            if stressedSyllable == 'no':
                if all("'" in match for match in matchList):
                    continue

            # For syllable spanning, we check if there is a syllable
            # marker inside (not at the border) of the match.
            if spanSyllable == 'only':
                if all("." not in txt[1:-1] for txt in matchList):
                    continue
            if spanSyllable == 'no':
                if all("." in txt[1:-1] for txt in matchList):
                    continue

            newPronList.append(pron)

        if len(newPronList) > 0:
            retList.append((word, newPronList))

    retList.sort()
    return retList


# ---------------------------------------------------------------------------
# Basic examples of common usage (from test/dictionary_search.py,
# created on July 08, 2016 by tmahrt)
# ---------------------------------------------------------------------------

tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt"

# Loaded by main(); printOutMatches() reads it only when no matchList is
# passed in.
isleDict = None


def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
                    wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
                    multiword='ok', numMatches=None, matchList=None):
    '''
    Runs a search and prints up to numMatches of the results

    If matchList is None the search runs over isleDict; otherwise it runs
    over matchList, which lets the output of one call be narrowed by the
    next.  When there are more results than numMatches the result list is
    shuffled in place before printing, so a random sample is shown.

    Returns the full result list (shuffled if it was truncated for
    printing).
    '''
    if matchList is None:
        matchList = isleDict.search(matchStr, numSyllables, wordInitial,
                                    wordFinal, spanSyllable, stressedSyllable,
                                    multiword)
    else:
        matchList = search(matchList, matchStr, numSyllables, wordInitial,
                           wordFinal, spanSyllable, stressedSyllable,
                           multiword)

    if numMatches is not None and len(matchList) > numMatches:
        random.shuffle(matchList)

    for i, matchTuple in enumerate(matchList):
        # '>=' so that exactly numMatches entries are printed
        if numMatches is not None and i >= numMatches:
            break
        word, pronList = matchTuple
        print("%s: %s" % (word, repr(pronList)))
    print("")

    return matchList


def main():
    '''
    Loads the dictionary at tmpPath and runs the example searches
    '''
    global isleDict

    # Imported here so that importing this module needs neither the
    # installed package nor the dictionary file.
    from pysle import isletool
    isleDict = isletool.LexicalTool(tmpPath)

    # 2-syllable words with a stressed syllable containing 'dV' but not
    # word initially
    printOutMatches("dV", stressedSyllable="only", spanSyllable="no",
                    wordInitial="no", numSyllables=2, numMatches=10)

    # 3-syllable word with an 'ld' sequence that spans a syllable boundary
    printOutMatches("lBd", wordInitial="no", multiword='no',
                    numSyllables=3, numMatches=10)

    # words ending in 'inth'
    matchList = printOutMatches("InT", wordFinal="only", numMatches=10)

    # that also start with 's'
    matchList = printOutMatches("s", wordInitial="only", numMatches=10,
                                matchList=matchList, multiword="no")


if __name__ == "__main__":
    main()