FEATURE: Added searching of words by pronunciation

The search is based on regular expressions, with some keyword parameters
to simplify search queries.  A set of examples is
provided.
This commit is contained in:
Tim Mahrt
2016-07-09 17:22:12 +02:00
parent ea0bc5c5cd
commit b76454f626
2 changed files with 187 additions and 0 deletions
+133
View File
@@ -5,10 +5,26 @@ Created on Oct 11, 2012
'''
import io
import re
# Symbols known to this module (presumably the symbol inventory of the
# pronunciation strings -- TODO confirm against the dictionary file)
charList = [
    '#', '&', '&r', '3r', '9r', '>', '>i', '@',
    'A', 'D', 'E', 'I', 'N', 'S', 'T', 'U', 'Z', '^',
    'a', 'aI', 'aU', 'b', 'd', 'dZ', 'd_(', 'e', 'ei',
    'f', 'g', 'h', 'i', 'i:', 'j', 'k', 'kh', 'l', 'l=',
    'm', 'n', 'n=', 'oU', 'p', 'ph', 'r', 's', 'sh',
    't', 'tS', 't_(', 'th', 'u', 'v', 'w', 'y', 'z',
]

# Characters whose presence inside a symbol marks that symbol as a vowel
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>']
def isVowel(char):
    '''
    Returns True if any entry of vowelList occurs inside of char

    The test is a containment test ("vowel in char"), so a multi-character
    symbol counts as a vowel when it contains one of the vowel characters.
    '''
    for vowel in vowelList:
        if vowel in char:
            return True
    return False
def sequenceMatch(matchChar, searchStr):
    '''
    Returns True if matchChar is contained in searchStr, False otherwise
    '''
    isContained = matchChar in searchStr
    return isContained
class WordNotInISLE(Exception):
def __init__(self, word):
@@ -62,6 +78,123 @@ class LexicalTool():
return pronList
def search(self, matchStr, numSyllables=None, wordInitial='ok',
           wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
           multiword='ok'):
    '''
    Searches the entries of this dictionary by pronunciation

    Thin wrapper: passes self.data.items() together with every option,
    unchanged, to the module-level search() function and returns its
    result.
    '''
    searchOptions = {'numSyllables': numSyllables,
                     'wordInitial': wordInitial,
                     'wordFinal': wordFinal,
                     'spanSyllable': spanSyllable,
                     'stressedSyllable': stressedSyllable,
                     'multiword': multiword}
    return search(self.data.items(), matchStr, **searchOptions)
def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
           wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
           multiword='ok'):
    '''
    Searches for matching words in the dictionary with regular expressions

    searchList is an iterable of (word, pronunciationList) pairs.  The
    return value is a sorted list of (word, matchingPronunciationList)
    pairs; words without any matching pronunciation are left out.

    If numSyllables is given, only pronunciations whose number of '.'
    markers plus one equals numSyllables are considered.

    wordInitial, wordFinal, spanSyllable, stressedSyllable, and multiword
    can take three different values: 'ok', 'only', or 'no'.

    Special search characters:
    'V' - any vowel
    'R' - any rhotic
    '#' - word boundary
    'B' - syllable boundary
    '.' - anything

    For example, matchStr='V' with wordFinal='only' finds words that end
    with a vowel.

    matchStr is compiled as a regular expression.  Note however that when
    stressedSyllable or spanSyllable is 'ok' or 'only' (the defaults),
    an optional stress and/or syllable marker is inserted between every
    pair of characters in matchStr -- including regex metacharacters -- so
    multi-character regex constructs such as '(?:V|R)' are only left
    intact when both options are 'no'.
    '''
    # Markers that are allowed to occur between the characters of the
    # search string: a stress marker and/or a syllable boundary
    stressOpt = stressedSyllable in ('ok', 'only')
    spanOpt = spanSyllable in ('ok', 'only')
    interleaveStr = None
    if stressOpt and spanOpt:
        interleaveStr = r"\.?'?"
    elif stressOpt:
        interleaveStr = r"'?"
    elif spanOpt:
        interleaveStr = r"\.?"

    if interleaveStr is not None:
        matchStr = interleaveStr.join(matchStr)

    # Setting search boundaries
    # We search on '[^\.#]' and not '.' so that the search doesn't span
    # multiple syllables or words
    if wordInitial == 'only':
        matchStr = '#' + matchStr
    elif wordInitial == 'no':
        # Match the closest preceding syllable.  If there is none, look
        # for word boundary plus at least one other character
        matchStr = r'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
    else:
        matchStr = r'[#\.][^\.#]*?' + matchStr

    if wordFinal == 'only':
        matchStr = matchStr + '#'
    elif wordFinal == 'no':
        matchStr = matchStr + r"(?:[^\.#]*?\.|[^\.#]+?#)"
    else:
        matchStr = matchStr + r'[^\.#]*?[#\.]'

    # Replace special characters
    replDict = {"V": r"(?:aI|aU|ei|oU|[AEIaeiu]):?",
                "R": r"[&39]?r",
                "B": r"\."}
    for char, replStr in replDict.items():
        matchStr = matchStr.replace(char, replStr)

    # Run search for words
    compiledRE = re.compile(matchStr)
    retList = []
    for word, pronList in searchList:
        newPronList = []
        for pron in pronList:
            searchPron = pron.replace(",", "").replace(" ", "")

            if numSyllables is not None:
                if numSyllables != searchPron.count('.') + 1:
                    continue

            # Is this a compound word?  A pronunciation with exactly two
            # '#' markers is treated as a single word
            numBoundaries = searchPron.count('#')
            if multiword == 'only' and numBoundaries == 2:
                continue
            if multiword == 'no' and numBoundaries > 2:
                continue

            # Every match starts and ends on a boundary character, and
            # neighbouring syllables share that character, so the matches
            # must be allowed to overlap or the second one is never seen
            matchList = _findOverlappingMatches(compiledRE, searchPron)
            if not matchList:
                continue

            if stressedSyllable == 'only':
                if all("'" not in match for match in matchList):
                    continue
            if stressedSyllable == 'no':
                if all("'" in match for match in matchList):
                    continue

            # For syllable spanning, we check if there is a syllable
            # marker inside (not at the border) of the match.
            if spanSyllable == 'only':
                if all("." not in txt[1:-1] for txt in matchList):
                    continue
            if spanSyllable == 'no':
                if all("." in txt[1:-1] for txt in matchList):
                    continue

            newPronList.append(pron)

        if newPronList:
            retList.append((word, newPronList))

    retList.sort()
    return retList


def _findOverlappingMatches(compiledRE, text):
    '''
    Returns the full text of every match of compiledRE in text

    Unlike findall(), the search is restarted one character after the
    start of the previous match, so matches may overlap, and the whole
    match is returned even if the pattern contains capturing groups.
    '''
    matchList = []
    startI = 0
    while True:
        match = compiledRE.search(text, startI)
        if match is None:
            break
        matchList.append(match.group(0))
        startI = match.start() + 1
    return matchList
def _parsePronunciation(pronunciationStr):
'''
+54
View File
@@ -0,0 +1,54 @@
'''
Created on July 08, 2016
@author: tmahrt
Basic examples of common usage.
'''
import random
from pysle import isletool
# NOTE(review): machine-specific absolute path (presumably to the ISLE
# dictionary file); point it at a local copy before running this example
tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt"
# Created once when the module is loaded; printOutMatches() searches it
# whenever no matchList is passed in
isleDict = isletool.LexicalTool(tmpPath)
def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
                    wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
                    multiword='ok', numMatches=None, matchList=None):
    '''
    Runs a pronunciation search and prints the matching words

    If matchList is None, the module-level isleDict is searched.
    Otherwise the search is run over matchList (a list as returned by a
    previous search), which allows searches to be chained.

    At most numMatches entries are printed (all of them if numMatches is
    None).  If there are more matches than numMatches, the list is
    shuffled first so that a random selection is shown.

    Returns the complete list of matches (in shuffled order if it was
    shuffled), not just the printed ones.
    '''
    if matchList is None:
        matchList = isleDict.search(matchStr, numSyllables, wordInitial,
                                    wordFinal, spanSyllable, stressedSyllable,
                                    multiword)
    else:
        matchList = isletool.search(matchList, matchStr, numSyllables,
                                    wordInitial, wordFinal, spanSyllable,
                                    stressedSyllable, multiword)

    if numMatches is not None and len(matchList) > numMatches:
        random.shuffle(matchList)

    # Slice rather than compare the loop index against numMatches: the
    # index comparison "i > numMatches" printed one entry too many
    if numMatches is None:
        shownList = matchList
    else:
        shownList = matchList[:max(numMatches, 0)]

    for word, pronList in shownList:
        print("%s: %s" % (word, repr(pronList)))
    print("")

    return matchList
# 2-syllable words with a stressed syllable containing 'dV' ('d' followed
# by any vowel) but not word initially; the sequence may not span a
# syllable boundary
printOutMatches("dV", stressedSyllable="only", spanSyllable="no",
wordInitial="no", numSyllables=2, numMatches=10)
# 3-syllable words with an 'ld' sequence that spans a syllable boundary
# ('B' is the syllable boundary); multiword entries are excluded
printOutMatches("lBd", wordInitial="no", multiword='no',
numSyllables=3, numMatches=10)
# words ending in 'inth' (search string 'InT'); the returned list is kept
# so that it can be searched again below
matchList = printOutMatches("InT", wordFinal="only", numMatches=10)
# of those results, the ones that also start with 's' (multiword entries
# excluded)
matchList = printOutMatches("s", wordInitial="only", numMatches=10,
matchList=matchList, multiword="no")